# Tweets to Network (and then filtering)

This notebook ingests the data as harvested by Anne, parses them, transforms them into a directed network such that there is an edge from a tweet to its reply (so, essentially the other way around as in the original data where replies point to their parent).

```
parent ---> child --> grandchild
   |
   |
   V
 child      
    
````

After that, retrieve only conversations between two people.
Essentially:

```
Opening Post by user 1 ---> Reply by user 2 ---> reply by user 1 ... ---> ....
```

Or in Christel's words:

```
act --> interact --> double interact (--> double interact (--> double interact.....))
```


In [59]:
import igraph as ig
import json
import matplotlib.pyplot as plt

from pprint import pprint
from dataclasses import dataclass
from ast import literal_eval
from glob import glob
from tqdm.auto import tqdm
import re

In [21]:
data = []
filenames = glob("../data/convos-climate-change-2021-2022/*.json")
for fn in tqdm(filenames):
    with open(fn) as f:
        data.extend(json.loads(literal_eval(f.read())))
print(len(data))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=383.0), HTML(value='')))


38277


## Parse the JSON files 

In [24]:
def _parse_referenced(referenced_tweets):
    '''get the ID of the parent tweet from the "referenced tweets" key '''
    if type(referenced_tweets) is not list:
        return None
    for e in referenced_tweets:
        if e['type'] == 'replied_to':
            return e['id']

def get_node(somedict):
    datadict= somedict.get('data',None)
    if datadict:
        for tmp in datadict:
            extracted = {"text": tmp.get('text', None),
                         "author_id": tmp.get('author_id', None),
                        "tweet_id": tmp.get('id', None),
                        "in_reply_to": _parse_referenced(tmp.get('referenced_tweets',None))}
            yield extracted
    includedict = somedict.get('includes',None)
    # de OP-tweets zitten niet per se in data, maar kunnen ook in includes zitten
    if includedict:
        for tmp in includedict.get('tweets',[]):
            extracted = {"text": tmp.get('text', None),
                         "author_id": tmp.get('author_id', None),
                        "tweet_id": tmp.get('id', None),
                        "in_reply_to": None}
            yield extracted

In [25]:
flat_list = [item for sublist in [list(get_node(e)) for e in data] for item in sublist]
len(flat_list)

147703

In [26]:
g = ig.Graph(directed=True)


def _create_v_if_not_exists(g,v):
    try:
        _ = g.vs.find(v['tweet_id'])
    except ValueError: # does not exist
        g.add_vertex(v['tweet_id'], attributes=v)

print("Creating nodes...")
for v in tqdm(flat_list):
    _create_v_if_not_exists(g,v)

print("Creating edges...")
errors, successes = 0, 0
for v in tqdm(flat_list):
    try:
        g.add_edges([(v['in_reply_to'], v['tweet_id'])])
        successes+=1
    except:
        errors+=1
print(errors,successes)
g.write_pickle("graph.pkl")

Creating nodes...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=147703.0), HTML(value='')))


Creating edges...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=147703.0), HTML(value='')))


49605 98098


**dit is echt heel gek - voor 1/3 van de edges bestaat de parent dus niet in onze data**

In [2]:
#g = ig.Graph.Read_Pickle('graph.pkl')

## Analyzing the Graph

In [27]:
openingposts = g.vs.select(_outdegree_gt=0).select(_indegree_eq=0)
print(f"There are {len(openingposts)} opeining posts (i.e., posts that have at least one outdegree but no indegree)")

There are 14766 opeining posts (i.e., posts that have at least one outdegree but no indegree)


In [28]:
@dataclass
class Tweet:
    text: str
    author: str

class Dialogue:
    def __init__(self, user1: str):
        self.user1 = user1
        self.chain = []
    def add(self, tweet: Tweet):
        self.chain.append(tweet)
    


In [50]:
def get_chain(graph, node, chain=None, is_op=True):
    '''Based on a node in a network of tweets, recursively create a chain representing a dialogue 
    between two persons'''

    # ONLY CONSIDER starting starting a new chain if it's an opening post
    if chain==None and is_op:
        chain = Dialogue(user1=node.attributes()['name']) 
        if node.attributes()['attributes']['in_reply_to'] is None:
            chain.add(Tweet(text=node.attributes()['attributes']['text'],
                        author=node.attributes()['attributes']['author_id']))
    
    
    for childid in graph.neighbors(node, mode='out'):   
        child = graph.vs[childid]
        
        if len(chain.chain)==1 and child.attributes()['attributes']['author_id'] != chain.chain[0].author:  #second post
            chain.add(Tweet(text=child.attributes()['attributes']['text'],
                        author=child.attributes()['attributes']['author_id']))
        elif len(chain.chain)>1 and \
        (child.attributes()['attributes']['author_id'] == chain.chain[0].author or \
         child.attributes()['attributes']['author_id'] == chain.chain[1].author and \
         child.attributes()['attributes']['author_id'] != chain.chain[-1].author):  
            # only if it's one of the first two participants
            chain.add(Tweet(text=child.attributes()['attributes']['text'],
            author=child.attributes()['attributes']['author_id']))
        else:
            #print("Skipping")
            pass
            
        # recursive functions to check for next child
        grandchild = get_chain(graph, childid, chain, is_op=False)
        if grandchild:
            chain = grandchild
        return chain
#chains = [get_chain(g, node) for node in openingposts]
chains = [get_chain(g, node).chain for node in tqdm(openingposts)]
longchains = [c for c in chains if len(c)>5]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=14766.0), HTML(value='')))





In [51]:
print(len(longchains))

327


In [90]:
#for c in longchains:
#    pprint(c)
#    print()

In [58]:
with open('gesprekken.txt', mode='w') as fo:
    for c in longchains:
        for i, tweet in enumerate(c):
            fo.write(f"{i}\t{tweet.text}\n")
        fo.write('\n****************************\n')

In [89]:
with open('gesprekken_clean.txt', mode='w') as fo:
    for c in longchains:
        for i, tweet in enumerate(c):
            cleantweet =  re.sub(r"@.+?\b","@USER",tweet.text).replace('\n','    ')
            cleantweet
            fo.write(f"{i}\t{cleantweet}\n")
        fo.write('\n****************************\n')