In [26]:

from bson import json_util
import json
import ast
import random
from collections import defaultdict
from collections import Counter

### Climate-change issue

#### read data

In [4]:
# Load every conversation dump (convo_1.json ... convo_383.json) and flatten
# the per-file lists into a single list of API responses.
f_data = []

for i in range(1, 384):
    path = f'../data/get-twitter-data/convos-climate-change-2020-2021/convo_{i}.json'
    try:
        with open(path) as json_file:
            data = json.load(json_file)
        # The decoded JSON value is parsed a second time as a Python literal,
        # i.e. each file is expected to hold a JSON string wrapping a literal.
        f_data.append(ast.literal_eval(data))
    except (OSError, ValueError, SyntaxError, TypeError):
        # Missing/unreadable file, invalid JSON (JSONDecodeError is a ValueError)
        # or a malformed literal: report the file number and carry on.
        # Unlike a bare `except`, this no longer swallows KeyboardInterrupt.
        print(i)

flat_data = [item for sublist in f_data for item in sublist]
len(flat_data)

38277

#### Some cleaning...

In [13]:
# Some API requests failed; the entry at index 2 is used as the template of
# such a failed response and every entry equal to it is removed.
bad_entry = flat_data[2]
print(bad_entry)

short_list = [entry for entry in flat_data if not (entry == bad_entry)]
print(f"after removing bad entries we keep {len(short_list)} data points")

# Keep only entries carrying both a 'data' and a 'meta' section ...
clean_short_list = [
    entry for entry in short_list
    if 'data' in entry and 'meta' in entry
]
# ... and drop long conversations, i.e. those whose meta holds a "next_token".
short_results = [
    entry for entry in clean_short_list
    if not ("next_token" in entry['meta'])
]

print(f"Fianlly keeping  {len(short_results)} data points")


{'meta': {'result_count': 0}}
after removing bad entries we keep 10758 data points
Fianlly keeping  10461 data points


#### Keep only starting tweets that contain the keyword (a double check, since the API should already have returned only those)

In [9]:
# Collect the conversation starters: tweets under includes.tweets that do not
# reference another tweet and whose text mentions the keyword.
starting_tweets = []

for x in flat_data:
    try:
        for i in x['includes']['tweets']:
            if 'referenced_tweets' not in i and 'klimaatverandering' in i['text'].lower():
                starting_tweets.append(i)
    except (KeyError, TypeError):
        # Entry without the expected includes/tweets/text structure (for
        # example a failed request): skip the rest of it. Only these lookup
        # errors are ignored, instead of every exception as with a bare except.
        continue

In [10]:
# Number of starting tweets that mention the keyword.
len(starting_tweets)

7976

#### Create final list of conversation ids to be used

In [11]:
# De-duplicate the conversation ids while keeping first-seen order.
# list(set(...)) yields an order that can differ between kernel sessions when
# the ids are strings (hash randomisation), which in turn changes the entry
# numbering of everything derived from `convos`.
convos = list(dict.fromkeys(x['conversation_id'] for x in starting_tweets))
print(len(convos))

7899


#### Filter the data so as to keep only data points with a valid conversation ID

In [16]:
def get_replies(collection, conId):
    '''Return the first entry of `collection` belonging to conversation `conId`.

    An entry matches when the first tweet in its 'data' list carries
    `conId` as its 'conversation_id'. Returns None when nothing matches.
    '''
    matching = (
        entry for entry in collection
        if entry['data'][0]['conversation_id'] == conId
    )
    return next(matching, None)

# Select, for every valid conversation id, the first entry of `short_results`
# (our 'cleaned' data) whose first tweet belongs to that conversation.
# A single pass builds an id -> entry index, replacing one full scan of
# `short_results` per conversation id; setdefault keeps the first match.
first_entry_by_convo = {}
for entry in short_results:
    first_entry_by_convo.setdefault(entry['data'][0]['conversation_id'], entry)

# Conversation ids without any matching entry are simply left out.
results = [first_entry_by_convo[conId] for conId in convos if conId in first_entry_by_convo]

### Set of functions to match the author of the "mother" tweet with its replies
In addition, a dict is created to look up the usernames by user id.

In [19]:


def get_author_id(entry_number, results, keyword='klimaatverandering'):
    '''Find the "mother" (starting) tweet of one conversation.

    Scans results[entry_number]['includes']['tweets'] for the first tweet that
    has no 'referenced_tweets' field and whose text contains `keyword`
    (case-insensitive).

    Parameters:
        entry_number: index of the conversation in `results`.
        results: list of conversation entries.
        keyword: term the starting tweet must mention.

    Returns:
        (author_id, text) of that tweet, or None when no tweet qualifies.
    '''
    needle = keyword.lower()

    for tweet in results[entry_number]['includes']['tweets']:
        if 'referenced_tweets' in tweet:
            # references another tweet, so it cannot be the start
            continue
        if needle in tweet['text'].lower():
            return tweet['author_id'], tweet['text']

    return None

def get_direct_replies(entry_number, results):
    '''Collect the replies of one conversation that were not written by its starter.

    Parameters:
        entry_number: index of the conversation in `results`.
        results: list of conversation entries.

    Returns a 4-tuple:
        - the tweets in the entry's 'data' list whose author differs from the
          author of the "mother" tweet (tweets by that author form a thread
          and are left out), in reversed 'data' order;
        - the author id of the "mother" tweet;
        - the text of the "mother" tweet;
        - a dict mapping user id -> username for the users listed under
          includes.users.

    Raises TypeError (from tuple unpacking) when get_author_id finds no
    starting tweet for the entry.
    '''
    entry = results[entry_number]
    author_id, author_text = get_author_id(entry_number, results)

    # Walk the user list itself. Indexing it by tweet position dropped users
    # whenever there were more users than tweets, and a user missing one of
    # the two fields could shift ids and names against each other.
    usernames = {}
    for user in entry['includes'].get('users', []):
        if 'id' in user and 'username' in user:
            usernames[user['id']] = user['username']

    direct_replies = [tweet for tweet in entry['data'] if tweet['author_id'] != author_id]

    return direct_replies[::-1], author_id, author_text, usernames

def get_unique_ids_replies(direct_replies):
    '''Count the distinct reply authors of every conversation.

    `direct_replies` holds one list of reply tweets per conversation; the
    result holds, in the same order, the number of different 'author_id'
    values found in each list. A later step uses these counts to keep only
    conversations in which N unique individuals responded.
    '''
    return [
        len({reply['author_id'] for reply in conversation})
        for conversation in direct_replies
    ]

def keep_conversations_with_N_authors(i):
    '''Select one field of every conversation that passes the filter.

    Works on the module-level `results`. A conversation is kept when its
    replies come from exactly 2 unique user ids and it has more than 2 and
    fewer than 11 replies (i.e. 3 to 10).

    Parameters:
        i: which field to return for each kept conversation:
           0 = number of unique reply authors, 1 = list of replies,
           2 = author id of the starting tweet, 3 = text of the starting tweet.

    Returns:
        (selected, usernames): `selected` is the list of the chosen field for
        the kept conversations; `usernames` is the list of id -> username
        dicts of ALL conversations in `results` (not filtered).
    '''
    # Evaluate every conversation once and reuse the tuple for all four
    # fields, rather than calling get_direct_replies five times per entry.
    per_conversation = [get_direct_replies(idx, results) for idx in range(len(results))]

    d_replies = [item[0] for item in per_conversation]
    author_id = [item[1] for item in per_conversation]
    author_text = [item[2] for item in per_conversation]
    usernames = [item[3] for item in per_conversation]

    unique_ids_replies = get_unique_ids_replies(d_replies)

    selected = [
        (n, d, a_id, a_text)[i]
        for n, d, a_id, a_text in zip(unique_ids_replies, d_replies, author_id, author_text)
        if n == 2 and 2 < len(d) < 11
    ]
    return selected, usernames

#### Calling functions

In [21]:
# Field 1 = replies, 2 = author id, 3 = text of the starting tweet.
d_replies = keep_conversations_with_N_authors(1)[0]
author_id = keep_conversations_with_N_authors(2)[0]
# The username dicts come back with every call, so take them from the same
# call as the texts rather than recomputing everything a fourth time.
author_text, usernames = keep_conversations_with_N_authors(3)

assert len(d_replies) == len(author_id) == len(author_text) 
print(len(d_replies))

322


In [23]:
# Merge the per-conversation id -> username dicts into one lookup table;
# when an id occurs in several dicts, the last one seen wins.
f_usernames = {
    user_id: name
    for mapping in usernames
    for user_id, name in mapping.items()
}

In [27]:
# How many kept conversations have each number of replies.
Counter(map(len, d_replies))

Counter({3: 178, 4: 77, 7: 12, 5: 26, 6: 17, 8: 1, 10: 4, 9: 7})

In [38]:
def get_camera_ready(entry_number):
    '''Render one kept conversation as a plain-text block.

    Reads the module-level `author_id`, `author_text`, `d_replies` and
    `f_usernames`. The block starts with a line of asterisks, then the
    starter's username and tweet, followed by every reply preceded by its
    author's username and creation time (the opening parenthesis of that
    header is not closed in the emitted text). Characters that cannot be
    encoded as UTF-8 are replaced.
    '''
    starter = author_id[entry_number]
    pieces = [
        f"**************************\n\n\n{f_usernames[starter]} (start tweet)\n",
        author_text[entry_number],
    ]

    for reply in d_replies[entry_number]:
        pieces.append("\n\n")
        pieces.append(f"{f_usernames[ reply['author_id'] ]} (reply created at: {reply['created_at']}\n")
        pieces.append(reply['text'])
        pieces.append("\n")

    rendered = "".join(pieces)
    return rendered.encode('utf-8', 'replace').decode()

# Render every kept conversation and write them to a text file.
res_ = [get_camera_ready(i) for i in range(0, len(d_replies)) ]
print(len(res_))

# get_camera_ready already makes each block UTF-8 encodable, so write with
# that encoding explicitly rather than with the platform default (which can
# fail on tweets containing characters outside a legacy code page).
with open('climate-change-conversations-24.txt', 'w', encoding='utf-8') as f:
    for line in res_:
        f.write(f"{line}\n")


322


### write data to json

In [36]:
# One record per kept conversation, keyed by its position in `d_replies`.
# NOTE(review): the key 'screennames-replies:' literally ends with a colon -
# confirm whether readers of the exported file expect it that way.
my_dict = {}
for entry_number, replies in enumerate(d_replies):
    my_dict[entry_number] = {
        'starting-tweet': author_text[entry_number],
        'screenname-first-author': f_usernames[author_id[entry_number]],
        'replies': [reply['text'] for reply in replies],
        'screennames-replies:': [f_usernames[reply['author_id']] for reply in replies],
    }


In [37]:
# `json` is already imported in the notebook's import cell, so it is not
# re-imported here. The integer keys of my_dict are written as strings,
# as JSON object keys always are.
with open('climate-24.json', 'w', encoding='utf-8') as fp:
    json.dump(my_dict, fp)