# Data Mining ⛏

**Purpose:** Collect all relevant Tweets pertaining to the reopening of schools in the COVID-19 pandemic between Jan. 1, 2020 and Sept. 15, 2020.

**Twitter Threads**
* Thread showing original video of protestor being detained by [federal agents](https://twitter.com/matcha_chai/status/1283328232033411072)
* Thread showing video of police saying they're using [contact tracing](https://twitter.com/NBCNews/status/1266758240018276352)

**Pipeline:**
1.  Connect to Twitter's Search Tweets API (v2), to the `full archive` endpoint using the academic research track
2. *for each thread* 
    1. Take that tweet's conversation ID
    2. Collect the tweet's entire thread by matching the conversation ID
        1. Only collect English tweets with no external links/media that are in direct reply to the original post (deeper tweets are too specific)
    3. Store collection of tweets in Pandas dataframe, and only keep relevant features (text, author, *etc.*)
    4. Add an extra column that is the cleaned tweet text.
    5. Save dataframe to CSV
    6. Solve the pandemic

In [2]:
from searchtweets import collect_results, gen_request_parameters, load_credentials, ResultStream
from tqdm.auto import tqdm 
import pandas as pd
import numpy as np
import json
import os

# Load the Twitter API credentials stored under the "search_tweets_v2" key of
# ./api-secret.yaml. The resulting `search_args` is later unpacked into
# ResultStream when a request is made.
# NOTE(review): env_overwrite=False presumably stops environment variables
# from overriding the values read from the YAML file -- confirm against the
# searchtweets documentation.
search_args = load_credentials(filename="./api-secret.yaml",
                               yaml_key="search_tweets_v2",
                               env_overwrite=False)


## 1. Tweet Conversation ID

In [10]:
# Threads to collect, keyed by a short rumour name. Each entry holds:
#   id              -- rumour label written to the "rumour" column (R1, R2, ...)
#   author_id       -- account the replies are addressed to
#   conversation_id -- ID shared by every tweet in the thread
#   start_time /
#   end_time        -- date window handed to the search request
# NOTE(review): despite its name, "author_id" holds a screen name rather than
# a numeric user ID.
t_details = {
    "kidnapping": {
        "id": "R1",
        "author_id": "matcha_chai",
        "conversation_id": "1283328232033411072",
        "start_time": "2020-07-14",
        "end_time": "2020-07-18",
    },
    "contact_tracing": {
        "id": "R2",
        "author_id": "NBCNews",
        # NOTE(review): this differs from the status ID in the NBCNews link
        # listed in the notebook introduction; presumably the linked tweet
        # is a reply inside this conversation -- confirm.
        "conversation_id": "1266741567961825284",
        "start_time": "2020-05-29",
        "end_time": "2020-06-02",
    },
}


In [11]:
# Pick the thread this run of the notebook works on. `t_name` must be a key
# of `t_details`; it is also reused for the raw JSON and processed CSV file
# names further down.
t_name = 'kidnapping'
thread_object = t_details[t_name]

## 2. Collect Twitter Thread

In [12]:
def build_request(t_obj, **kwargs):
    """Build the search request parameters for one thread.

    The query selects tweets that belong to the thread's conversation, are
    addressed to the thread's author (``to:`` operator), are tagged as
    English, and carry neither links nor media.

    Args:
        t_obj: Mapping describing the thread. Must provide the keys
            ``conversation_id``, ``author_id``, ``start_time`` and
            ``end_time``.
        **kwargs: Extra keyword arguments forwarded to
            ``gen_request_parameters``. They take precedence over the
            defaults set here (``start_time``, ``end_time`` and
            ``tweet_fields``).

    Returns:
        The value produced by ``gen_request_parameters`` for the query.

    Raises:
        KeyError: If ``t_obj`` lacks one of the required keys.
    """
    query = (
        f"conversation_id:{t_obj['conversation_id']} "
        f"to:{t_obj['author_id']} lang:en -has:links -has:media"
    )
    params = {
        "start_time": t_obj["start_time"],
        "end_time": t_obj["end_time"],
        "tweet_fields": "author_id,created_at,conversation_id",
    }
    # **kwargs used to be accepted and then silently dropped. Merging them
    # into the defaults (rather than passing both) lets a caller override a
    # default without a "multiple values for keyword argument" TypeError.
    params.update(kwargs)
    return gen_request_parameters(query, **params)


request = build_request(thread_object)

In [13]:
def return_tweets(request, name=None, max_requests=1):
    """Return the tweets for *request*, caching them as JSON on disk.

    The cache file is ``../data/raw/<name>.json``. When it already exists
    its contents are returned and no API request is made; otherwise the
    request is streamed, the tweets are written to that file and returned.

    Args:
        request: Request parameters handed to ``ResultStream``.
        name: Label used for the cache file name and the progress messages.
            With the default ``None`` the file is literally ``None.json``.
        max_requests: Upper bound on API requests, passed to
            ``ResultStream``.

    Returns:
        A ``(tweets, name)`` tuple, where ``tweets`` is the list of
        collected records (or the parsed content of the cache file).
    """
    fp = "../data/raw/{}.json".format(name)
    # If we've already made this request just load the data and return
    if os.path.isfile(fp):
        with open(fp, encoding="utf-8") as fin:
            return json.load(fin), name
    # Create the cache directory *before* spending API requests: otherwise
    # a missing directory only surfaces at write time, after the tweets
    # have been downloaded, and the collected data is lost.
    os.makedirs(os.path.dirname(fp), exist_ok=True)
    print(f"Making request: {name}")
    rs = ResultStream(request_parameters=request,
                      max_requests=max_requests,
                      max_tweets=10**7,
                      **search_args)
    tweets = list(rs.stream())
    print(f"Writing {len(tweets)} tweets for request: {name}")
    with open(fp, 'w', encoding="utf-8") as fout:
        json.dump(tweets, fout, indent=4)
    return tweets, name


tweets, f_name = return_tweets(request, t_name, max_requests=200)

## 2. To Dataframe

In [14]:
# Columns every usable record must have; also the ones kept in the frame.
kept_columns = ['created_at', 'author_id', 'text']

tweets_df = (
    pd.DataFrame.from_records(tweets)
    .set_index('id')
    # Discard records that lack any of the required fields.
    .dropna(how='any', subset=['text', 'created_at', 'author_id'])
    .loc[:, kept_columns]
    # Tag every row with the rumour label of the selected thread.
    .assign(rumour=thread_object['id'])
    # Parse the timestamps so the rows can be ordered chronologically.
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
    .sort_values('created_at')
)
tweets_df.head()

Unnamed: 0_level_0,created_at,author_id,text,rumour
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1283331905346142208,2020-07-15 09:25:44+00:00,1315508486,@matcha_chai was there even a plate on the bac...,R1
1283332603676889089,2020-07-15 09:28:30+00:00,20835728,@matcha_chai Odds are they're DHS.\n\n#ThisIsW...,R1
1283335188643078145,2020-07-15 09:38:46+00:00,1149001506684669952,@matcha_chai @brianorwhatevr Weird AF!!!,R1
1283336306228031488,2020-07-15 09:43:13+00:00,59222066,@matcha_chai That was so weird and happened so...,R1
1283337938013569025,2020-07-15 09:49:42+00:00,980888591957045248,@matcha_chai After seeing a video that started...,R1


### 2.a Clean Text

In [15]:
# Imports for the text-cleaning step (HTML unescaping and mention removal)
import html
import re
import ast
# Matches an "@" followed by one or more word characters. Compiled once as a
# raw string: the previous non-raw "\w" was an invalid escape sequence, and
# the old pattern missed handles starting with "_" and one-character handles.
_MENTION_RE = re.compile(r"@\w+")


def clean_text(text):
    """Return *text* with HTML entities unescaped and @mentions removed.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
        Whitespace inside the text (including newlines) is left untouched.
    """
    # Unescape first so entities such as "&amp;" become plain characters.
    text = html.unescape(text)
    return _MENTION_RE.sub("", text).strip()

# Run every tweet through clean_text and keep the result next to the raw text.
cleaned_series = tweets_df['text'].map(clean_text)
tweets_df['clean_text'] = cleaned_series
# Every row starts with code -1.
# NOTE(review): presumably a placeholder to be filled in during later
# annotation -- confirm.
tweets_df['code'] = -1
tweets_df.head()

Unnamed: 0_level_0,created_at,author_id,text,rumour,clean_text,code
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1283331905346142208,2020-07-15 09:25:44+00:00,1315508486,@matcha_chai was there even a plate on the bac...,R1,was there even a plate on the back of the car?,-1
1283332603676889089,2020-07-15 09:28:30+00:00,20835728,@matcha_chai Odds are they're DHS.\n\n#ThisIsW...,R1,Odds are they're DHS.\n\n#ThisIsWhatFascismLoo...,-1
1283335188643078145,2020-07-15 09:38:46+00:00,1149001506684669952,@matcha_chai @brianorwhatevr Weird AF!!!,R1,Weird AF!!!,-1
1283336306228031488,2020-07-15 09:43:13+00:00,59222066,@matcha_chai That was so weird and happened so...,R1,That was so weird and happened so fast. They d...,-1
1283337938013569025,2020-07-15 09:49:42+00:00,980888591957045248,@matcha_chai After seeing a video that started...,R1,After seeing a video that started from the beg...,-1


## 3. To CSV

In [16]:
# Fix the column order of the exported file, then write it under a name
# derived from the selected thread.
export_columns = ['created_at', 'rumour', 'author_id', 'text', 'clean_text', 'code']
csv_path = "../data/processed/{}.csv".format(t_name)
tweets_df = tweets_df.loc[:, export_columns]
tweets_df.to_csv(csv_path)