In [None]:
import pandas as pd
#import json
from datetime import date
import time

In [None]:
# Import the Twython class
from twython import Twython
from twython import TwythonError, TwythonRateLimitError, TwythonAuthError # to check the returned API errors
import json

# The API keys live in a separate JSON file rather than in the notebook itself.
with open("twitter_credentials.json", "r") as file:
    creds = json.loads(file.read())

# Build the Twitter client from the consumer key / consumer secret pair.
python_tweets = Twython(
    app_key=creds['CONSUMER_KEY'],
    app_secret=creds['CONSUMER_SECRET'],
)


In [None]:
# Loading the dataset
# Directory holding the three CSV exports. Change this single line to run the
# notebook on another machine.
DATA_DIR = '/home/benjamin/Documents/EPFL/hackathlon'

# One frame per tweet category; the names df1/df2/df3 are used by later cells.
df1 = pd.read_csv(DATA_DIR + '/[controversial]China_Flu_tweets.csv')
df2 = pd.read_csv(DATA_DIR + '/[neutral]Coronavirus_tweets.csv')
df3 = pd.read_csv(DATA_DIR + '/[scientific]COVID-19_tweets.csv')


In [None]:
from urllib.parse import urlparse

In [None]:
import pysad.collect as pc

In [None]:
# Reload the already-imported pysad.collect module (bound to `pc`) so that the
# current version of its source is used without restarting the kernel.
import importlib
importlib.reload(pc)

## Collecting the tweet details from the controversial list
This may take some time because of the Twitter API rate limits.

In [None]:
# Fetch the full status of every tweet linked in the controversial dataset.
#
# Outputs (module-level, used by later cells):
#   tweet_list      - one record per fetched tweet, as returned by pc.extract_tweet_infos
#   missed_requests - ids given up on after exhausting the rate-limit retries
#   access_error    - ids the API rejected for any other reason (e.g. tweet not found)
MAX_RATE_LIMIT_RETRIES = 3         # extra attempts per tweet after a rate-limit error
DEFAULT_RATE_LIMIT_WAIT = 15 * 60  # fallback pause in seconds when no reset time is given
                                   # (assumes a 15-minute rate-limit window - adjust if needed)

tweet_list = []
missed_requests = []
access_error = []
auth_failed = False
for url in df1['url']:
    parsed = urlparse(url)
    # The tweet id is the last path segment of the tweet URL.
    tweetid = parsed.path.split('/')[-1]
    tweet = None
    for _ in range(MAX_RATE_LIMIT_RETRIES + 1):
        try:
            tweet = python_tweets.show_status(id=tweetid, include_rts = True, tweet_mode='extended')
            break
        except TwythonAuthError as e_auth:
            # Credentials are wrong: no later request can succeed, so stop everything.
            print('Cannot access to twitter API, authentification error. {}'.format(e_auth.error_code))
            auth_failed = True
            break
        except TwythonRateLimitError as e_lim:
            print('API rate limit reached')
            print(e_lim)
            # retry_after is treated as an absolute epoch time (as in the original code);
            # it may be missing, in which case fall back to a fixed pause.
            try:
                wait_time = int(e_lim.retry_after) - time.time()
            except (TypeError, ValueError):
                wait_time = DEFAULT_RATE_LIMIT_WAIT
            # A reset time already in the past would make time.sleep raise ValueError.
            wait_time = max(wait_time, 0)
            print('Retry after {} seconds.'.format(wait_time))
            time.sleep(wait_time + 1)
            # Fall through to the next attempt for the SAME tweet instead of skipping it.
        except TwythonError as e:
            print('Tweet with id {} not found. Twython error: {}'.format(tweetid,e.error_code))
            access_error.append(tweetid)
            break
    else:
        # Every attempt hit the rate limit: record the id so it can be re-fetched later.
        missed_requests.append(tweetid)
    if auth_failed:
        break
    if tweet is not None:
        tweet_list.append(pc.extract_tweet_infos(tweet))
print('Missed requests due to API rate limits:',missed_requests)

In [None]:
# Turn the per-tweet records collected in tweet_list (the values returned by
# pc.extract_tweet_infos) into a single DataFrame.
tweet_df = pd.DataFrame(tweet_list)

In [None]:
# Save the fetched tweets to CSV in the current working directory
# (to_csv also writes the row index unless index=False is passed).
tweet_df.to_csv('tweets_controversial_full.csv')

## Checking the tweetid errors (tweets removed by Twitter?)

In [None]:
# Keep only the failed ids that are exactly 19 characters long
# (19 is the length of a tweet id, per the original author's note).
missed_links = list(filter(lambda failed_id: len(failed_id) == 19, access_error))
print('Nb of missing tweets:', len(missed_links))

In [None]:
# Make a dataframe with the full info on the missing tweets.
# For each missing id, select the rows of df1 whose URL contains that id.
#   regex=False - the id is matched as literal text, not as a regular expression
#   na=False    - rows with a missing URL count as "no match" instead of
#                 producing NaN in the boolean mask (which cannot be used to index)
matching_rows = [
    df1[df1['url'].str.contains(tid, regex=False, na=False)]
    for tid in missed_links
]
# Concatenate once (DataFrame.append was removed in pandas 2.0 and was quadratic
# in a loop). With nothing to concatenate, keep df1's columns so later cells
# can still access missed_df['url'].
missed_df = pd.concat(matching_rows) if matching_rows else df1.iloc[:0]

In [None]:
# Save the rows describing the missing tweets to CSV in the current working directory.
missed_df.to_csv('missing_tweets.csv')

## Reading the tweets obtained

See notebook `Extracting_info_from_controversial_tweets`

## Creating the user list from the list of tweets

In [None]:
def extract_users(tweet_df):
    """Return the user name embedded in each tweet URL of ``tweet_df``.

    The user name is taken to be the first path segment of the URL, e.g.
    ``https://twitter.com/<user>/status/<id>`` yields ``<user>``.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Frame with a ``url`` column.

    Returns
    -------
    list of str
        One entry per usable URL, in row order (duplicates are kept).
        Rows are skipped when the URL is missing (NaN/None or any other
        non-string), has no path, or has an empty first path segment.

    Raises
    ------
    KeyError
        If ``tweet_df`` has no ``url`` column.
    """
    user_list = []
    for url in tweet_df['url']:
        # Missing values come through as float NaN / None and cannot be parsed.
        if not isinstance(url, str):
            continue
        segments = urlparse(url).path.split('/')
        # segments[0] is the empty string before the leading '/'; segments[1]
        # is the user name. It is empty for URLs such as "https://twitter.com/".
        if len(segments) > 1 and segments[1]:
            user_list.append(segments[1])
    return user_list

In [None]:
# Tag every extracted user with the dataset category their tweet URL came from.
df_dic = {'controversial': df1, 'neutral': df2, 'scientific': df3}
users = [
    {'user': name, 'type': label}
    for label, frame in df_dic.items()
    for name in extract_users(frame)
]
user_df = pd.DataFrame(users)

In [None]:
# Display the combined user table (columns 'user' and 'type').
user_df

In [None]:
# Save the combined user table to CSV in the current working directory.
user_df.to_csv('reddit_user_list.csv')

## Getting the users of the missing tweets

In [None]:
# Extract the user names from the URLs of the tweets that could not be fetched.
# NOTE(review): this rebinds user_df, replacing the combined user table built
# earlier in the notebook; the new frame is built from a plain list, so it has
# no 'user'/'type' column names.
# NOTE(review): presumably raises KeyError if missed_df has no 'url' column
# (e.g. when no tweet was missing) - verify against the cell that builds missed_df.
user_list = extract_users(missed_df)
user_df = pd.DataFrame(user_list)

In [None]:
# Save the users of the missing tweets to CSV in the current working directory.
user_df.to_csv('missingtweets_user_list.csv')