# Twitter API

## Import Statements

In [36]:
import pandas as pd
import matplotlib.pyplot as plt

import json
import glob
import os

import googlemaps
from datetime import datetime
gmaps = googlemaps.Client(key='AIzaSyD807g4CS6Cn-3fe-CVPvXnhK3269kdrQY')

## Helper Funtions

In [2]:
# Renvoie une dataframe avec le contenu de tous les fichiers .json du dossier passé en argument
def createDF(path_to_folder, test_set=False):
    
    os.chdir(path_to_folder)

    df_results = pd.DataFrame(columns=['contributors', 'coordinates', 'created_at', 'display_text_range',
           'entities', 'extended_tweet', 'favorite_count', 'favorited',
           'filter_level', 'geo', 'id', 'id_str', 'in_reply_to_screen_name',
           'in_reply_to_status_id', 'in_reply_to_status_id_str',
           'in_reply_to_user_id', 'in_reply_to_user_id_str', 'is_quote_status',
           'lang', 'matching_rules', 'place', 'possibly_sensitive', 'quote_count',
           'quoted_status', 'quoted_status_id', 'quoted_status_id_str',
           'quoted_status_permalink', 'reply_count', 'retweet_count', 'retweeted',
           'retweeted_status', 'source', 'text', 'truncated', 'user','grbfcj','fghjk'])
        
    for filename in glob.glob("*.json"):
        data = json.load(open(filename))
        df_current = pd.DataFrame(data["results"])
        df_results = pd.concat([df_results, df_current], sort=False)
    
    if test_set:
        df_results = df_results.sample(10)
    
    return df_results

In [3]:
# Ajoute une colonne date à partir de la colonne created_at
map_month = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8,'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12,}

def extractDate(df):
    # création d'une colonne date avec les premiers caractères de la colonne created_at 
    df['date'] = df.created_at.str[:10]
    
    # division de la colonne date en un tuple new = weekday, month, date
    new = df['date'].str.split(' ', n = 2, expand = True)
    
    # ajout des colonnes day, month, year
    df['day'] = new[2] 
    df['month'] = new[1] 
    df.month = df.month.map(map_month)
    df['year'] = '2020'
    
    # modification de la colonne date pour avoir le format yyyy-mm-dd
    df['date'] = df['year'].str.cat(df[['month', 'day']].astype(str), sep="-")
    df['date'] = df['date'].astype('datetime64')
    
    # suppression des colonnes inutiles
    del df['day']
    del df['month']
    del df['year']

In [4]:
# Fonction d'affichage d'un histogramme pour une dataframe avec une colonne date (stack overflow)
def visualize(df, column_name='date', color='#494949', title=''):
    """
    Visualize a dataframe with a date column.

    Parameters
    ----------
    df : Pandas dataframe
    column_name : str
        Column to visualize
    color : str
    title : str
    """
    plt.figure(figsize=(20, 10))
    ax = (df[column_name].groupby([df[column_name].dt.month,df[column_name].dt.day])
                         .count()).plot(kind="bar", color=color)
    ax.set_facecolor('#ffffff')
    ax.set_xlabel("jour")
    ax.set_ylabel("nombre de tweets")
    ax.set_title(title)
    plt.show()

In [15]:
def extractUserfromDf (df):
    df_results = pd.DataFrame(columns = [
          "id",
          "id_str",
          "name",
          "screen_name",
          "location",
          "url",
          "description",
          "translator_type",
          "protected",
          "verified",
          "followers_count",
          "friends_count",
          "listed_count",
          "favourites_count",
          "statuses_count",
          "created_at",
          "utc_offset",
          "time_zone",
          "geo_enabled",
          "lang",
          "contributors_enabled",
          "is_translator",
          "profile_background_color",
          "profile_background_image_url",
          "profile_background_image_url_https",
          "profile_background_tile",
          "profile_link_color",
          "profile_sidebar_border_color",
          "profile_sidebar_fill_color",
          "profile_text_color",
          "profile_use_background_image",
          "profile_image_url",
          "profile_image_url_https",
          "profile_banner_url",
          "default_profile",
          "default_profile_image",
          "following",
          "follow_request_sent",
          "notifications"])
    
    for i in range(df.shape[0]):
        current_dict = df['user'].iloc[i]
        temp_df = pd.DataFrame(current_dict, index=[0])
        df_results = pd.concat([df_results, temp_df], sort=False)
    
    return df_results

In [116]:
def extractUserReply(df):
    df_results = pd.DataFrame(columns=['contributors', 'coordinates', 'created_at', 'display_text_range',
      'entities', 'extended_tweet', 'favorite_count', 'favorited',
      'filter_level', 'geo', 'id', 'id_str', 'in_reply_to_screen_name',
      'in_reply_to_status_id', 'in_reply_to_status_id_str',
      'in_reply_to_user_id', 'in_reply_to_user_id_str', 'is_quote_status',
      'lang', 'matching_rules', 'place', 'possibly_sensitive', 'quote_count',
      'quoted_status', 'quoted_status_id', 'quoted_status_id_str',
      'quoted_status_permalink', 'reply_count', 'retweet_count', 'retweeted',
      'retweeted_status', 'source', 'text', 'truncated', 'user'])
    
    for i in range(df.shape[0]):
        current_dict = df['retweeted_status'].iloc[i]
        current_dict_str = json.dumps(current_dict)
        a_json = json.loads(current_dict_str)
        temp_df = (pd.DataFrame.from_dict(a_json, orient="index")).T
        df_results = pd.concat([df_results, temp_df], sort=False)
    
    df_results = extractUserfromDf(df_results)
    
    return (df_results)

In [117]:
# Returns all users involved in a df of tweets (authors and auhors they are retweeting)
def all_users_from(df):
    users = extractUserfromDf(df)
    reply_to = extractUserReply(df)
    df_results = pd.concat([users, reply_to])
    df_results = df_results.dropna(subset=['location'])
    df_results = df_results[df_results.location != 'France']
    df_results = df_results[df_results.location != 'france']
    df_results = df_results[df_results.location != 'Somewhere']
    df_results = df_results[df_results.location != 'earth']
    return df_results

In [163]:
def nodes_from_df(df):
    df_results = all_users_from(df)
    df_results['lon'] = 0.0
    df_results['lat'] = 0.0
    df_results.index = range(df_results.shape[0])
    for i in range(df_results.shape[0]):
        #print("Localisation courante : ", df_results.iloc[i]['location'])
        current_geocode = gmaps.geocode(df_results.iloc[i]['location'])

        
        if current_geocode:  
            print(i, "- GPS : ", current_geocode[0]['geometry']['location'])
            df_results.at[i,'lon'] = current_geocode[0]['geometry']['location']['lng'] # get value of a specific cell
            df_results.at[i,'lat'] = current_geocode[0]['geometry']['location']['lat']
        else:
            print(i, "- Wrong geo !")
        
    return (df_results[["screen_name", "lon", "lat"]].drop_duplicates())

In [166]:
nodes_hydro_all.to_csv(r'/Users/benji/Desktop/GI01/IC05/Projet/Essaiv3/Network/nodes_hydrol.csv',index=False)

In [147]:
nodes_5G_all =  nodes_from_df(df_5g_wnan)

0 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
1 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
2 - GPS :  {'lat': 43.124228, 'lng': 5.928}
3 - GPS :  {'lat': 43.2951, 'lng': -0.370797}
4 - GPS :  {'lat': 45.54151, 'lng': 3.248128}
5 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
6 - GPS :  {'lat': 50.62925, 'lng': 3.057256}
7 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
8 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
9 - GPS :  {'lat': 46.8138783, 'lng': -71.2079809}
10 - GPS :  {'lat': 53.9332706, 'lng': -116.5765035}
11 - GPS :  {'lat': 46.2196009, 'lng': 6.079443299999999}
12 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
13 - GPS :  {'lat': 48.3181795, 'lng': 7.441624099999999}
14 - GPS :  {'lat': 46.818188, 'lng': 8.227511999999999}
15 - GPS :  {'lat': 47.162494, 'lng': 19.5033041}
16 - Wrong geo !
17 - GPS :  {'lat': 28.033886, 'lng': 1.659626}
18 - GPS :  {'lat': 48.841082, 'lng': 2.999366}
19 - GPS :  {'lat': 37.7794551, 'lng': -122.4103292}
20 - GPS :  {'lat': 48.9742302999999

171 - GPS :  {'lat': 49.9183927, 'lng': 5.3745927}
172 - GPS :  {'lat': 36.4165052, 'lng': -6.1461102}
173 - GPS :  {'lat': 49.524641, 'lng': 0.8828328999999999}
174 - GPS :  {'lat': 26.252778, 'lng': -98.24291799999999}
175 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
176 - Wrong geo !
177 - Wrong geo !
178 - Wrong geo !
179 - GPS :  {'lat': 37.4274745, 'lng': -122.169719}
180 - GPS :  {'lat': 44.837789, 'lng': -0.57918}
181 - GPS :  {'lat': 49.0615901, 'lng': 2.1581351}
182 - GPS :  {'lat': 45.5013261, 'lng': -73.55572819999999}
183 - GPS :  {'lat': 33.52059330000001, 'lng': -7.5680595}
184 - GPS :  {'lat': 44.837789, 'lng': -0.57918}
185 - GPS :  {'lat': 49.41781599999999, 'lng': 2.826145}
186 - GPS :  {'lat': 3.8480325, 'lng': 11.5020752}
187 - GPS :  {'lat': 45.9018486, 'lng': 6.121139299999999}
188 - GPS :  {'lat': 46.1987811, 'lng': -74.4114335}
189 - GPS :  {'lat': 47.795818, 'lng': -3.58491}
190 - Wrong geo !
191 - GPS :  {'lat': 30.2333511, 'lng': -92.6631994}
192 - GPS :  {

339 - GPS :  {'lat': 46.394984, 'lng': 6.612114}
340 - Wrong geo !
341 - GPS :  {'lat': 46.227638, 'lng': 2.213749}
342 - GPS :  {'lat': 32.9210902, 'lng': 10.4508956}
343 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
344 - GPS :  {'lat': 43.529742, 'lng': 5.447426999999999}
345 - GPS :  {'lat': 45.764043, 'lng': 4.835659}
346 - GPS :  {'lat': 43.529742, 'lng': 5.447426999999999}
347 - GPS :  {'lat': 43.529742, 'lng': 5.447426999999999}
348 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
349 - GPS :  {'lat': 39.746416, 'lng': -104.979468}
350 - GPS :  {'lat': 47.7632836, 'lng': -0.3299687}
351 - GPS :  {'lat': 3.8480325, 'lng': 11.5020752}
352 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
353 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
354 - GPS :  {'lat': 41.591369, 'lng': 9.278311}
355 - GPS :  {'lat': 41.4428048, 'lng': -85.85879}
356 - GPS :  {'lat': 54.5259614, 'lng': 15.2551187}
357 - GPS :  {'lat': 54.5259614, 'lng': 15.2551187}
358 - GPS :  {'lat': 3.8480325, 'lng': 11.5020752}

505 - Wrong geo !
506 - GPS :  {'lat': 43.604652, 'lng': 1.444209}
507 - GPS :  {'lat': 43.604652, 'lng': 1.444209}
508 - GPS :  {'lat': 43.604652, 'lng': 1.444209}
509 - GPS :  {'lat': 43.604652, 'lng': 1.444209}
510 - GPS :  {'lat': 43.604652, 'lng': 1.444209}
511 - GPS :  {'lat': 44.890891, 'lng': 1.217292}
512 - GPS :  {'lat': 43.604652, 'lng': 1.444209}
513 - GPS :  {'lat': 43.604652, 'lng': 1.444209}
514 - GPS :  {'lat': 43.604652, 'lng': 1.444209}
515 - GPS :  {'lat': 43.604652, 'lng': 1.444209}
516 - GPS :  {'lat': 58.3957606, 'lng': 26.7463007}
517 - GPS :  {'lat': 58.3957606, 'lng': 26.7463007}
518 - GPS :  {'lat': 58.3957606, 'lng': 26.7463007}
519 - GPS :  {'lat': 58.3957606, 'lng': 26.7463007}
520 - GPS :  {'lat': 58.3957606, 'lng': 26.7463007}
521 - GPS :  {'lat': 43.529742, 'lng': 5.447426999999999}
522 - GPS :  {'lat': 41.591369, 'lng': 9.278311}
523 - GPS :  {'lat': 36.1520894, 'lng': -95.9517558}
524 - GPS :  {'lat': 41.591369, 'lng': 9.278311}
525 - GPS :  {'lat': 43

672 - GPS :  {'lat': 14.641528, 'lng': -61.024174}
673 - GPS :  {'lat': 14.641528, 'lng': -61.024174}
674 - GPS :  {'lat': 48.542105, 'lng': 2.6554}
675 - GPS :  {'lat': 14.641528, 'lng': -61.024174}
676 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
677 - GPS :  {'lat': 45.7605474, 'lng': 4.861117699999999}
678 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
679 - GPS :  {'lat': 14.641528, 'lng': -61.024174}
680 - GPS :  {'lat': 14.641528, 'lng': -61.024174}
681 - GPS :  {'lat': 45.77722199999999, 'lng': 3.087025}
682 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
683 - GPS :  {'lat': 32.699635, 'lng': 35.303546}
684 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
685 - GPS :  {'lat': 46.88761909999999, 'lng': 9.656999599999999}
686 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
687 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
688 - GPS :  {'lat': 48.856614, 'lng': 2.3522219}
689 - GPS :  {'lat': 48.62715799999999, 'lng': 2.593706}


In [152]:
nodes_5G_all.head(691)

Unnamed: 0,screen_name,lon,lat
0,ChanPerco,2.352222,48.856614
1,parisimonea,2.352222,48.856614
2,ThierryLovato,5.928000,43.124228
3,olivierclave,-0.370797,43.295100
4,jeromelebon63,3.248128,45.541510
5,DimitriHommel,2.352222,48.856614
6,Playmogeek,3.057256,50.629250
7,Numerama,2.352222,48.856614
8,Dagolek_00108,2.352222,48.856614
9,voyousebas,-71.207981,46.813878


In [131]:
geo =  gmaps.geocode('Paris')
if geo:
    print('Wrong Geocode')
else :
    print('OK')

Wrong Geocode


In [121]:
all_users_from(df_5g_wnan)

Unnamed: 0,id,id_str,name,screen_name,location,url,description,translator_type,protected,verified,...,profile_text_color,profile_use_background_image,profile_image_url,profile_image_url_https,profile_banner_url,default_profile,default_profile_image,following,follow_request_sent,notifications
0,48130604,48130604,Jonathan Chan 💡📣,ChanPerco,"Paris, France",https://www.linkedin.com/in/chanperco/,Relayeur Actu Digitale | Chef de Projet Digita...,none,False,False,...,333333,True,http://pbs.twimg.com/profile_images/5518069362...,https://pbs.twimg.com/profile_images/551806936...,https://pbs.twimg.com/profile_banners/48130604...,False,False,,,
0,929360357902225408,929360357902225408,EmmaPeal,parisimonea,"Paris, France",,"Tout comme l'homme, les animaux ressentent le ...",none,False,False,...,333333,True,http://pbs.twimg.com/profile_images/1321788474...,https://pbs.twimg.com/profile_images/132178847...,https://pbs.twimg.com/profile_banners/92936035...,True,False,,,
0,1074975306,1074975306,Thierry Lovato ⚓️,ThierryLovato,"Toulon, France",http://www.var.cci.fr/,Work Now ⛴️🛳️🆙 #PortsRadeToulon #CCIduVar // B...,none,False,False,...,784726,True,http://pbs.twimg.com/profile_images/1252037464...,https://pbs.twimg.com/profile_images/125203746...,https://pbs.twimg.com/profile_banners/10749753...,False,False,,,
0,67608740,67608740,Olivier Clavé,olivierclave,"Pau, France",https://www.linkedin.com/profile/view?id=10289...,#Com et #marketingdigital.#CM #Innovation #Tra...,none,False,False,...,000000,False,http://pbs.twimg.com/profile_images/1010071770...,https://pbs.twimg.com/profile_images/101007177...,https://pbs.twimg.com/profile_banners/67608740...,False,False,,,
0,855370072205950976,855370072205950976,jeromegame,jeromelebon63,"Issoire, France",http://www.twitch.tv/jeromelebon,Aime la randonnée et l’univers vidéo ludique e...,none,False,False,...,000000,False,http://pbs.twimg.com/profile_images/1250710934...,https://pbs.twimg.com/profile_images/125071093...,https://pbs.twimg.com/profile_banners/85537007...,False,False,,,
0,376230264,376230264,Dimitri Hommel,DimitriHommel,Paris,https://www.heroiks.com/,Ex-journaliste radio - #Dircom @HeroiksGroup -...,none,False,False,...,333333,True,http://pbs.twimg.com/profile_images/1055474192...,https://pbs.twimg.com/profile_images/105547419...,https://pbs.twimg.com/profile_banners/37623026...,False,False,,,
0,17229611,17229611,Jérôme,Playmogeek,"Lille, France",http://www.playmobilsandco.org,"accessibilité,geek,actu,WTF,serial veilleur",none,False,False,...,333333,True,http://pbs.twimg.com/profile_images/73759479/p...,https://pbs.twimg.com/profile_images/73759479/...,https://pbs.twimg.com/profile_banners/17229611...,False,False,,,
0,51004523,51004523,Numerama,Numerama,"Paris, Europe",https://www.numerama.com,"Sinon, ça va ? 🎥 Sur YouTube: http://youtube....",none,False,True,...,333333,False,http://pbs.twimg.com/profile_images/1159750563...,https://pbs.twimg.com/profile_images/115975056...,https://pbs.twimg.com/profile_banners/51004523...,False,False,,,
0,763077024676184064,763077024676184064,LG Abdel.,Dagolek_00108,Paris,,Je pense donc je suis !,none,False,False,...,333333,True,http://pbs.twimg.com/profile_images/1323792387...,https://pbs.twimg.com/profile_images/132379238...,https://pbs.twimg.com/profile_banners/76307702...,True,False,,,
0,942102274758897664,942102274758897664,🇨🇦❗💥⚜🐺Seb{Cor}Jordan Bee🐝~741Hz🍯👽,voyousebas,"Québec, Canada",,Je renonce au plan mondialiste. C'est ma versi...,none,False,False,...,333333,True,http://pbs.twimg.com/profile_images/1335796999...,https://pbs.twimg.com/profile_images/133579699...,https://pbs.twimg.com/profile_banners/94210227...,True,False,,,


## Load Data

In [7]:
df_5G = createDF('/Users/benji/Desktop/GI01/IC05/Projet/Essaiv3/5G')
df_5G.shape

(1228, 38)

In [8]:
df_Gates = createDF('/Users/benji/Desktop/GI01/IC05/Projet/Essaiv3/Gates')
df_Gates.shape

(5183, 38)

In [9]:
df_hydro = createDF('/Users/benji/Desktop/GI01/IC05/Projet/Essaiv3/Hydroxychloroquine/')
df_hydro.shape

(14087, 38)

### Keeping only retweets

In [13]:
df_gates_wnan = df_Gates.dropna(subset=['retweeted_status'])
df_5g_wnan = df_5G.dropna(subset=['retweeted_status'])
df_hydro_wnan = df_hydro.dropna(subset=['retweeted_status'])

In [29]:
print(f"Shape Gates: {df_gates_wnan.shape}")
print(f"Shape 5G: {df_5g_wnan.shape}")
print(f"Shape Hydro: {df_hydro_wnan.shape}")

Shape Gates: (4448, 36)
Shape 5G: (657, 36)
Shape Hydro: (12637, 36)


### Keeping only known localisation

In [None]:
df_gates_loc = extractUserfromDf(df_gates_wnan.dropna(subset=[''])
df_5g_loc = df_5G.dropna(subset=['retweeted_status'])
df_hydro_loc = df_hydro.dropna(subset=['retweeted_status'])

## Extract User

In [11]:
users_5G = extractUserfromDf(df_5G)

## User Extraction & Graph Vertexes definition

In [12]:
# Renvoie une dataframe avec deux colonnes 'from' et 'to' à partir d'une base de tweets
def edges_from_df(df):
    users = extractUserfromDf(df)
    reply_to = extractUserReply(df)
    df_results = pd.concat([users['screen_name'], reply_to['screen_name']], axis=1, keys=['from', 'to'])
    return df_results

In [23]:
# Renvoie une dataframe de noeuds à partir d'une base de tweets (3 colonnes : nom, longitude, lattitude)
def nodes_from_edges(df):
    users = extractUserfromDf(df)
    reply_to = extractUserReply(df)
    df_results = pd.concat([users['screen_name'], reply_to['screen_name']], axis=1, keys=['from', 'to'])
    
    df_results = pd.concat([df['from'], df['to']])
    df_results =  pd.DataFrame(pd.unique(df_results), columns=['users'])
    return df_results

In [33]:
df_edges_5G = edges_from_df(df_5g_wnan)
df_nodes_5G = nodes_from_edges(df_edges_5G)

<class 'pandas.core.series.Series'>


In [34]:
df_edges_gates = edges_from_df(df_gates_wnan)
df_nodes_gates = nodes_from_edges(df_edges_gates)

<class 'pandas.core.series.Series'>


In [150]:
df_nodes_5G.to_csv(r'./nodes_5G.csv',index=False)

In [151]:
df_edges_5G.to_csv(r'./edges_5G.csv',index=False)

In [162]:
nodes_5G_all2.to_csv(r'/Users/benji/Desktop/GI01/IC05/Projet/Essaiv3/Network/nodes_5Gl2.csv',index=False)