In [1]:
#!pip install searchtweets

from searchtweets import ResultStream, gen_rule_payload, load_credentials
from tweet_parser.tweet import Tweet
import pandas as pd
import itertools
import networkx as nx
import os
import json
import csv

In [2]:
# Read the "search_tweets_api" section of the local YAML key file.
# env_overwrite=False is passed so the call is made exactly as before;
# the credentials are later unpacked into every ResultStream.
premium_search_args = load_credentials(
    "./.twitter_keys.yaml",
    yaml_key="search_tweets_api",
    env_overwrite=False,
)

In [3]:
def clean(tweet, geo='False'):
    """Flatten a raw tweet payload into the list of fields used downstream.

    Parameters
    ----------
    tweet : dict-like
        Tweet payload. The following keys are read: "id_str", "created_at",
        "user" ("id_str" and "location"), "coordinates", "entities"
        ("user_mentions" and "hashtags"), "retweet_count" and "text".
    geo : str or bool, optional
        When 'True' (the historical string flag) or the boolean True, the
        geolocalized variant is returned as well. Any other value, including
        the default 'False', returns only the flat row.

    Returns
    -------
    list
        [tweet_id, created_at, username, location, coordinate, user_mentions,
        retweetcount, full_text, hashtags] when the geo flag is off.
    tuple
        (row, geo_row) when the geo flag is on. geo_row is a copy of row when
        the tweet has a user location or coordinates, otherwise the integer 0
        (sentinel the callers compare against).

    Raises
    ------
    KeyError
        If one of the keys listed above is missing from the payload.
    """
    # Accept the boolean True in addition to the legacy 'True' string, so that
    # clean(tweet, geo=True) no longer silently returns the non-geo shape.
    want_geo = geo is True or geo == 'True'

    tweet_id = tweet["id_str"]
    created_at = tweet["created_at"]
    username = tweet["user"]["id_str"]  # the user *id* is stored, not the screen name
    location = tweet["user"]["location"]
    coordinate = tweet["coordinates"]
    # Screen names of the accounts mentioned in the tweet
    user_mentions = [mention["screen_name"] for mention in tweet["entities"]["user_mentions"]]
    retweetcount = tweet["retweet_count"]
    # Hashtag texts used in the tweet
    hashtags = [tag["text"] for tag in tweet["entities"]["hashtags"]]
    # NOTE(review): "text" may be a truncated body for long tweets; confirm
    # whether an extended full-text field should be preferred here.
    full_text = tweet["text"]

    clean_tweet = [tweet_id, created_at, username, location, coordinate,
                   user_mentions, retweetcount, full_text, hashtags]

    if not want_geo:
        return clean_tweet

    # Keep the 0 sentinel for tweets without any geographic information.
    if location is None and coordinate is None:
        print('no geo')
        clean_tweet_geo = 0
    else:
        # Shallow copy: a separate outer list holding the same field values.
        clean_tweet_geo = list(clean_tweet)
    return clean_tweet, clean_tweet_geo

In [4]:
# Two empty dataframes sharing the same schema:
#   * tweets_df     -> every cleaned tweet
#   * tweets_df_geo -> only the cleaned tweets that are geolocalized
tweets_df = pd.DataFrame(
    columns=[
        'tweet_id',
        'created_at',
        'username',
        'location',
        'coordinates',
        'user_mentions',
        'retweetcount',
        'full_text',
        'hashtags',
    ]
)

# Same column layout, built as an independent (empty) frame.
tweets_df_geo = pd.DataFrame(columns=list(tweets_df.columns))


In [5]:
# Download the tweets month by month, from May to December 2017.
# The API is queried twice per month (days 1-15, then days 16-28); only the
# first 28 days are used so that the same day numbers are valid in every month.
#
# For every tweet:
#   * the raw payload is saved as data/tweets/<id>.json
#   * the cleaned row is appended to tweets_df
#   * if it is geolocalized, the payload is also saved in data/tweets_geo/
#     and the row is appended to tweets_df_geo

# Make sure the output folders exist before the first write.
os.makedirs("data/tweets", exist_ok=True)
os.makedirs("data/tweets_geo", exist_ok=True)

for month in range(5, 13):  # 5..12 inclusive, i.e. May through December

    # The API treats to_date as an exclusive bound, so each window ends on the
    # day *after* its last wanted day (16 -> covers 1-15, 29 -> covers 16-28).
    windows = [
        (f"2017-{month:02d}-01", f"2017-{month:02d}-16"),
        (f"2017-{month:02d}-16", f"2017-{month:02d}-29"),
    ]

    tweets = []
    for from_date, to_date in windows:
        rule = gen_rule_payload("libertà di scelta vaccinale morbillo",
                                results_per_call=100,
                                from_date=from_date,
                                to_date=to_date)
        rs = ResultStream(rule_payload=rule, max_results=100, max_pages=1,
                          **premium_search_args)
        tweets.extend(rs.stream())

    for tweet in tweets:
        tweet_id = tweet["id_str"]

        # Store the raw payload in data/tweets
        with open(f"data/tweets/{tweet_id}.json", "w") as f:
            json.dump(tweet, f)

        # clean() returns the flat row and, for geolocalized tweets, a second
        # copy of it (0 otherwise). The string flag is kept on purpose: it is
        # the form clean() has always accepted.
        clean_tweet, clean_tweet_geo = clean(tweet, geo='True')

        # Append the row to the dataframe holding all the tweets
        tweets_df.loc[len(tweets_df)] = clean_tweet

        if clean_tweet_geo != 0:
            # Geolocalized tweet: keep a copy of the payload in its own folder
            with open(f"data/tweets_geo/{tweet_id}.json", "w") as f:
                json.dump(tweet, f)
            tweets_df_geo.loc[len(tweets_df_geo)] = clean_tweet_geo

In [6]:
# Write both dataframes to CSV (header row included): the full set of tweets
# and the geolocalized subset.
path, path_geo = "data/allTweets.csv", "data/allTweets_geo.csv"

for frame, target in ((tweets_df, path), (tweets_df_geo, path_geo)):
    frame.to_csv(target, header=True)

In [36]:
# Load the italianTweets dataset and discard the 'Unnamed: 0' column.
# NOTE(review): this path differs from "data/allTweets_geo.csv" written by the
# export cell — confirm the file is expected to be placed here.
df_italianTweets = (
    pd.read_csv("dataset/morbillo_vaccinazioni/allTweets_geo.csv")
    .drop(columns='Unnamed: 0')
)