# Task 1.2 Data Preparation
*Alberto Roberto Marinelli, Giacomo Cignoni, Alessandro Bucci*
## Importing Libraries
First we import the libraries necessary to process the data

In [1]:
import math
import sys
import numpy as np
import pandas as pd
import seaborn as sn
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn import decomposition

Then we load the cleaned dataset and the users dataset

In [None]:
# Tweets table, read with the first CSV column as the DataFrame index.
df_cleaned = pd.read_csv('../dataset/tweets.csv', sep=',', index_col=0) # load the cleaned tweets dataset
# Users table, also indexed by the first CSV column.
df_users = pd.read_csv('../dataset/users.csv', sep=',', index_col=0)  # load the users dataset

And we set pandas options


In [None]:
# Make pandas treat +/-inf as missing values in its NA handling.
# NOTE(review): this option appears to be deprecated in recent pandas releases — confirm against the installed version.
pd.set_option('mode.use_inf_as_na', True) # set the inf values as pd.NA


In [None]:
# Print a summary of the users frame to sanity-check the load.
df_users.info()

In [None]:
# Print a summary of the tweets frame to sanity-check the load.
df_cleaned.info()

## Plot graphics and correlation map

## Functions used to extract indicators

In [3]:
min_date = np.datetime64('2006-07-15 00:00:00') # Twitter creation date, used to check if a tweet is published after the creation of twitter
max_date = np.datetime64('2020-12-31 23:59:59') # Data scraping year, used to check if a tweet is published before the data scraping

def get_tweet_outside_of_possible_publishing_years(indicators_dataframe, user_id, tweet):
    """Return the user's updated count of tweets dated outside [min_date, max_date].

    Parameters
    ----------
    indicators_dataframe : pandas.DataFrame
        Per-user indicators, indexed by user id, holding the column
        'tweet_outside_of_possible_publishing_years'.
    user_id : hashable
        Index label of the user who published the tweet.
    tweet : row-like object exposing a ``created_at`` attribute
        The tweet being processed (e.g. a row produced by ``iterrows``).

    Returns
    -------
    number
        The stored counter plus one when the tweet falls outside the valid
        period, otherwise the stored counter unchanged. A missing user row or
        an unset (NaN/None) cell counts as zero, so the result is always a
        number and can be written straight back into the dataframe.
    """
    counter_column = 'tweet_outside_of_possible_publishing_years'

    # Read the current counter defensively: the row may not exist yet, and a
    # freshly created row holds NaN (not None) in this column.
    previous_count = 0
    if user_id in indicators_dataframe.index:
        stored_count = indicators_dataframe.at[user_id, counter_column]
        if not pd.isna(stored_count):
            previous_count = stored_count

    # created_at may still be a raw string when the CSV was loaded without
    # date parsing; normalise it before comparing.
    created_at = pd.to_datetime(tweet.created_at, errors='coerce')
    if pd.isna(created_at):
        # An unparseable/missing date cannot be classified; leave the counter as is.
        return previous_count

    if created_at < pd.Timestamp(min_date) or created_at > pd.Timestamp(max_date):
        return previous_count + 1
    return previous_count

In [None]:
def get_number_of_likes(tweet):
    """Return the tweet's 'favorite_count' as an int64, or 0 when it is unusable.

    Parameters
    ----------
    tweet : mapping-like
        Object supporting ``tweet['favorite_count']`` (e.g. a dataframe row).

    Returns
    -------
    numpy.int64 or int
        The like count, or 0 if the field is missing or cannot be converted
        to an integer (NaN, infinity, non-numeric text, None).
    """
    try:
        return np.int64(tweet['favorite_count'])
    except (KeyError, TypeError, ValueError, OverflowError):
        # Only conversion/lookup failures mean "no usable like count";
        # anything else (e.g. KeyboardInterrupt) must propagate.
        return 0

In [None]:
def get_avg_tweet_len(indicators_dataframe, user_id ,tweet):
    """Return the user's running average tweet length after including `tweet`.

    Parameters
    ----------
    indicators_dataframe : pandas.DataFrame
        Per-user indicators indexed by user id, with the columns
        'tweet_count' and 'avg_tweet_len' describing the tweets seen so far.
    user_id : hashable
        Index label of the user who published the tweet.
    tweet : mapping-like
        Object supporting ``tweet['text']``.

    Returns
    -------
    number
        The length of this tweet when the user has no usable history (no row,
        or unset/zero count, or unset average); otherwise the incremental mean
        ``prev_avg * n/(n+1) + len/(n+1)`` where n is the stored tweet count.
    """
    text = tweet['text']
    # A missing text would otherwise be measured as len('nan') == 3.
    tweet_len = 0 if text is None or (not isinstance(text, str) and pd.isna(text)) else len(str(text))

    if user_id not in indicators_dataframe.index:
        return tweet_len

    previous_tweet_count = indicators_dataframe.at[user_id, 'tweet_count']
    previous_avg = indicators_dataframe.at[user_id, 'avg_tweet_len']

    # A row can exist before its average was ever written (its cells are NaN);
    # propagating that NaN would poison every later average for the user.
    if pd.isna(previous_tweet_count) or pd.isna(previous_avg) or previous_tweet_count <= 0:
        return tweet_len

    # summing the previous average multiplied by n/(n+1) with the current tweet length
    # multiplied by 1/(n+1) gives the current average, where n is the previous tweet count
    new_tweet_count = previous_tweet_count + 1
    return previous_avg * (previous_tweet_count / new_tweet_count) + tweet_len * (1 / new_tweet_count)

In [None]:
def set_indicators(indicators_dataframe, user_id, tweet):
    """Fold one tweet into the per-user indicators, creating the user's row if needed.

    Updates, in place, the columns 'tweet_count', 'total_num_of_likes',
    'avg_tweet_len' and 'tweet_outside_of_possible_publishing_years' of
    `indicators_dataframe` for `user_id`.

    Parameters
    ----------
    indicators_dataframe : pandas.DataFrame
        Per-user indicators indexed by user id; modified in place.
    user_id : hashable
        Index label of the user who published the tweet.
    tweet : row-like
        The tweet being processed.
    """
    counter_column = 'tweet_outside_of_possible_publishing_years'
    is_new_user = user_id not in indicators_dataframe.index

    # The running average reads the stored tweet_count as "tweets seen so far",
    # so it has to be computed BEFORE the row is created or the count is bumped.
    avg_tweet_len = get_avg_tweet_len(indicators_dataframe, user_id, tweet)
    number_of_likes = get_number_of_likes(tweet)

    if is_new_user:
        # first tweet of this user: initialise every accumulator explicitly
        indicators_dataframe.at[user_id, 'tweet_count'] = 1
        indicators_dataframe.at[user_id, 'total_num_of_likes'] = number_of_likes
        indicators_dataframe.at[user_id, counter_column] = 0
    else:
        indicators_dataframe.at[user_id, 'tweet_count'] += 1
        indicators_dataframe.at[user_id, 'total_num_of_likes'] += number_of_likes
    indicators_dataframe.at[user_id, 'avg_tweet_len'] = avg_tweet_len

    # Only overwrite the counter with a real number, so a "nothing to report"
    # result (None/NaN) can never erase the value accumulated so far.
    updated_counter = get_tweet_outside_of_possible_publishing_years(indicators_dataframe, user_id, tweet)
    if updated_counter is not None and not pd.isna(updated_counter):
        indicators_dataframe.at[user_id, counter_column] = updated_counter

In [None]:
def get_ratios(indicators_dataframe):
    """Fill the 'like_ratio_per_tweet' column of `indicators_dataframe` in place.

    The ratio is 'total_num_of_likes' divided by 'tweet_count' for each row.

    Parameters
    ----------
    indicators_dataframe : pandas.DataFrame
        Per-user indicators holding 'total_num_of_likes' and 'tweet_count';
        the 'like_ratio_per_tweet' column is created or overwritten.
    """
    # Assign on the dataframe itself: the rows yielded by iterrows() are copies,
    # so writing to them never reaches the dataframe.
    # tweet count is always >0 for a user with a row, so there is no risk of a zero division
    indicators_dataframe['like_ratio_per_tweet'] = (
        indicators_dataframe['total_num_of_likes'] / indicators_dataframe['tweet_count']
    )

### Entropy extraction functions

In [None]:
#The threshold is the minimum number of distinct timedeltas needed for the entropy to be meaningful
entropy_threshold = 2

def get_entropy_from_timedeltas(dict_of_timedeltas,user_id):
    """Return the Shannon entropy (base 2) of a user's inter-tweet timedeltas.

    Parameters
    ----------
    dict_of_timedeltas : dict
        Either the user's own ``{timedelta: occurrences}`` mapping, or a
        ``{user_id: {timedelta: occurrences}}`` mapping from which the entry
        for `user_id` is taken.
    user_id : hashable
        The user whose entropy is requested; only used to pick the nested
        mapping when the per-user form is passed.

    Returns
    -------
    float or int
        The entropy in bits, or 0 when fewer than `entropy_threshold`
        distinct timedeltas are available.
    """
    # Accept both shapes: the previous code indexed by user_id but then summed
    # the values as plain counts, which could not both hold for one argument.
    timedelta_counts = dict_of_timedeltas
    nested_counts = dict_of_timedeltas.get(user_id)
    if isinstance(nested_counts, dict):
        timedelta_counts = nested_counts

    if len(timedelta_counts) < entropy_threshold:
        return 0

    # The probability of a timedelta is the number of times it appeared over
    # the total number of timedeltas observed for the user.
    total_number_of_timedeltas = sum(timedelta_counts.values())
    entropy = 0.
    for number_of_timedelta in timedelta_counts.values():
        probability = number_of_timedelta / total_number_of_timedeltas
        entropy -= probability * np.log2(probability) #shannon's entropy
    return entropy

In [31]:
def apply_time_precision(timedelta, time_precision='second'):
    """Round `timedelta` to the nearest unit named by `time_precision`.

    Recognised precisions are 'minute', '15minutes', 'hour' and 'day';
    any other value (including the default 'second') rounds to the nearest second.
    """
    # precision name -> frequency alias handed to .round()
    rounding_frequencies = {
        'minute': 'min',
        '15minutes': '15min',
        'hour': 'h',
        'day': 'D',
    }
    chosen_frequency = rounding_frequencies.get(time_precision, 's')
    return timedelta.round(freq=chosen_frequency)

1 days 04:43:35.000011
1 days 00:00:00


In [None]:
def get_timedelta_list_per_user(time_precision='second', tweets_path='./dataset/tweets_sample.csv'):
    """Count, for every user, how often each gap between consecutive tweets occurs.

    Parameters
    ----------
    time_precision : str
        Precision passed to `apply_time_precision` to round each gap.
    tweets_path : str
        CSV file holding the tweets; must provide the 'user_id' and
        'created_at' columns.
        NOTE(review): the default points at a sample file under './dataset',
        while the other cells read '../dataset/tweets.csv' — confirm which
        file the entropy is supposed to be computed on.

    Returns
    -------
    dict
        ``{user_id: {rounded_timedelta: occurrences}}``. A user with a single
        tweet maps to an empty dict.
    """
    tweets_dataframe = pd.read_csv(tweets_path, sep=',', index_col=0)  # load tweets
    tweets_dataframe['created_at'] = pd.to_datetime(tweets_dataframe['created_at'], errors='coerce') # convert created_at to datetime
    # Rows whose date could not be parsed are dropped: they would otherwise be
    # sorted last and turn into NaT "gaps" that distort the entropy.
    tweets_dataframe = tweets_dataframe.dropna(subset=['created_at'])
    # sorted by date so that each tweet can be compared with the user's previous one
    tweets_dataframe = tweets_dataframe.sort_values(by="created_at")

    last_tweet_encountered = {} # user_id -> datetime of the user's latest tweet seen so far
    tweet_timedeltas = {} # user_id -> {timedelta: number of times the timedelta was encountered}

    for _, tweet in tweets_dataframe.iterrows():
        try: # a user_id that cannot be cast to int64 means the tweet is skipped
            user_id = np.int64(tweet['user_id'])
        except (TypeError, ValueError, OverflowError):
            continue

        if user_id not in tweet_timedeltas:
            # first tweet of this user: nothing to compare against yet
            tweet_timedeltas[user_id] = {}
        else:
            timedelta = tweet.created_at - last_tweet_encountered[user_id]
            timedelta = apply_time_precision(timedelta, time_precision) # round to the requested precision
            user_timedeltas = tweet_timedeltas[user_id]
            user_timedeltas[timedelta] = user_timedeltas.get(timedelta, 0) + 1
        last_tweet_encountered[user_id] = tweet.created_at

    return tweet_timedeltas

In [None]:
def get_entropy_over_time(time_precision='second'):
    """Compute each user's entropy over inter-tweet gaps and save it to CSV.

    Each tweet date has the user's previous tweet date subtracted from it,
    giving the timedeltas; the entropy is then computed on those timedeltas
    (the more the gaps repeat, the lower the entropy).

    Parameters
    ----------
    time_precision : str
        Precision used to round the timedeltas before counting them.

    Returns
    -------
    pandas.DataFrame
        One row per user with the columns 'user_id' and 'entropy'; the same
        frame is written to '../dataset/users_entropy.csv'.
    """
    timedeltas_per_user = get_timedelta_list_per_user(time_precision)

    # One record per user, built in a single pass instead of growing the
    # dataframe cell by cell.
    entropy_records = [
        {'user_id': user_id, 'entropy': get_entropy_from_timedeltas(user_timedeltas, user_id)}
        for user_id, user_timedeltas in timedeltas_per_user.items()
    ]

    # columns= keeps the header even when no user was found
    df_entropy = pd.DataFrame(entropy_records, columns=['user_id', 'entropy'])
    df_entropy.to_csv('../dataset/users_entropy.csv')
    return df_entropy
    

## Extracting indicators
Indicators per user covered: 

* total number of tweets    
* average tweet length, 
* total number of likes, 
* like ratio per tweet,
* counter of tweets outside of possible publishing years
* entropy per user with second-wise precision

In [None]:
def get_indicators_csv():
    """Build the per-user indicators from the tweets and save them to CSV.

    Reads '../dataset/tweets.csv', accumulates the indicators tweet by tweet,
    computes the like ratio, attaches the entropies stored in
    '../dataset/users_entropy.csv' and writes '../dataset/indicators.csv'.
    Only users present in both the indicators and the entropy file are kept
    (inner join).
    """
    df_tweets = pd.read_csv('../dataset/tweets.csv', sep=',', index_col=0)  # load tweets
    # Dates arrive as text from the CSV; parse them once so the date-range
    # indicator compares datetimes rather than strings.
    df_tweets['created_at'] = pd.to_datetime(df_tweets['created_at'], errors='coerce')

    # create a dataframe for storing indicators
    df_indicators = pd.DataFrame(columns=['tweet_count', 'avg_tweet_len', 'total_num_of_likes', 'like_ratio_per_tweet', 'tweet_outside_of_possible_publishing_years'])
    # setting the index column name to 'user_id'
    df_indicators.index.names = ['user_id']

    # iterating on tweets
    for _, tweet in df_tweets.iterrows():
        try: # a user_id that cannot be cast to int64 means the tweet is skipped
            user_id = np.int64(tweet['user_id'])
        except (TypeError, ValueError, OverflowError):
            continue
        set_indicators(df_indicators, user_id, tweet)

    # ratios
    get_ratios(df_indicators)

    df_entropies = pd.read_csv('../dataset/users_entropy.csv')
    # Key the entropies by user_id and keep only that column, so the join
    # matches users (not the CSV's positional index) and drops the saved index column.
    entropy_by_user = df_entropies.set_index('user_id')['entropy']

    # join returns a new dataframe, so the result has to be kept
    df_indicators = df_indicators.join(entropy_by_user, how='inner')
    df_indicators.index.name = 'user_id'

    # save csv with indicators
    df_indicators.to_csv('../dataset/indicators.csv')

## Correlation map on indicators