# Task 4 Time Series Analysis
*Alberto Roberto Marinelli, Giacomo Cignoni, Alessandro Bucci*
## Importing Libraries
First we import the libraries necessary to extract the time series


In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from math import pi
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
df = pd.read_csv("../dataset/cleaned_joined_tweets.csv", index_col = 0) 
del df['lang']
del df['bot']
del df['text']
del df['created_at_user']
del df['statuses_count']
del df['id'] #of the tweet

In [None]:
df.created_at = pd.to_datetime(df.created_at, errors='ignore')
df.sort_values(by=['user_id', 'created_at'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11277758 entries, 4500061 to 4080317
Data columns (total 8 columns):
 #   Column          Dtype         
---  ------          -----         
 0   user_id         int64         
 1   retweet_count   int64         
 2   reply_count     int64         
 3   favorite_count  int64         
 4   num_hashtags    int64         
 5   num_urls        int64         
 6   num_mentions    int64         
 7   created_at      datetime64[ns]
dtypes: datetime64[ns](1), int64(7)
memory usage: 774.4 MB


Get only the tweets in 2019:

In [None]:
min_date = np.datetime64('2019-01-01 00:00:00')
max_date = np.datetime64('2019-12-31 23:59:59')

df_2019 = df[(df.created_at > min_date) & (df.created_at < max_date)].copy()

For each user we define a series of 365 values initialized at -1, so we define a dataframe of Number_of_users * 365

In [None]:
df_2019.head(15)

Unnamed: 0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at
3343357,722623,0,0,2,0,0,0,2019-01-03 03:03:11
3343475,722623,0,0,0,0,0,0,2019-01-03 03:04:43
3344644,722623,0,0,0,0,0,0,2019-01-03 05:28:17
3344229,722623,0,0,1,1,0,0,2019-01-03 07:03:36
3344396,722623,2,0,0,0,0,1,2019-01-03 07:04:02
3344173,722623,3,0,0,0,0,1,2019-01-03 11:31:17
3344626,722623,1,0,1,0,0,2,2019-01-03 13:55:58
3344868,722623,0,0,0,0,0,0,2019-01-03 14:56:42
3343007,722623,1,0,0,0,0,2,2019-01-03 17:06:42
3344802,722623,0,0,0,0,0,0,2019-01-03 18:19:27


In [None]:
user_2019 = pd.DataFrame(columns=df_2019.user_id.unique())
user_2019 = user_2019.append([pd.Series(-1, index=user_2019.columns) for _ in range(365)], ignore_index=True)

In [None]:
user_2019.describe()

Unnamed: 0,722623,755746,806975,887281,1382561,3888491,5812422,5820222,6296742,6775342,...,2654027174,2658726517,2662897087,2675560628,2680793168,2682782132,2688868016,2711226669,2717999764,2722021425
count,365,365,365,365,365,365,365,365,365,365,...,365,365,365,365,365,365,365,365,365,365
unique,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
top,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
freq,365,365,365,365,365,365,365,365,365,365,...,365,365,365,365,365,365,365,365,365,365


Ordering the tweets dataframe by user and created_at

In [None]:
df_2019.sort_values(by=['user_id', 'created_at'], inplace=True)

In [None]:
df_2019.head()

Unnamed: 0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at
3343357,722623,0,0,2,0,0,0,2019-01-03 03:03:11
3343475,722623,0,0,0,0,0,0,2019-01-03 03:04:43
3344644,722623,0,0,0,0,0,0,2019-01-03 05:28:17
3344229,722623,0,0,1,1,0,0,2019-01-03 07:03:36
3344396,722623,2,0,0,0,0,1,2019-01-03 07:04:02


In [None]:
def get_day_success_score(tweets_of_the_day, date):
    if tweets_of_the_day.empty:
        return -1
    
    success_score_data = tweets_of_the_day.iloc[:,1:7].sum() # 1 to 7 are from retweet_count to num_mentions
    acceptance_score = success_score_data[0] + success_score_data[1] + success_score_data[2]
    diffusion_score = success_score_data[3] + success_score_data[4] + success_score_data[5] + 0.1
    
    return acceptance_score / diffusion_score

In [None]:
def get_tweets_of_the_day(date, user_id):
    tweets_of_user = df_2019[df_2019.user_id == user_id]
    tweets_of_the_date = tweets_of_user[tweets_of_user.created_at.astype('datetime64[D]') == date]
    
    return tweets_of_the_date

In [None]:
def get_time_series(timeseries ,user_id):
    date = np.datetime64('2019-01-01 00:00:00').astype('datetime64[D]')

    for day in range(365): # 0 to 364
      
        tweets_of_the_day = get_tweets_of_the_day(date, user_id)
        
        timeseries.iloc[day] = get_day_success_score(tweets_of_the_day, date)

        date += np.timedelta64(1, 'D')
    
    return timeseries


In [None]:
user_ids = df_2019.user_id.unique()
for user_id in tqdm(user_ids):
    user_2019[user_id] = get_time_series(user_2019[user_id], user_id)

In [None]:
user_2019.to_csv('../dataset/time_series.csv')