In [1]:
import numpy as np
import pandas as pd
import os
from csv import QUOTE_NONE
from glob import glob
from tqdm import tqdm

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [34]:
# Show the kernel's working directory: the relative "**/*.tsv" glob used to
# locate the data files is resolved against it.
os.getcwd()

'D:\\Desktop\\SUTD\\Term 8\\50.021 AI\\Final Project\\data'

In [3]:
def readTSVData(fileObj) -> pd.DataFrame:
    """Read one header-less, tab-separated tweet dump into a DataFrame.

    Parameters
    ----------
    fileObj
        Path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    pd.DataFrame
        One row per input line with the twelve columns listed in
        ``column_names`` below; rows in which every field is missing are
        removed.

    Raises
    ------
    pandas.errors.ParserError
        If a line has more fields than expected (the pandas default for
        malformed lines).
    """
    column_names = [
        "tweet_id",
        "username",
        "timestamp",
        "#followers",
        "#friends",
        "#favorites",
        "Retweets",
        "Entities",
        "Sentiment",
        "Mentions",
        "Hashtags",
        "URLs",
    ]
    # `error_bad_lines=True` / `warn_bad_lines=True` were only restating the
    # pandas defaults, and both keywords were removed in pandas 2.0 (passing
    # them raises TypeError). Omitting them keeps "raise on malformed line"
    # on every pandas version.
    # QUOTE_NONE: quote characters are treated as ordinary text.
    # `comment` is deliberately left unset - presumably because '#' can occur
    # inside field values; TODO confirm.
    df = pd.read_csv(
        fileObj,
        sep="\t",
        names=column_names,
        skipinitialspace=True,
        skip_blank_lines=True,
        quoting=QUOTE_NONE,
        encoding="utf-8",
    ).sort_index()
    df.dropna(how="all", inplace=True)

    return df

In [4]:
import sys

# Without recursive=True, "**" behaves like "*", so this matches .tsv files
# exactly one directory below the current working directory.
tsvs = glob("**/*.tsv")
df_list = list()
for tsvFilePath in tsvs:
    df = readTSVData(tsvFilePath)
    print('==================')
    print(f"Memory size of df: {sys.getsizeof(df)}")
    # DataFrame.info() prints its report itself and returns None, so it is
    # not wrapped in print().
    df.info()
    df_list.append(df)

# Stack the per-file frames into the single frame the following cells use.
# Previously tsvDfs was left as an empty DataFrame, so the cleaning step had
# no columns to work on. ignore_index=True gives one contiguous 0..N-1 index.
# With no input files the empty-frame fallback is kept so the cell still runs.
tsvDfs = pd.concat(df_list, ignore_index=True) if df_list else pd.DataFrame()

Memory size of df: 5186148618
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8151524 entries, 0 to 8151523
Data columns (total 12 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   tweet_id    int64 
 1   username    object
 2   timestamp   object
 3   #followers  int64 
 4   #friends    int64 
 5   #favorites  int64 
 6   Retweets    int64 
 7   Entities    object
 8   Sentiment   object
 9   Mentions    object
 10  Hashtags    object
 11  URLs        object
dtypes: int64(5), object(7)
memory usage: 808.5+ MB
None
Memory size of df: 1230233704
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1922405 entries, 0 to 1922404
Data columns (total 12 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   tweet_id    int64 
 1   username    object
 2   timestamp   object
 3   #followers  int64 
 4   #friends    int64 
 5   #favorites  int64 
 6   Retweets    int64 
 7   Entities    object
 8   Sentiment   object
 9   Mentions    object
 10  Hashtags    object
 11  URLs

In [37]:
def cleanDataFrame(df: pd.DataFrame) -> pd.DataFrame:
    """
    Remove unusable rows from ``df`` in place and return the same frame.

    A row is removed when its "tweet_id" or "timestamp" is missing, or when
    all four count columns ("#followers", "#friends", "#favorites",
    "Retweets") are missing at once.
    """
    required_fields = ["tweet_id", "timestamp"]
    count_fields = ["#followers", "#friends", "#favorites", "Retweets"]

    # Each rule is (how, columns): "any" -> one missing value is enough,
    # "all" -> every listed column must be missing.
    drop_rules = (("any", required_fields), ("all", count_fields))
    for rule, columns in drop_rules:
        df.dropna(axis=0, how=rule, subset=columns, inplace=True)

    return df

In [38]:
# cleanDataFrame drops rows in place and returns the same object, so
# tsvDfs_cleaned and tsvDfs refer to one frame after this call.
tsvDfs_cleaned = cleanDataFrame(tsvDfs)

In [39]:
def _count_tokens(column: pd.Series) -> pd.Series:
    """Count whitespace-separated tokens per row; "null;" and NaN count as 0."""
    # fillna first so the .str accessor also works when a whole column is NaN.
    cleaned = column.fillna("").str.replace("null;", "", regex=False)
    # .str.len() also handles an empty column, unlike np.vectorize(len).
    return cleaned.str.split().str.len().astype(int)


def pre_process_data(df):
    """Convert cleaned tweet rows into a purely numerical feature table.

    ``df`` is modified in place (feature columns are added, source columns
    are dropped) and the same object is returned.

    Columns read: "timestamp", "Mentions", "Hashtags", "URLs", "Sentiment".
    Columns added:
      * yr_sin / yr_cos     - cyclical encoding of the day of the year
      * time_sin / time_cos - cyclical encoding of the hour of the day
      * nMentions / nHashtags / nURLs - whitespace-separated token counts
      * positive_sentiment / negative_sentiment - the two integers found in
        the "Sentiment" string
    Columns dropped: "timestamp", "Mentions", "Hashtags", "URLs",
    "Sentiment", "tweet_id", "username", "Entities".

    Raises
    ------
    ValueError
        If a timestamp does not match "%a %b %d %H:%M:%S %z %Y" or a
        sentiment token is not an integer.
    KeyError
        If one of the required columns is absent.
    """
    stamps = pd.to_datetime(df["timestamp"], format="%a %b %d %H:%M:%S %z %Y")
    print("Done formatting datetime")

    # Use the real length of each year so that day 366 of a leap year does
    # not overshoot a full turn.
    days_in_year = 365 + stamps.dt.is_leap_year.astype(int)
    year_angle = (stamps.dt.dayofyear / days_in_year) * 2 * np.pi
    df["yr_sin"] = np.sin(year_angle)
    df["yr_cos"] = np.cos(year_angle)

    # The daily cycle has 24 hours. Dividing by 12 made the encoding wrap
    # twice a day, so 00:00 and 12:00 received identical features.
    day_angle = (stamps.dt.hour / 24) * 2 * np.pi
    df["time_sin"] = np.sin(day_angle)
    df["time_cos"] = np.cos(day_angle)

    df.drop(columns=['timestamp'], inplace=True)
    print("Done w timestamps")

    # NOTE(review): counts assume whitespace-separated items in all three
    # columns - confirm this holds for "URLs" in the source data.
    for source, target in (
        ("Mentions", "nMentions"),
        ("Hashtags", "nHashtags"),
        ("URLs", "nURLs"),
    ):
        df[target] = _count_tokens(df[source])
    df.drop(columns=['URLs', "Hashtags", "Mentions"], inplace=True)
    print("Done w numbers")

    # "Sentiment" holds two whitespace-separated integers: first the
    # positive score, then the negative score.
    sentiment_parts = df["Sentiment"].str.split()
    df["positive_sentiment"] = sentiment_parts.str[0].astype(int)
    df["negative_sentiment"] = sentiment_parts.str[1].astype(int)
    df.drop(columns=["Sentiment"], inplace=True)

    print("Dropping unwanted columns...")
    df.drop(columns=['tweet_id', "username", "Entities"], inplace=True)
    print("Done pre processing")

    return df

In [40]:
tsvDfs_pre_processed = pre_process_data(tsvDfs_cleaned)

Done formatting datetime
Done w timestamps
Done w numbers
Dropping unwanted columns...
Done pre processing


In [41]:
def label_classification_data(df):
    """Add a binary "Retweeted" column to ``df`` in place and return ``df``.

    The label is 1 where "Retweets" is greater than zero and 0 otherwise.
    """
    was_retweeted = df["Retweets"].gt(0)
    df["Retweeted"] = was_retweeted.astype(int)
    return df

# Adds the "Retweeted" target column; the frame is modified in place, so
# tsvDfs_labelled is the same object as tsvDfs_pre_processed.
tsvDfs_labelled = label_classification_data(tsvDfs_pre_processed)

In [65]:
# Independent copy of the labelled frame, bound to its own name.
readyData = tsvDfs_labelled.copy()

In [43]:
# Print column dtypes and the memory footprint of the feature table.
readyData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20112480 entries, 0 to 20112479
Data columns (total 14 columns):
 #   Column              Dtype  
---  ------              -----  
 0   #followers          int64  
 1   #friends            int64  
 2   #favorites          int64  
 3   Retweets            int64  
 4   yr_sin              float64
 5   yr_cos              float64
 6   time_sin            float64
 7   time_cos            float64
 8   nMentions           int32  
 9   nHashtags           int32  
 10  nURLs               int32  
 11  positive_sentiment  int32  
 12  negative_sentiment  int32  
 13  Retweeted           int32  
dtypes: float64(4), int32(6), int64(4)
memory usage: 1.8 GB


In [None]:
# Write the feature table to the working directory.
# NOTE(review): to_csv writes the index by default, so the file starts with an
# unnamed index column - confirm downstream readers expect it (else index=False).
readyData.to_csv("numerical_data.csv")