In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

In [2]:
# Column names for the TSV files. They are supplied explicitly through
# `names=` below rather than being read from the files themselves.
header = [
    "Tweet Id", 
    "Username", 
    "Timestamp", 
    "#Followers",
    "#Friends",
    "#Retweets",
    "#Favorites",
    "Entities",
    "Sentiment",
    "Mentions",
    "Hashtags",
    "URLs"]

# NOTE(review): both paths are absolute and user-specific, so this cell only
# runs on the original author's machine without editing.
# NOTE(review): `data` is rebound in a later cell
# (`data = datetime_col(test_data, 'Timestamp')`), so within the visible
# notebook this first load appears to be unused -- confirm before removing.
data = pd.read_csv("/Users/amarjyotkaur/Downloads/TweetsCOV19.tsv.gz", compression='gzip', names=header, sep='\t', quotechar='"')
test_data = pd.read_csv("/Users/amarjyotkaur/Downloads/TweetsCOV19_052020.tsv.gz", compression='gzip', names=header, sep='\t', quotechar='"')


In [3]:
def find_user_stats(dataframe):
    """Summarise follower/friend/retweet/favorite counts per user.

    Prints the number of distinct values in 'Username', then computes the
    per-user minimum, maximum and mean of '#Followers', '#Friends',
    '#Retweets' and '#Favorites'.

    Side effect: the resulting table is pickled to 'user-stats.pkl' in the
    current working directory.

    Returns:
        DataFrame indexed by 'Username' with the columns '<name>_min',
        '<name>_max' and '<name>_mean' (in that order of groups).
    """
    # Report how many distinct users there are
    print(dataframe['Username'].nunique())

    # One group per user, restricted to the numeric count columns
    per_user = dataframe.groupby('Username')[['#Followers', '#Friends', '#Retweets', '#Favorites']]

    # Each aggregate gets its own column suffix so the three can sit side by side
    lowest = per_user.min().add_suffix("_min")
    highest = per_user.max().add_suffix("_max")
    average = per_user.mean().add_suffix("_mean")

    # Combine, persist and hand back the summary
    stats = pd.concat([lowest, highest, average], axis=1)
    stats.to_pickle('user-stats.pkl')
    return stats

# Per-user min/max/mean statistics for the test split. The call also prints the
# unique-user count and writes 'user-stats.pkl' as a side effect.
user_data = find_user_stats(test_data)
user_data

1118001


Unnamed: 0_level_0,#Followers_min,#Friends_min,#Retweets_min,#Favorites_min,#Followers_max,#Friends_max,#Retweets_max,#Favorites_max,#Followers_mean,#Friends_mean,#Retweets_mean,#Favorites_mean
Username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
00000998260226834ffdbdf98ff33eb7,1852,1482,0,0,1852,1482,0,0,1852.0,1482.0,0.0,0.0
000016e54a4dc155432ebad949c2546e,6853,987,0,0,6953,992,0,0,6903.0,989.5,0.0,0.0
00001c34da8eab17b175a9e049078b72,341,350,0,0,341,350,0,0,341.0,350.0,0.0,0.0
00001d45dd97d52b5accb3333e3790e3,854,3012,0,0,854,3012,0,0,854.0,3012.0,0.0,0.0
00003291a067882da356e7f963d3dca8,104,805,0,0,104,805,0,0,104.0,805.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
ffffd2b829300cc638eb4c78c0fc1882,2944,2552,2,0,2987,2553,2,0,2965.5,2552.5,2.0,0.0
ffffd8b7f90bd8937218b42ee841dc20,366,817,0,0,366,817,0,0,366.0,817.0,0.0,0.0
ffffda7501c5f86d5ae850ca7a9fbd1f,1924,925,0,0,1924,925,0,0,1924.0,925.0,0.0,0.0
ffffeca2c4676546be82c9bf9df9c322,335,521,2,3,335,521,2,3,335.0,521.0,2.0,3.0


In [4]:
def datetime_col(dataframe, col_name):
    """Add 'Day of Week' and 'Time Int' columns derived from a timestamp column.

    The timestamps are expected in the form 'Thu Apr 30 22:00:24 +0000 2020'.

    Args:
        dataframe: frame to augment. The new columns are added to this object
            in place and the same object is returned.
        col_name: name of the column holding the timestamp strings.

    Returns:
        The input dataframe with two extra columns:
          * 'Day of Week' - integer code, Sunday = 0 ... Saturday = 6.
          * 'Time Int'    - whole seconds elapsed since 2019-01-01 00:00:00,
                            stored as an integer.

    Raises:
        ValueError: if a timestamp does not match the expected format.
    """
    # Parsing the whole column at once keeps the original row index, so the
    # derived columns line up with their rows even when the index is not 0..n-1.
    stamps = pd.to_datetime(dataframe[col_name], format='%a %b %d %H:%M:%S +0000 %Y')

    # pandas numbers Monday as 0; shift so that Sunday is 0 and Saturday is 6.
    dataframe['Day of Week'] = (stamps.dt.dayofweek + 1) % 7

    elapsed = stamps - pd.Timestamp(2019, 1, 1)
    dataframe['Time Int'] = elapsed.dt.total_seconds().astype('int64')
    return dataframe
    
        

In [5]:
# datetime_col adds its columns to the frame it receives and returns that same
# frame, so from here on `data` and `test_data` are the same DataFrame object.
data = datetime_col(test_data, 'Timestamp')
data

Unnamed: 0,Tweet Id,Username,Timestamp,#Followers,#Friends,#Retweets,#Favorites,Entities,Sentiment,Mentions,Hashtags,URLs,Day of Week,Time Int
0,1255980348229529601,fa5fd446e778da0acba3504aeab23da5,Thu Apr 30 22:00:24 +0000 2020,29697,24040,0,0,null;,1 -1,null;,Opinion Next2blowafrica thoughts,null;,4,41983224
1,1255981220640546816,547501e9cc84b8148ae1b8bde04157a4,Thu Apr 30 22:03:52 +0000 2020,799,1278,4,6,null;,1 -1,null;,null;,null;,4,41983432
2,1255981244560683008,840ac60dab55f6b212dc02dcbe5dfbd6,Thu Apr 30 22:03:58 +0000 2020,586,378,1,2,null;,2 -1,null;,null;,https://www.bbc.com/news/uk-england-beds-bucks...,4,41983438
3,1255981472285986816,37c68a001198b5efd4a21e2b68a0c9bc,Thu Apr 30 22:04:52 +0000 2020,237,168,0,0,null;,1 -1,null;,null;,https://lockdownsceptics.org/2020/04/30/latest...,4,41983492
4,1255981581354905600,8c3620bdfb9d2a1acfdf2412c9b34e06,Thu Apr 30 22:05:18 +0000 2020,423,427,0,0,i hate u:I_Hate_U:-1.8786140035817729;quaranti...,1 -4,null;,null;,null;,4,41983518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912065,1267207472424660992,ae1b1e6bf2a30cd0e1047ddd0baf5ad0,Sun May 31 21:32:59 +0000 2020,15,45,0,0,spotify:Spotify:-0.9407337067771776;wifi:Wi-Fi...,2 -1,null;,null;,null;,0,44659979
1912066,1267207883487354881,0e4323d01d164b9eb6e33f35564c7e25,Sun May 31 21:34:37 +0000 2020,43,931,0,0,china:China:-2.113921624336916;death penalty:C...,1 -2,null;,null;,null;,0,44660077
1912067,1267209309559173122,00fc2c96e4012e27a6eee351723ab461,Sun May 31 21:40:17 +0000 2020,256,451,0,0,null;,2 -1,null;,null;,null;,0,44660417
1912068,1267212987938545667,0f99a3b8b0d490f062215575d074518b,Sun May 31 21:54:54 +0000 2020,1467,1505,0,0,omg:OMG_%28Usher_song%29:-2.580063760606172;,2 -1,lsddrq,null;,null;,0,44661294


In [6]:
# Tokenizing SentiStrength Score
def sentiment_col(dataframe, col_name):
    """Split a 'positive negative' score string into numeric columns.

    Each value of `dataframe[col_name]` is expected to look like '2 -1':
    two integers separated by a single space.

    Args:
        dataframe: frame to augment. Columns are added in place and the same
            object is returned.
        col_name: name of the column holding the score strings.

    Returns:
        The input dataframe with three extra integer columns: 'Positive',
        'Negative' and 'Sentiment Disparity' (positive minus negative).

    Raises:
        ValueError / IndexError: if a value does not contain two integers.
    """
    positive = []
    negative = []
    # Read from the frame that was passed in (not a notebook-level global) and
    # parse every string exactly once.
    for raw in dataframe[col_name]:
        parts = raw.split(' ')
        positive.append(int(parts[0]))
        negative.append(int(parts[1]))

    # Build the series on the frame's own index so rows stay aligned even when
    # the index is not the default 0..n-1 range.
    dataframe["Positive"] = pd.Series(positive, index=dataframe.index, dtype="int64")
    dataframe["Negative"] = pd.Series(negative, index=dataframe.index, dtype="int64")
    dataframe["Sentiment Disparity"] = dataframe["Positive"] - dataframe["Negative"]
    return dataframe

        

In [7]:
# Split the 'Sentiment' string (e.g. "1 -4") into numeric 'Positive' and
# 'Negative' columns plus their difference, 'Sentiment Disparity'.
data = sentiment_col(data, 'Sentiment')
data

Unnamed: 0,Tweet Id,Username,Timestamp,#Followers,#Friends,#Retweets,#Favorites,Entities,Sentiment,Mentions,Hashtags,URLs,Day of Week,Time Int,Positive,Negative,Sentiment Disparity
0,1255980348229529601,fa5fd446e778da0acba3504aeab23da5,Thu Apr 30 22:00:24 +0000 2020,29697,24040,0,0,null;,1 -1,null;,Opinion Next2blowafrica thoughts,null;,4,41983224,1,-1,2
1,1255981220640546816,547501e9cc84b8148ae1b8bde04157a4,Thu Apr 30 22:03:52 +0000 2020,799,1278,4,6,null;,1 -1,null;,null;,null;,4,41983432,1,-1,2
2,1255981244560683008,840ac60dab55f6b212dc02dcbe5dfbd6,Thu Apr 30 22:03:58 +0000 2020,586,378,1,2,null;,2 -1,null;,null;,https://www.bbc.com/news/uk-england-beds-bucks...,4,41983438,2,-1,3
3,1255981472285986816,37c68a001198b5efd4a21e2b68a0c9bc,Thu Apr 30 22:04:52 +0000 2020,237,168,0,0,null;,1 -1,null;,null;,https://lockdownsceptics.org/2020/04/30/latest...,4,41983492,1,-1,2
4,1255981581354905600,8c3620bdfb9d2a1acfdf2412c9b34e06,Thu Apr 30 22:05:18 +0000 2020,423,427,0,0,i hate u:I_Hate_U:-1.8786140035817729;quaranti...,1 -4,null;,null;,null;,4,41983518,1,-4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912065,1267207472424660992,ae1b1e6bf2a30cd0e1047ddd0baf5ad0,Sun May 31 21:32:59 +0000 2020,15,45,0,0,spotify:Spotify:-0.9407337067771776;wifi:Wi-Fi...,2 -1,null;,null;,null;,0,44659979,2,-1,3
1912066,1267207883487354881,0e4323d01d164b9eb6e33f35564c7e25,Sun May 31 21:34:37 +0000 2020,43,931,0,0,china:China:-2.113921624336916;death penalty:C...,1 -2,null;,null;,null;,0,44660077,1,-2,3
1912067,1267209309559173122,00fc2c96e4012e27a6eee351723ab461,Sun May 31 21:40:17 +0000 2020,256,451,0,0,null;,2 -1,null;,null;,null;,0,44660417,2,-1,3
1912068,1267212987938545667,0f99a3b8b0d490f062215575d074518b,Sun May 31 21:54:54 +0000 2020,1467,1505,0,0,omg:OMG_%28Usher_song%29:-2.580063760606172;,2 -1,lsddrq,null;,null;,0,44661294,2,-1,3


In [24]:
def scaledtransform(dataframe, cols):
    """Replace columns with a version divided by a fixed constant.

    Args:
        dataframe: source frame; it is left unmodified.
        cols: iterable of (column_name, divisor) pairs.

    Returns:
        A new DataFrame in which every listed column has been removed and a
        'scaled_<column_name>' column holding `value / divisor` has been
        appended, in the order given by `cols`.
    """
    cols = list(cols)
    # `columns=` is used because passing the axis positionally
    # (`drop(col, 1)`) is rejected by pandas 2.0 and later.
    out = dataframe.drop(columns=[name for name, _ in cols])
    for name, divisor in cols:
        out["scaled_" + name] = dataframe[name] / divisor
    return out


def logtransform(dataframe,cols):
    """Replace columns with log10(value + 1).

    Values are converted to integers first, so numeric strings such as
    '41983224' are accepted and floats are truncated.

    Args:
        dataframe: source frame; it is left unmodified.
        cols: iterable of column names to transform.

    Returns:
        A new DataFrame in which every listed column has been removed and a
        'log_<column_name>' column has been appended, in the order given.

    Raises:
        ValueError: if a value cannot be converted to an integer.
    """
    cols = list(cols)
    # `columns=` is used because passing the axis positionally
    # (`drop(col, 1)`) is rejected by pandas 2.0 and later.
    out = dataframe.drop(columns=cols)
    for name in cols:
        # +1 keeps zero counts finite: log10(0 + 1) == 0
        out["log_" + name] = np.log10(dataframe[name].astype("int64") + 1)
    return out

def ohetransform(dataframe, cols):
    """Replace columns with a list-valued one-hot encoding.

    Args:
        dataframe: source frame; it is left unmodified.
        cols: iterable of column names to encode.

    Returns:
        A new DataFrame in which every listed column has been removed and an
        'ohe_<column_name>' column has been appended. Each cell holds a list of
        0/1 integers with one position per distinct value found in the column
        (positions follow the sorted order of those values).

    Note:
        The vector length depends on the values present in `dataframe`; a
        category that never occurs gets no position.
    """
    cols = list(cols)
    # `columns=` is used because passing the axis positionally
    # (`drop(col, 1)`) is rejected by pandas 2.0 and later.
    out = dataframe.drop(columns=cols)
    for name in cols:
        # dtype=int keeps the indicators as 0/1; newer pandas would otherwise
        # produce True/False.
        indicators = pd.get_dummies(dataframe[name], dtype=int)
        out["ohe_" + name] = indicators.values.tolist()
    return out

def unpackcol(dataframe,cols):
    """Expand list-valued columns into one scalar column per list position.

    For every name in `cols`, the column is removed and replaced by columns
    '<name>_1', '<name>_2', ... appended at the end of the frame. The number
    of new columns is taken from the length of the first row's list.

    Returns:
        A new DataFrame; rows keep their original index.
    """
    for name in cols:
        # The first row decides how many positions there are
        width = len(dataframe[name].values[0])
        labels = [f'{name}_{pos}' for pos in range(1, width + 1)]
        expanded = pd.DataFrame(dataframe[name].tolist(), columns=labels, index=dataframe.index)
        remaining = dataframe.drop(name, axis=1)
        dataframe = pd.concat([remaining, expanded], axis=1, join='inner')
    return dataframe

In [9]:
def entities_col(dataframe, col_name):
    """Add a 'No. of Entities' column counting ';'-separated entities.

    Each value of `dataframe[col_name]` is a string of entities, each followed
    by ';' (for example 'a:A:-1.2;b:B:-0.3;'). The placeholder 'null;' means
    "no entities".

    Args:
        dataframe: frame to augment. The column is added in place and the same
            object is returned.
        col_name: name of the column holding the entity strings.

    Returns:
        The input dataframe with an integer 'No. of Entities' column. Missing
        values, empty strings and the 'null' placeholder all count as 0.
    """
    def _count(raw):
        # Count the entities in one cell.
        if not isinstance(raw, str):
            return 0
        pieces = raw.split(";")
        # Only discard the final piece when it is the empty string left behind
        # by a trailing ';'; otherwise it is a real entity.
        if pieces[-1] == "":
            pieces.pop()
        if not pieces or pieces[0] == 'null':
            return 0
        return len(pieces)

    dataframe['No. of Entities'] = [_count(raw) for raw in dataframe[col_name]]
    return dataframe

In [10]:
# Count the ';'-separated entries of 'Entities' into 'No. of Entities'
# ('null;' rows get 0).
data = entities_col(data, 'Entities')
data

Unnamed: 0,Tweet Id,Username,Timestamp,#Followers,#Friends,#Retweets,#Favorites,Entities,Sentiment,Mentions,Hashtags,URLs,Day of Week,Time Int,Positive,Negative,Sentiment Disparity,No. of Entities
0,1255980348229529601,fa5fd446e778da0acba3504aeab23da5,Thu Apr 30 22:00:24 +0000 2020,29697,24040,0,0,null;,1 -1,null;,Opinion Next2blowafrica thoughts,null;,4,41983224,1,-1,2,0
1,1255981220640546816,547501e9cc84b8148ae1b8bde04157a4,Thu Apr 30 22:03:52 +0000 2020,799,1278,4,6,null;,1 -1,null;,null;,null;,4,41983432,1,-1,2,0
2,1255981244560683008,840ac60dab55f6b212dc02dcbe5dfbd6,Thu Apr 30 22:03:58 +0000 2020,586,378,1,2,null;,2 -1,null;,null;,https://www.bbc.com/news/uk-england-beds-bucks...,4,41983438,2,-1,3,0
3,1255981472285986816,37c68a001198b5efd4a21e2b68a0c9bc,Thu Apr 30 22:04:52 +0000 2020,237,168,0,0,null;,1 -1,null;,null;,https://lockdownsceptics.org/2020/04/30/latest...,4,41983492,1,-1,2,0
4,1255981581354905600,8c3620bdfb9d2a1acfdf2412c9b34e06,Thu Apr 30 22:05:18 +0000 2020,423,427,0,0,i hate u:I_Hate_U:-1.8786140035817729;quaranti...,1 -4,null;,null;,null;,4,41983518,1,-4,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912065,1267207472424660992,ae1b1e6bf2a30cd0e1047ddd0baf5ad0,Sun May 31 21:32:59 +0000 2020,15,45,0,0,spotify:Spotify:-0.9407337067771776;wifi:Wi-Fi...,2 -1,null;,null;,null;,0,44659979,2,-1,3,3
1912066,1267207883487354881,0e4323d01d164b9eb6e33f35564c7e25,Sun May 31 21:34:37 +0000 2020,43,931,0,0,china:China:-2.113921624336916;death penalty:C...,1 -2,null;,null;,null;,0,44660077,1,-2,3,3
1912067,1267209309559173122,00fc2c96e4012e27a6eee351723ab461,Sun May 31 21:40:17 +0000 2020,256,451,0,0,null;,2 -1,null;,null;,null;,0,44660417,2,-1,3,0
1912068,1267212987938545667,0f99a3b8b0d490f062215575d074518b,Sun May 31 21:54:54 +0000 2020,1467,1505,0,0,omg:OMG_%28Usher_song%29:-2.580063760606172;,2 -1,lsddrq,null;,null;,0,44661294,2,-1,3,1


In [11]:
import logging
import numpy as np
import gensim.downloader

from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# Emit log records of level INFO and above, formatted as
# "<time> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [12]:
#LOAD EMBEDDINGS
hashtag_embeddings = Word2Vec.load('/Users/amarjyotkaur/Downloads/hashtag_embeddings (1)')
mention_embeddings = Word2Vec.load('/Users/amarjyotkaur/Downloads/mention_embeddings')
hashtags = hashtag_embeddings.wv.index_to_key
mentions = mention_embeddings.wv.index_to_key


2021-08-06 14:54:01,696 : INFO : loading Word2Vec object from /Users/amarjyotkaur/Downloads/hashtag_embeddings (1)
2021-08-06 14:54:01,701 : INFO : loading wv recursively from /Users/amarjyotkaur/Downloads/hashtag_embeddings (1).wv.* with mmap=None
2021-08-06 14:54:01,701 : INFO : setting ignored attribute cum_table to None
2021-08-06 14:54:01,705 : INFO : Word2Vec lifecycle event {'fname': '/Users/amarjyotkaur/Downloads/hashtag_embeddings (1)', 'datetime': '2021-08-06T14:54:01.705929', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 12:59:45) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'loaded'}
2021-08-06 14:54:01,706 : INFO : loading Word2Vec object from /Users/amarjyotkaur/Downloads/mention_embeddings
2021-08-06 14:54:01,708 : INFO : loading wv recursively from /Users/amarjyotkaur/Downloads/mention_embeddings.wv.* with mmap=None
2021-08-06 14:54:01,709 : INFO : setting ignored attribute cum_table to None
2021-08-06 14:54:01,714 : INFO : Word

In [13]:
def hashtag_col(dataframe, col_name, embeddings=None):
    """Add one 'Hashtag Emb<i>' column per embedding dimension.

    Every value of `dataframe[col_name]` is looked up, as a single key, in the
    embedding vocabulary. Rows whose value is the placeholder 'null;' or is not
    in the vocabulary receive an all-zero vector.

    Args:
        dataframe: frame to augment. Columns are added in place and the same
            object is returned.
        col_name: name of the column holding the hashtag strings.
        embeddings: optional already-loaded model exposing `.wv` (with
            `key_to_index`, `vector_size` and item lookup). When omitted, the
            model is loaded from the notebook's default local path.

    Returns:
        The input dataframe with columns 'Hashtag Emb0' ...
        'Hashtag Emb<vector_size - 1>'.

    NOTE(review): the whole cell is used as the lookup key, so a value listing
    several hashtags only matches if that exact string is in the vocabulary.
    """
    if embeddings is None:
        #LOAD EMBEDDINGS
        embeddings = Word2Vec.load('/Users/amarjyotkaur/Downloads/hashtag_embeddings (1)')
    vectors = embeddings.wv
    # Take the width from the model so the zero vector always matches it and
    # the column count is known even when no row matches.
    dim = vectors.vector_size
    # Dict membership is constant-time; scanning the key list for every row
    # was linear in the vocabulary size.
    vocab = vectors.key_to_index
    blank = np.zeros(dim)

    rows = [
        vectors[tag] if (tag != "null;" and tag in vocab) else blank
        for tag in dataframe[col_name]
    ]
    matrix = np.vstack(rows) if rows else np.empty((0, dim))

    # Arrays are assigned by position, so the result does not depend on the
    # frame having a default 0..n-1 index.
    for k in range(dim):
        dataframe['Hashtag Emb'+str(k)] = matrix[:, k]
    return dataframe

def mention_col(dataframe, col_name, embeddings=None):
    """Add one 'Mention Emb<i>' column per embedding dimension.

    Every value of `dataframe[col_name]` is looked up, as a single key, in the
    embedding vocabulary. Rows whose value is the placeholder 'null;' or is not
    in the vocabulary receive an all-zero vector.

    Args:
        dataframe: frame to augment. Columns are added in place and the same
            object is returned.
        col_name: name of the column holding the mention strings.
        embeddings: optional already-loaded model exposing `.wv` (with
            `key_to_index`, `vector_size` and item lookup). When omitted, the
            model is loaded from the notebook's default local path.

    Returns:
        The input dataframe with columns 'Mention Emb0' ...
        'Mention Emb<vector_size - 1>'.

    NOTE(review): the whole cell is used as the lookup key, so a value listing
    several mentions only matches if that exact string is in the vocabulary.
    """
    if embeddings is None:
        #LOAD EMBEDDINGS
        embeddings = Word2Vec.load('/Users/amarjyotkaur/Downloads/mention_embeddings')
    vectors = embeddings.wv
    # Take the width from the model so the zero vector always matches it and
    # the column count is known even when no row matches.
    dim = vectors.vector_size
    # Dict membership is constant-time; scanning the key list for every row
    # was linear in the vocabulary size.
    vocab = vectors.key_to_index
    blank = np.zeros(dim)

    rows = [
        vectors[name] if (name != "null;" and name in vocab) else blank
        for name in dataframe[col_name]
    ]
    matrix = np.vstack(rows) if rows else np.empty((0, dim))

    # Arrays are assigned by position, so the result does not depend on the
    # frame having a default 0..n-1 index.
    for k in range(dim):
        dataframe['Mention Emb'+str(k)] = matrix[:, k]
    return dataframe
    



In [14]:
# Both helpers add their embedding columns to `data` itself, which is why the
# first call's return value can be discarded. The second call returns that same
# frame, and that is what the cell displays.
hashtag_col(data, 'Hashtags')
mention_col(data, "Mentions")

2021-08-06 14:54:01,727 : INFO : loading Word2Vec object from /Users/amarjyotkaur/Downloads/hashtag_embeddings (1)
2021-08-06 14:54:01,728 : INFO : loading wv recursively from /Users/amarjyotkaur/Downloads/hashtag_embeddings (1).wv.* with mmap=None
2021-08-06 14:54:01,728 : INFO : setting ignored attribute cum_table to None
2021-08-06 14:54:01,732 : INFO : Word2Vec lifecycle event {'fname': '/Users/amarjyotkaur/Downloads/hashtag_embeddings (1)', 'datetime': '2021-08-06T14:54:01.732876', 'gensim': '4.0.1', 'python': '3.8.8 (default, Apr 13 2021, 12:59:45) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'loaded'}
2021-08-06 14:54:28,091 : INFO : loading Word2Vec object from /Users/amarjyotkaur/Downloads/mention_embeddings
2021-08-06 14:54:28,092 : INFO : loading wv recursively from /Users/amarjyotkaur/Downloads/mention_embeddings.wv.* with mmap=None
2021-08-06 14:54:28,092 : INFO : setting ignored attribute cum_table to None
2021-08-06 14:54:28,097 : INFO : Word

Unnamed: 0,Tweet Id,Username,Timestamp,#Followers,#Friends,#Retweets,#Favorites,Entities,Sentiment,Mentions,...,Mention Emb15,Mention Emb16,Mention Emb17,Mention Emb18,Mention Emb19,Mention Emb20,Mention Emb21,Mention Emb22,Mention Emb23,Mention Emb24
0,1255980348229529601,fa5fd446e778da0acba3504aeab23da5,Thu Apr 30 22:00:24 +0000 2020,29697,24040,0,0,null;,1 -1,null;,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1255981220640546816,547501e9cc84b8148ae1b8bde04157a4,Thu Apr 30 22:03:52 +0000 2020,799,1278,4,6,null;,1 -1,null;,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1255981244560683008,840ac60dab55f6b212dc02dcbe5dfbd6,Thu Apr 30 22:03:58 +0000 2020,586,378,1,2,null;,2 -1,null;,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1255981472285986816,37c68a001198b5efd4a21e2b68a0c9bc,Thu Apr 30 22:04:52 +0000 2020,237,168,0,0,null;,1 -1,null;,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1255981581354905600,8c3620bdfb9d2a1acfdf2412c9b34e06,Thu Apr 30 22:05:18 +0000 2020,423,427,0,0,i hate u:I_Hate_U:-1.8786140035817729;quaranti...,1 -4,null;,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912065,1267207472424660992,ae1b1e6bf2a30cd0e1047ddd0baf5ad0,Sun May 31 21:32:59 +0000 2020,15,45,0,0,spotify:Spotify:-0.9407337067771776;wifi:Wi-Fi...,2 -1,null;,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1912066,1267207883487354881,0e4323d01d164b9eb6e33f35564c7e25,Sun May 31 21:34:37 +0000 2020,43,931,0,0,china:China:-2.113921624336916;death penalty:C...,1 -2,null;,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1912067,1267209309559173122,00fc2c96e4012e27a6eee351723ab461,Sun May 31 21:40:17 +0000 2020,256,451,0,0,null;,2 -1,null;,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1912068,1267212987938545667,0f99a3b8b0d490f062215575d074518b,Sun May 31 21:54:54 +0000 2020,1467,1505,0,0,omg:OMG_%28Usher_song%29:-2.580063760606172;,2 -1,lsddrq,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Remove the raw columns from `data` in place. Features derived from Timestamp,
# Entities, Sentiment, Mentions and Hashtags were added by the cells above;
# 'Tweet Id' and 'URLs' are discarded without a derived feature.
for x in ['Tweet Id', 'Timestamp','Entities','Sentiment', 'Mentions', 'Hashtags', 'URLs']:
    del data[x]
data

Unnamed: 0,Username,#Followers,#Friends,#Retweets,#Favorites,Day of Week,Time Int,Positive,Negative,Sentiment Disparity,...,Mention Emb15,Mention Emb16,Mention Emb17,Mention Emb18,Mention Emb19,Mention Emb20,Mention Emb21,Mention Emb22,Mention Emb23,Mention Emb24
0,fa5fd446e778da0acba3504aeab23da5,29697,24040,0,0,4,41983224,1,-1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,547501e9cc84b8148ae1b8bde04157a4,799,1278,4,6,4,41983432,1,-1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,840ac60dab55f6b212dc02dcbe5dfbd6,586,378,1,2,4,41983438,2,-1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,37c68a001198b5efd4a21e2b68a0c9bc,237,168,0,0,4,41983492,1,-1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8c3620bdfb9d2a1acfdf2412c9b34e06,423,427,0,0,4,41983518,1,-4,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912065,ae1b1e6bf2a30cd0e1047ddd0baf5ad0,15,45,0,0,0,44659979,2,-1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1912066,0e4323d01d164b9eb6e33f35564c7e25,43,931,0,0,0,44660077,1,-2,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1912067,00fc2c96e4012e27a6eee351723ab461,256,451,0,0,0,44660417,2,-1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1912068,0f99a3b8b0d490f062215575d074518b,1467,1505,0,0,0,44661294,2,-1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Divide each sentiment column by a fixed constant. Dividing 'Negative' by -5
# also flips its sign, so all three scaled columns come out non-negative in the
# displayed rows.
# NOTE(review): 5 / -5 / 10 presumably reflect the score range -- confirm.
data=scaledtransform(data,[("Positive",5),("Negative",-5), ("Sentiment Disparity",10)])
data

Unnamed: 0,Username,#Followers,#Friends,#Retweets,#Favorites,Day of Week,Time Int,No. of Entities,Hashtag Emb0,Hashtag Emb1,...,Mention Emb18,Mention Emb19,Mention Emb20,Mention Emb21,Mention Emb22,Mention Emb23,Mention Emb24,scaled_Positive,scaled_Negative,scaled_Sentiment Disparity
0,fa5fd446e778da0acba3504aeab23da5,29697,24040,0,0,4,41983224,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2
1,547501e9cc84b8148ae1b8bde04157a4,799,1278,4,6,4,41983432,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2
2,840ac60dab55f6b212dc02dcbe5dfbd6,586,378,1,2,4,41983438,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.2,0.3
3,37c68a001198b5efd4a21e2b68a0c9bc,237,168,0,0,4,41983492,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2
4,8c3620bdfb9d2a1acfdf2412c9b34e06,423,427,0,0,4,41983518,2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.8,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912065,ae1b1e6bf2a30cd0e1047ddd0baf5ad0,15,45,0,0,0,44659979,3,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.2,0.3
1912066,0e4323d01d164b9eb6e33f35564c7e25,43,931,0,0,0,44660077,3,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.4,0.3
1912067,00fc2c96e4012e27a6eee351723ab461,256,451,0,0,0,44660417,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.2,0.3
1912068,0f99a3b8b0d490f062215575d074518b,1467,1505,0,0,0,44661294,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.2,0.3


In [17]:
# Replace the count columns and 'Time Int' with log10(value + 1); the original
# columns are dropped and 'log_<name>' columns are appended.
data=logtransform(data,["#Retweets","#Followers", "#Friends", "No. of Entities","#Favorites", "Time Int"])
data

Unnamed: 0,Username,Day of Week,Hashtag Emb0,Hashtag Emb1,Hashtag Emb2,Hashtag Emb3,Hashtag Emb4,Hashtag Emb5,Hashtag Emb6,Hashtag Emb7,...,Mention Emb24,scaled_Positive,scaled_Negative,scaled_Sentiment Disparity,log_#Retweets,log_#Followers,log_#Friends,log_No. of Entities,log_#Favorites,log_Time Int
0,fa5fd446e778da0acba3504aeab23da5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.2,0.2,0.00000,4.472727,4.380953,0.000000,0.000000,7.623076
1,547501e9cc84b8148ae1b8bde04157a4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.2,0.2,0.69897,2.903090,3.106871,0.000000,0.845098,7.623078
2,840ac60dab55f6b212dc02dcbe5dfbd6,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.4,0.2,0.3,0.30103,2.768638,2.578639,0.000000,0.477121,7.623078
3,37c68a001198b5efd4a21e2b68a0c9bc,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.2,0.2,0.00000,2.376577,2.227887,0.000000,0.000000,7.623079
4,8c3620bdfb9d2a1acfdf2412c9b34e06,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.8,0.5,0.00000,2.627366,2.631444,0.477121,0.000000,7.623079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912065,ae1b1e6bf2a30cd0e1047ddd0baf5ad0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.4,0.2,0.3,0.00000,1.204120,1.662758,0.602060,0.000000,7.649919
1912066,0e4323d01d164b9eb6e33f35564c7e25,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.4,0.3,0.00000,1.643453,2.969416,0.602060,0.000000,7.649919
1912067,00fc2c96e4012e27a6eee351723ab461,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.4,0.2,0.3,0.00000,2.409933,2.655138,0.000000,0.000000,7.649923
1912068,0f99a3b8b0d490f062215575d074518b,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.4,0.2,0.3,0.00000,3.166726,3.177825,0.301030,0.000000,7.649931


In [18]:
# One-hot encode 'Day of Week' into a single list-valued column,
# 'ohe_Day of Week'.
data = ohetransform(data, ["Day of Week"])
data

Unnamed: 0,Username,Hashtag Emb0,Hashtag Emb1,Hashtag Emb2,Hashtag Emb3,Hashtag Emb4,Hashtag Emb5,Hashtag Emb6,Hashtag Emb7,Hashtag Emb8,...,scaled_Positive,scaled_Negative,scaled_Sentiment Disparity,log_#Retweets,log_#Followers,log_#Friends,log_No. of Entities,log_#Favorites,log_Time Int,ohe_Day of Week
0,fa5fd446e778da0acba3504aeab23da5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.2,0.2,0.00000,4.472727,4.380953,0.000000,0.000000,7.623076,"[0, 0, 0, 0, 1, 0, 0]"
1,547501e9cc84b8148ae1b8bde04157a4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.2,0.2,0.69897,2.903090,3.106871,0.000000,0.845098,7.623078,"[0, 0, 0, 0, 1, 0, 0]"
2,840ac60dab55f6b212dc02dcbe5dfbd6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.4,0.2,0.3,0.30103,2.768638,2.578639,0.000000,0.477121,7.623078,"[0, 0, 0, 0, 1, 0, 0]"
3,37c68a001198b5efd4a21e2b68a0c9bc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.2,0.2,0.00000,2.376577,2.227887,0.000000,0.000000,7.623079,"[0, 0, 0, 0, 1, 0, 0]"
4,8c3620bdfb9d2a1acfdf2412c9b34e06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.8,0.5,0.00000,2.627366,2.631444,0.477121,0.000000,7.623079,"[0, 0, 0, 0, 1, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912065,ae1b1e6bf2a30cd0e1047ddd0baf5ad0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.4,0.2,0.3,0.00000,1.204120,1.662758,0.602060,0.000000,7.649919,"[1, 0, 0, 0, 0, 0, 0]"
1912066,0e4323d01d164b9eb6e33f35564c7e25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.4,0.3,0.00000,1.643453,2.969416,0.602060,0.000000,7.649919,"[1, 0, 0, 0, 0, 0, 0]"
1912067,00fc2c96e4012e27a6eee351723ab461,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.4,0.2,0.3,0.00000,2.409933,2.655138,0.000000,0.000000,7.649923,"[1, 0, 0, 0, 0, 0, 0]"
1912068,0f99a3b8b0d490f062215575d074518b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.4,0.2,0.3,0.00000,3.166726,3.177825,0.301030,0.000000,7.649931,"[1, 0, 0, 0, 0, 0, 0]"


In [25]:
# Spread the one-hot list into separate columns. unpackcol numbers them from 1,
# so 'ohe_Day of Week_1' is the first position of the list (day code 0, Sunday,
# in the rows shown below).
# NOTE(review): there are seven columns only because every weekday occurs in
# this data -- verify for other inputs.
data = unpackcol(data, ["ohe_Day of Week"])
data

Unnamed: 0,Username,Hashtag Emb0,Hashtag Emb1,Hashtag Emb2,Hashtag Emb3,Hashtag Emb4,Hashtag Emb5,Hashtag Emb6,Hashtag Emb7,Hashtag Emb8,...,log_No. of Entities,log_#Favorites,log_Time Int,ohe_Day of Week_1,ohe_Day of Week_2,ohe_Day of Week_3,ohe_Day of Week_4,ohe_Day of Week_5,ohe_Day of Week_6,ohe_Day of Week_7
0,fa5fd446e778da0acba3504aeab23da5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,7.623076,0,0,0,0,1,0,0
1,547501e9cc84b8148ae1b8bde04157a4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.845098,7.623078,0,0,0,0,1,0,0
2,840ac60dab55f6b212dc02dcbe5dfbd6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.477121,7.623078,0,0,0,0,1,0,0
3,37c68a001198b5efd4a21e2b68a0c9bc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,7.623079,0,0,0,0,1,0,0
4,8c3620bdfb9d2a1acfdf2412c9b34e06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.477121,0.000000,7.623079,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912065,ae1b1e6bf2a30cd0e1047ddd0baf5ad0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.602060,0.000000,7.649919,1,0,0,0,0,0,0
1912066,0e4323d01d164b9eb6e33f35564c7e25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.602060,0.000000,7.649919,1,0,0,0,0,0,0
1912067,00fc2c96e4012e27a6eee351723ab461,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,7.649923,1,0,0,0,0,0,0
1912068,0f99a3b8b0d490f062215575d074518b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.301030,0.000000,7.649931,1,0,0,0,0,0,0


In [None]:
#data = data.drop(["Username"],1)
#data

In [None]:
# Save the processed frame. 'Username' is still present because the cell that
# would drop it is commented out.
# NOTE(review): the capital 'D' in the filename looks unintentional -- confirm
# what downstream readers expect before renaming.
data.to_pickle("test_data_processeD.pkl")

In [None]:
# Save the same `data` frame, which still contains the 'Username' column, under
# a second filename.
data.to_pickle("data_processed_WITH_USERNAME.pkl")