# W266 Russian Troll Detection Project

# Data Pre-Processing Notebook, Part 1 of 2

This notebook takes our initial data and performs all the preprocessing steps required for it to be trained in an ML algorithm (e.g., an LSTM neural network).

In [1]:
#Import stuff
import numpy as np
import pandas as pd
from pandas.io import gbq
import sys
import regex as re
from csv import reader
from collections import defaultdict

import ast  #Used to convert strings that look like lists and convert them to actual lists

import timeit  #For timing

In [2]:
#Set Global Options
pd.set_option('display.max_columns', 500)

# Import Data

We use Google Big Query to import our data into 4 pandas dataframes:

  1. Genuine Tweets
  2. Bot Tweets
  3. Genuine Users
  4. Bot Users

In [3]:
#Set GBQ Project ID
project_id='w266-nlp'

#Define query
def gbq_execute(query):
    return gbq.read_gbq(query=query, dialect ='standard', project_id=project_id)

## Get Genuine Tweets

In [4]:
#Get Genuine Tweets
### This cell takes a while to run (about 7 minutes) ###
time_start = timeit.default_timer()

query="""
SELECT id,
    text,
    user_id,
    in_reply_to_status_id,
    retweeted_status_id,
    retweet_count,
    favorite_count,
    num_hashtags,
    num_urls,
    num_mentions,
    created_at

FROM `genuine_accounts.tweets`
"""
df_gentweets = gbq_execute(query)

#Timing
time_end = timeit.default_timer()
print('Time to run query: {:.1f} seconds'.format(time_end - time_start))

df_gentweets



Time to run query: 460.6 seconds


Unnamed: 0,id,text,user_id,in_reply_to_status_id,retweeted_status_id,retweet_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at
0,590401415559917568,#bouncy with the #bubbles #godfreyspubcrawl #c...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,4.0,1.0,0.0,Tue Apr 21 06:27:21 +0000 2015
1,589654081049731073,#RIP KJ. My life would be a whole lot differen...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,1.0,0.0,0.0,Sun Apr 19 04:57:42 +0000 2015
2,588877648891285505,#tbt 2001 ish #vegas http://t.co/S5ONJ2DkVa,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,2.0,0.0,0.0,Fri Apr 17 01:32:26 +0000 2015
3,580866760070549504,#TRUTH CHOOSE #Happy. It's awesome! Abi #sorry...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,6.0,1.0,0.0,Wed Mar 25 23:00:02 +0000 2015
4,581642720592023552,I'll be in #NorthBeach tomorrow night. Messag...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,1.0,3.0,0.0,0.0,Sat Mar 28 02:23:25 +0000 2015
5,582747063667933184,Apparently shooting a bb gun in the dark with...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,1.0,1.0,0.0,Tue Mar 31 03:31:41 +0000 2015
6,582359963995078656,Auntie Florence #92 #sassy in #leopard #sanfr...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,5.0,1.0,0.0,Mon Mar 30 01:53:29 +0000 2015
7,581985393395945472,Here I am. So happy http://t.co/FPhf0nllBW,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,Sun Mar 29 01:05:05 +0000 2015
8,575395864060940290,I want ice cream #planetfitness #blah #whiner ...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,5.0,0.0,0.0,Tue Mar 10 20:40:38 +0000 2015
9,570847926450307072,I could do this every damn day. #hike #presidi...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,6.0,1.0,0.0,Thu Feb 26 07:28:46 +0000 2015


In [5]:
#Add Target
df_gentweets['target'] = 0

#Change id column to tweet_id (to match bots and to be more explicit)
df_gentweets.rename(columns={'id': 'tweet_id'}, inplace=True)

In [6]:
#Inspect column types and memory usage
df_gentweets.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2799999 entries, 0 to 2799998
Data columns (total 12 columns):
tweet_id                 int64
text                     object
user_id                  float64
in_reply_to_status_id    float64
retweeted_status_id      float64
retweet_count            float64
favorite_count           float64
num_hashtags             float64
num_urls                 float64
num_mentions             float64
created_at               object
target                   int64
dtypes: float64(8), int64(2), object(2)
memory usage: 833.3 MB


In [7]:
#Convert a bunch of columns:
convert_to_float64 = ['in_reply_to_status_id', 'retweeted_status_id', 'num_mentions']
convert_to_float32 = ['retweet_count', 'favorite_count', 'num_hashtags', 'num_urls']
convert_to_int8 = ['target']

for item in convert_to_float64:
    df_gentweets[item] = df_gentweets[item].astype(np.float64)

for item in convert_to_float32:
    df_gentweets[item] = df_gentweets[item].astype(np.float32)
    
for item in convert_to_int8:
    df_gentweets[item] = df_gentweets[item].astype(np.int8)
    
#Drop a high memory usage column
### Maybe add back in once we can convert to a datetime object (surely these take less memory than text ...)
df_gentweets = df_gentweets.drop(columns=['created_at'])

#Re-check object types and memory
df_gentweets.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2799999 entries, 0 to 2799998
Data columns (total 11 columns):
tweet_id                 int64
text                     object
user_id                  float64
in_reply_to_status_id    float64
retweeted_status_id      float64
retweet_count            float32
favorite_count           float32
num_hashtags             float32
num_urls                 float32
num_mentions             float64
target                   int8
dtypes: float32(4), float64(4), int64(1), int8(1), object(1)
memory usage: 539.6 MB


In [8]:
df_gentweets.describe()

Unnamed: 0,tweet_id,user_id,in_reply_to_status_id,retweeted_status_id,retweet_count,favorite_count,num_hashtags,num_urls,num_mentions,target
count,2799999.0,2799999.0,2799999.0,2799999.0,2799999.0,2799999.0,2799999.0,2799999.0,2799999.0,2799999.0
mean,5.077163e+17,688254900.0,1.357087e+17,1.360882e+17,846.7202,0.466547,0.2477533,0.1503083,0.7708767,0.0
std,1.102379e+17,814440200.0,2.326975e+17,2.342418e+17,16661.81,5.916929,0.7242683,0.3664264,0.8776832,0.0
min,6460111.0,678033.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.782673e+17,26528110.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.508972e+17,233444500.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,5.80095e+17,612711600.0,3.07583e+17,3.047875e+17,1.0,0.0,0.0,0.0,1.0,0.0
max,5.941405e+17,2351779000.0,5.941393e+17,5.941401e+17,3350111.0,4278.0,28.0,5.0,19.0,0.0


## Get Bot Tweets

In [9]:
#Get Bot Tweets
time_start = timeit.default_timer()

query="""
SELECT tweet_id,
    text,
    user_id,
    in_reply_to_status_id,
    retweeted_status_id,
    retweet_count,
    favorite_count,
    hashtags,
    expanded_urls,
    mentions,
    created_at

FROM `bots.tweets`
"""
df_bottweets = gbq_execute(query)

#Timing
time_end = timeit.default_timer()
print('Time to run query: {:.0f} seconds'.format(time_end - time_start))

df_bottweets

Time to run query: 35 seconds


Unnamed: 0,tweet_id,text,user_id,in_reply_to_status_id,retweeted_status_id,retweet_count,favorite_count,hashtags,expanded_urls,mentions,created_at
0,564839233957818369,"Either, we heal, now, as a team, or we will di...",,,,,,[],[],[],1423503142000
1,676409305944469504,RT @AmyMek: Shaker Aamer is not even a citizen...,,,,,,[],[],[],1450103520000
2,564822564556382208,https://t.co/5jtxjrQrKa just watch it),,,,,,[],[],[],1423499167000
3,614985782739087360,"RT @_wintergirl93: Dude, seriously? Go away. h...",,,,,,[],[],[],1435459011000
4,578247211655434240,What is he doing?? http://t.co/w19JHah4pV,,,,,,[],[],[],1426699853000
5,689503880091021312,RT @BringTheFlag: Iran to Begin Construction o...,,,,,,[],[],[],1453225510000
6,564830562670501888,"We turn, not older with years, but newer every...",,,,,,[],[],[],1423501074000
7,564821530824015873,To be able to say how much you love is to love...,,,,,,[],[],[],1423498921000
8,673889392025600001,RT @keriqbaum: Mondays be like... https://t.co...,,,,,,[],[],[],1449502725000
9,554292771658268674,Literally why people voted for him http://t.co...,,,,,,[],[],[],1420988669000


In [10]:
#Add Target
df_bottweets['target'] = 1

#Change some nulls to 0 (to match genuine tweets format)
fillna_cols = ['tweet_id', 'user_id', 'in_reply_to_status_id', 
               'retweeted_status_id', 'retweet_count', 'favorite_count']

for col in fillna_cols:
    df_bottweets[col].fillna(0, inplace=True)

### _Rationale for Converting NAs to 0_

Although we recognize changing NAs to 0 reduces fidelity of the original data, because only the Russian Troll tweets contain NAs, we don't want to give our ML algorithms any 'unfair advantage' to learn that every tweet with an NA value must be a Russian Troll.  This is a side effect (that we can live with) of not being able to source the data ourselves, instead having to rely on data from two different sources.

In [11]:
#Inspect column types and memory usage
df_bottweets.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203482 entries, 0 to 203481
Data columns (total 12 columns):
tweet_id                 203482 non-null int64
text                     203482 non-null object
user_id                  203482 non-null int64
in_reply_to_status_id    203482 non-null int64
retweeted_status_id      203482 non-null int64
retweet_count            203482 non-null int64
favorite_count           203482 non-null int64
hashtags                 203482 non-null object
expanded_urls            203482 non-null object
mentions                 203482 non-null object
created_at               203461 non-null object
target                   203482 non-null int64
dtypes: int64(7), object(5)
memory usage: 102.9 MB


In [12]:
#Convert a bunch of columns:
convert_to_float64 = ['in_reply_to_status_id', 'retweeted_status_id']
convert_to_float32 = ['retweet_count', 'favorite_count']
convert_to_int8 = ['target']

for item in convert_to_float64:
    df_bottweets[item] = pd.to_numeric(df_bottweets[item]).astype(np.float64)
    
for item in convert_to_float32:
    df_bottweets[item] = pd.to_numeric(df_bottweets[item]).astype(np.float32)
    
for item in convert_to_int8:
    df_bottweets[item] = df_bottweets[item].astype(np.int8)
    
#Drop a high memory usage column
### Add back in once we can convert to a datetime object (surely these take less memory than text ...)
df_bottweets = df_bottweets.drop(columns=['created_at'])

#Re-check object types and memory
df_bottweets.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203482 entries, 0 to 203481
Data columns (total 11 columns):
tweet_id                 203482 non-null int64
text                     203482 non-null object
user_id                  203482 non-null int64
in_reply_to_status_id    203482 non-null float64
retweeted_status_id      203482 non-null float64
retweet_count            203482 non-null float32
favorite_count           203482 non-null float32
hashtags                 203482 non-null object
expanded_urls            203482 non-null object
mentions                 203482 non-null object
target                   203482 non-null int8
dtypes: float32(2), float64(2), int64(2), int8(1), object(4)
memory usage: 92.2 MB


In [13]:
df_bottweets.describe()

Unnamed: 0,tweet_id,user_id,in_reply_to_status_id,retweeted_status_id,retweet_count,favorite_count,target
count,203482.0,203482.0,203482.0,203482.0,203482.0,203482.0,203482.0
mean,7.647237e+17,1.347937e+16,2120555000000000.0,1.521647e+17,11.3156,10.131908,1.0
std,1.082553e+17,9.97375e+16,4.041618e+16,3.094346e+17,156.524471,145.163499,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,7.636466e+17,1660771000.0,0.0,0.0,0.0,0.0,1.0
50%,7.884667e+17,1768260000.0,0.0,0.0,0.0,0.0,1.0
75%,8.148155e+17,2589354000.0,0.0,0.0,0.0,0.0,1.0
max,9.12604e+17,7.892661e+17,8.00997e+17,8.927026e+17,20494.0,26655.0,1.0


### Convert Hashtag, URL, and Mention Strings to Counts

The Russian tweets contained the actual hashtags, URLs, and mentions, where as the genuine tweets only contained counts of these items.  Beacuse we can't source these items from the genuine tweets, we'll convert the Russian tweets to counts, to match the information from the genuine tweets.

In [14]:
#Convert strings that look like lists to actual lists

#Syntax to convert a single list:
#x = ast.literal_eval(c['expanded_urls'][4])

df_bottweets['hashtags'] = df_bottweets['hashtags'].apply(ast.literal_eval).apply(len).astype(np.float32)
df_bottweets['expanded_urls'] = df_bottweets['expanded_urls'].apply(ast.literal_eval).apply(len).astype(np.float32)
df_bottweets['mentions'] = df_bottweets['mentions'].apply(ast.literal_eval).apply(len).astype(np.float64)

df_bottweets.rename(columns={'hashtags': 'num_hashtags', 
                             'expanded_urls': 'num_urls', 
                             'mentions': 'num_mentions'}, 
                    inplace=True)

### CONSIDER FOR LATER, SOME TWEETS HAVE SEVERAL LISTS WITH MULTIPLE ITEMS THAT ARE ALL ""
    #e.g., ["","","","","",""], which counts as 6, even though they're all blank.
    #Decision to leave this as is for now, but didn't want it to go unnoticed.
    
#Re-check object types and memory
df_bottweets.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203482 entries, 0 to 203481
Data columns (total 11 columns):
tweet_id                 203482 non-null int64
text                     203482 non-null object
user_id                  203482 non-null int64
in_reply_to_status_id    203482 non-null float64
retweeted_status_id      203482 non-null float64
retweet_count            203482 non-null float32
favorite_count           203482 non-null float32
num_hashtags             203482 non-null float32
num_urls                 203482 non-null float32
num_mentions             203482 non-null float64
target                   203482 non-null int8
dtypes: float32(4), float64(3), int64(2), int8(1), object(1)
memory usage: 56.4 MB


In [15]:
#Compare to gentweets.  They should be the same.
df_gentweets.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2799999 entries, 0 to 2799998
Data columns (total 11 columns):
tweet_id                 int64
text                     object
user_id                  float64
in_reply_to_status_id    float64
retweeted_status_id      float64
retweet_count            float32
favorite_count           float32
num_hashtags             float32
num_urls                 float32
num_mentions             float64
target                   int8
dtypes: float32(4), float64(4), int64(1), int8(1), object(1)
memory usage: 539.6 MB


## Get Genuine Users

In [16]:
#Get Genuine Users
time_start = timeit.default_timer()

query="""
SELECT id AS user_id,
    statuses_count,
    followers_count,
    friends_count,
    favourites_count,
    listed_count,
    lang

FROM `genuine_accounts.users`

"""

df_genusers = gbq_execute(query)

#Timing
time_end = timeit.default_timer()
print('Time to run query: {:.0f} seconds'.format(time_end - time_start))

df_genusers

Time to run query: 3 seconds


Unnamed: 0,user_id,statuses_count,followers_count,friends_count,favourites_count,listed_count,lang
0,2482378631,1041,78,204,125,0,ar
1,2884619608,652,21,37,599,0,da
2,539977804,48011,20004,2301,15958,315,da
3,324351388,476,18,235,439,0,de
4,2595828119,20326,199,67,12821,1,de
5,152742559,26220,162,258,785,3,de
6,151855545,3236,29,16,188,5,de
7,2973161031,1486,1391,1473,4817,6,de
8,137666026,15001,500,220,27839,24,de
9,1492569428,10436,4217,4200,3501,9,el


In [17]:
#Inspect column types and memory usage
df_genusers.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3474 entries, 0 to 3473
Data columns (total 7 columns):
user_id             3474 non-null int64
statuses_count      3474 non-null int64
followers_count     3474 non-null int64
friends_count       3474 non-null int64
favourites_count    3474 non-null int64
listed_count        3474 non-null int64
lang                3474 non-null object
dtypes: int64(6), object(1)
memory usage: 363.3 KB


## Get Bot Users

In [18]:
#Get Bot Users
time_start = timeit.default_timer()

query="""
SELECT id AS user_id,
    statuses_count,
    followers_count,
    friends_count,
    favourites_count,
    listed_count,
    lang

FROM `bots.users`

"""
df_botusers = gbq_execute(query)

#Timing
time_end = timeit.default_timer()
print('Time to run query: {:.0f} seconds'.format(time_end - time_start))

df_botusers

Time to run query: 3 seconds


Unnamed: 0,user_id,statuses_count,followers_count,friends_count,favourites_count,listed_count,lang
0,1681425248,,,,,,
1,4508630900,,,,,,
2,4332740714,,,,,,
3,4224729994,,,,,,
4,4224912857,,,,,,
5,4272870988,,,,,,
6,4437233895,,,,,,
7,4840551713,,,,,,
8,737420000043139072,,,,,,
9,749931338573250560,15,8,100,11,0,de


In [19]:
#Inspect column types and memory usage
df_botusers.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 7 columns):
user_id             392 non-null int64
statuses_count      383 non-null object
followers_count     383 non-null object
friends_count       383 non-null object
favourites_count    383 non-null object
listed_count        383 non-null object
lang                392 non-null object
dtypes: int64(1), object(6)
memory usage: 93.9 KB


In [20]:
#Change some nulls to 0 (to match genuine users format)
fillna_cols = ['statuses_count', 'followers_count', 'friends_count', 'favourites_count', 'listed_count']

for col in fillna_cols:
    df_botusers[col].fillna(0, inplace=True)

df_botusers

Unnamed: 0,user_id,statuses_count,followers_count,friends_count,favourites_count,listed_count,lang
0,1681425248,0,0,0,0,0,
1,4508630900,0,0,0,0,0,
2,4332740714,0,0,0,0,0,
3,4224729994,0,0,0,0,0,
4,4224912857,0,0,0,0,0,
5,4272870988,0,0,0,0,0,
6,4437233895,0,0,0,0,0,
7,4840551713,0,0,0,0,0,
8,737420000043139072,0,0,0,0,0,
9,749931338573250560,15,8,100,11,0,de


In [21]:
#Inspect column types and memory usage
df_botusers.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 7 columns):
user_id             392 non-null int64
statuses_count      392 non-null int64
followers_count     392 non-null int64
friends_count       392 non-null int64
favourites_count    392 non-null int64
listed_count        392 non-null int64
lang                392 non-null object
dtypes: int64(6), object(1)
memory usage: 41.0 KB


In [22]:
#Stack them
df_allusers = df_genusers.append(df_botusers, ignore_index=True)
    #ignore_index=True gives us new index values so that every row has a unique index
    #Otherwise there will be duplicate indices

df_allusers

Unnamed: 0,user_id,statuses_count,followers_count,friends_count,favourites_count,listed_count,lang
0,2482378631,1041,78,204,125,0,ar
1,2884619608,652,21,37,599,0,da
2,539977804,48011,20004,2301,15958,315,da
3,324351388,476,18,235,439,0,de
4,2595828119,20326,199,67,12821,1,de
5,152742559,26220,162,258,785,3,de
6,151855545,3236,29,16,188,5,de
7,2973161031,1486,1391,1473,4817,6,de
8,137666026,15001,500,220,27839,24,de
9,1492569428,10436,4217,4200,3501,9,el


In [23]:
#Some quick stats
df_allusers.describe()

Unnamed: 0,user_id,statuses_count,followers_count,friends_count,favourites_count,listed_count
count,3866.0,3866.0,3866.0,3866.0,3866.0,3866.0
mean,7097849000000000.0,15697.529488,1721.764097,800.694516,4369.713916,21.308329
std,7.223887e+16,29462.143056,16676.414319,1947.97603,10994.194475,151.462719
min,678033.0,0.0,0.0,0.0,0.0,0.0
25%,175894800.0,1523.0,152.0,174.0,255.25,0.0
50%,574546700.0,5366.5,371.5,338.0,1278.0,3.0
75%,2183229000.0,17735.75,942.75,737.75,4276.25,10.0
max,7.892661e+17,399555.0,986837.0,46310.0,313954.0,6166.0


In [24]:
df_allusers.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3866 entries, 0 to 3865
Data columns (total 7 columns):
user_id             3866 non-null int64
statuses_count      3866 non-null int64
followers_count     3866 non-null int64
friends_count       3866 non-null int64
favourites_count    3866 non-null int64
listed_count        3866 non-null int64
lang                3866 non-null object
dtypes: int64(6), object(1)
memory usage: 404.3 KB


In [25]:
#We can convert several of the int64 columns to int32
convert_to_int32 = ['statuses_count', 'followers_count', 'friends_count', 'favourites_count', 'listed_count']
convert_to_float64 = ['user_id']  #Changing to float64 to match tweets; might help wtih join

for item in convert_to_int32:
    df_allusers[item] = pd.to_numeric(df_allusers[item]).astype(np.int32)
    
# for item in convert_to_float64:
#     df_allusers[item] = pd.to_numeric(df_allusers[item]).astype(np.float64)

    
#Re-check object types and memory
df_allusers.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3866 entries, 0 to 3865
Data columns (total 7 columns):
user_id             3866 non-null int64
statuses_count      3866 non-null int32
followers_count     3866 non-null int32
friends_count       3866 non-null int32
favourites_count    3866 non-null int32
listed_count        3866 non-null int32
lang                3866 non-null object
dtypes: int32(5), int64(1), object(1)
memory usage: 328.8 KB


## Quick Comparisons

In [26]:
print(df_gentweets.shape)
print(df_bottweets.shape)
print(df_allusers.shape)

(2799999, 11)
(203482, 11)
(3866, 7)


## Stack the Tweet Dataframes

Now that we have our two tweet dataframes, we can stack them into a single dataframe to use the rest of the way.

In [27]:
#Big stacking operation!
df_alltweets = df_gentweets.append(df_bottweets, ignore_index=True)
    #ignore_index=True gives us new index values so that every row has a unique index
    #Otherwise there will be duplicate indices

df_alltweets

### SEE IF WE CAN REMOVE df_gentweets AND df_bottweets FROM MEMORY

Unnamed: 0,tweet_id,text,user_id,in_reply_to_status_id,retweeted_status_id,retweet_count,favorite_count,num_hashtags,num_urls,num_mentions,target
0,590401415559917568,#bouncy with the #bubbles #godfreyspubcrawl #c...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,4.0,1.0,0.0,0
1,589654081049731073,#RIP KJ. My life would be a whole lot differen...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,1.0,0.0,0.0,0
2,588877648891285505,#tbt 2001 ish #vegas http://t.co/S5ONJ2DkVa,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,2.0,0.0,0.0,0
3,580866760070549504,#TRUTH CHOOSE #Happy. It's awesome! Abi #sorry...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,6.0,1.0,0.0,0
4,581642720592023552,I'll be in #NorthBeach tomorrow night. Messag...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,1.0,3.0,0.0,0.0,0
5,582747063667933184,Apparently shooting a bb gun in the dark with...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,1.0,1.0,0.0,0
6,582359963995078656,Auntie Florence #92 #sassy in #leopard #sanfr...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,5.0,1.0,0.0,0
7,581985393395945472,Here I am. So happy http://t.co/FPhf0nllBW,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0
8,575395864060940290,I want ice cream #planetfitness #blah #whiner ...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,5.0,0.0,0.0,0
9,570847926450307072,I could do this every damn day. #hike #presidi...,2.167197e+07,0.000000e+00,0.000000e+00,0.0,0.0,6.0,1.0,0.0,0


In [28]:
print(df_alltweets.shape)
df_alltweets.info(memory_usage='deep', verbose=True)
df_allusers.info(memory_usage='deep', verbose=True)

(3003481, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3003481 entries, 0 to 3003480
Data columns (total 11 columns):
tweet_id                 int64
text                     object
user_id                  float64
in_reply_to_status_id    float64
retweeted_status_id      float64
retweet_count            float32
favorite_count           float32
num_hashtags             float32
num_urls                 float32
num_mentions             float64
target                   int8
dtypes: float32(4), float64(4), int64(1), int8(1), object(1)
memory usage: 596.0 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3866 entries, 0 to 3865
Data columns (total 7 columns):
user_id             3866 non-null int64
statuses_count      3866 non-null int32
followers_count     3866 non-null int32
friends_count       3866 non-null int32
favourites_count    3866 non-null int32
listed_count        3866 non-null int32
lang                3866 non-null object
dtypes: int32(5), int64(1), object(1)
memory 

## Join User Data Into Tweet Dataframe

In [29]:
#Let's see if we can skip this ... nope
df_alltweets[['user_id']] = df_alltweets[['user_id']].astype(np.int64)
df_allusers[['user_id']] = df_allusers[['user_id']].astype(np.int64)
# df_alltweets.info(memory_usage='deep', verbose=True)
# df_allusers.info(memory_usage='deep', verbose=True)

In [30]:
#Join df_allusers data into df_alltweets
#Join on user_id
#Left join (so as to keep everything in df_alltweets, even if no match in df_allusers)

#df_alltweets_users = df_alltweets.join(df_allusers, on='user_id', how='left', lsuffix='', rsuffix='_rsuf')
df_alltweets_users = df_alltweets.merge(df_allusers, on='user_id', how='left', suffixes=['', '_rsuf'])
df_alltweets_users

Unnamed: 0,tweet_id,text,user_id,in_reply_to_status_id,retweeted_status_id,retweet_count,favorite_count,num_hashtags,num_urls,num_mentions,target,statuses_count,followers_count,friends_count,favourites_count,listed_count,lang
0,590401415559917568,#bouncy with the #bubbles #godfreyspubcrawl #c...,21671968,0.000000e+00,0.000000e+00,0.0,0.0,4.0,1.0,0.0,0,1148.0,82.0,157.0,38.0,8.0,en
1,589654081049731073,#RIP KJ. My life would be a whole lot differen...,21671968,0.000000e+00,0.000000e+00,0.0,0.0,1.0,0.0,0.0,0,1148.0,82.0,157.0,38.0,8.0,en
2,588877648891285505,#tbt 2001 ish #vegas http://t.co/S5ONJ2DkVa,21671968,0.000000e+00,0.000000e+00,0.0,0.0,2.0,0.0,0.0,0,1148.0,82.0,157.0,38.0,8.0,en
3,580866760070549504,#TRUTH CHOOSE #Happy. It's awesome! Abi #sorry...,21671968,0.000000e+00,0.000000e+00,0.0,0.0,6.0,1.0,0.0,0,1148.0,82.0,157.0,38.0,8.0,en
4,581642720592023552,I'll be in #NorthBeach tomorrow night. Messag...,21671968,0.000000e+00,0.000000e+00,0.0,1.0,3.0,0.0,0.0,0,1148.0,82.0,157.0,38.0,8.0,en
5,582747063667933184,Apparently shooting a bb gun in the dark with...,21671968,0.000000e+00,0.000000e+00,0.0,0.0,1.0,1.0,0.0,0,1148.0,82.0,157.0,38.0,8.0,en
6,582359963995078656,Auntie Florence #92 #sassy in #leopard #sanfr...,21671968,0.000000e+00,0.000000e+00,0.0,0.0,5.0,1.0,0.0,0,1148.0,82.0,157.0,38.0,8.0,en
7,581985393395945472,Here I am. So happy http://t.co/FPhf0nllBW,21671968,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0,1148.0,82.0,157.0,38.0,8.0,en
8,575395864060940290,I want ice cream #planetfitness #blah #whiner ...,21671968,0.000000e+00,0.000000e+00,0.0,0.0,5.0,0.0,0.0,0,1148.0,82.0,157.0,38.0,8.0,en
9,570847926450307072,I could do this every damn day. #hike #presidi...,21671968,0.000000e+00,0.000000e+00,0.0,0.0,6.0,1.0,0.0,0,1148.0,82.0,157.0,38.0,8.0,en


In [31]:
df_alltweets_users.describe()

Unnamed: 0,tweet_id,user_id,in_reply_to_status_id,retweeted_status_id,retweet_count,favorite_count,num_hashtags,num_urls,num_mentions,target,statuses_count,followers_count,friends_count,favourites_count,listed_count
count,3003481.0,3003481.0,3003481.0,3003481.0,3003481.0,3003481.0,3003481.0,3003481.0,3003481.0,3003481.0,2994267.0,2994267.0,2994267.0,2994267.0,2994267.0
mean,5.251282e+17,913210700000000.0,1.266583e+17,1.371773e+17,790.0753,1.121363,0.2713012,0.1567465,0.7373344,0.06774872,22803.63,2803.032,883.6345,5363.716,39.41101
std,1.276513e+17,2.618029e+16,2.274147e+17,2.401148e+17,16090.01,38.30617,0.7755918,0.4246305,0.875264,0.2513143,35215.88,32735.04,1823.085,10786.24,209.1005
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.856163e+17,149245800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4159.0,258.0,235.0,630.0,1.0
50%,5.567537e+17,488351800.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10577.0,567.0,454.0,2082.0,5.0
75%,5.835401e+17,1689431000.0,0.0,2.506151e+17,1.0,0.0,0.0,0.0,1.0,0.0,25701.0,1330.0,957.0,5849.0,24.0
max,9.12604e+17,7.892661e+17,8.00997e+17,8.927026e+17,3350111.0,26655.0,45.0,45.0,19.0,1.0,333504.0,986837.0,25600.0,185467.0,4840.0


In [32]:
df_alltweets_users.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3003481 entries, 0 to 3003480
Data columns (total 17 columns):
tweet_id                 int64
text                     object
user_id                  int64
in_reply_to_status_id    float64
retweeted_status_id      float64
retweet_count            float32
favorite_count           float32
num_hashtags             float32
num_urls                 float32
num_mentions             float64
target                   int8
statuses_count           float64
followers_count          float64
friends_count            float64
favourites_count         float64
listed_count             float64
lang                     object
dtypes: float32(4), float64(8), int64(2), int8(1), object(2)
memory usage: 902.3 MB


## More Cleanup

In [33]:
#Count Nulls
df_alltweets_users.isna().sum()

tweet_id                     0
text                     12472
user_id                      0
in_reply_to_status_id        0
retweeted_status_id          0
retweet_count                0
favorite_count               0
num_hashtags                 0
num_urls                     0
num_mentions                 0
target                       0
statuses_count            9214
followers_count           9214
friends_count             9214
favourites_count          9214
listed_count              9214
lang                      9214
dtype: int64

In [34]:
#Eliminate null texts (if there's no text, why bother classifying it)
df_alltweets_users.dropna(subset=['text'], inplace=True)
df_alltweets_users.isna().sum()

tweet_id                    0
text                        0
user_id                     0
in_reply_to_status_id       0
retweeted_status_id         0
retweet_count               0
favorite_count              0
num_hashtags                0
num_urls                    0
num_mentions                0
target                      0
statuses_count           9214
followers_count          9214
friends_count            9214
favourites_count         9214
listed_count             9214
lang                     9214
dtype: int64

In [35]:
def count_nulls_by_target():
    print('GENUINE TWEETS')
    print(df_alltweets_users[df_alltweets_users['target'] == 0].isna().sum())

    print('\nRUSSIAN TWEETS')
    print(df_alltweets_users[df_alltweets_users['target'] == 1].isna().sum())
    
count_nulls_by_target()

GENUINE TWEETS
tweet_id                 0
text                     0
user_id                  0
in_reply_to_status_id    0
retweeted_status_id      0
retweet_count            0
favorite_count           0
num_hashtags             0
num_urls                 0
num_mentions             0
target                   0
statuses_count           0
followers_count          0
friends_count            0
favourites_count         0
listed_count             0
lang                     0
dtype: int64

RUSSIAN TWEETS
tweet_id                    0
text                        0
user_id                     0
in_reply_to_status_id       0
retweeted_status_id         0
retweet_count               0
favorite_count              0
num_hashtags                0
num_urls                    0
num_mentions                0
target                      0
statuses_count           9214
followers_count          9214
friends_count            9214
favourites_count         9214
listed_count             9214
lang             

In [36]:
#Change some nulls (to match genuine tweets format)

#Convert null counts to 0
fillna_cols = ['statuses_count', 'followers_count', 'friends_count', 
               'favourites_count', 'listed_count']
for col in fillna_cols:
    df_alltweets_users[col].fillna(0, inplace=True)

#Convert null lang to 'NotListed'
fillna_cols = ['lang']
for col in fillna_cols:
    df_alltweets_users[col].fillna('LanguageNotListed', inplace=True)
    
count_nulls_by_target()

GENUINE TWEETS
tweet_id                 0
text                     0
user_id                  0
in_reply_to_status_id    0
retweeted_status_id      0
retweet_count            0
favorite_count           0
num_hashtags             0
num_urls                 0
num_mentions             0
target                   0
statuses_count           0
followers_count          0
friends_count            0
favourites_count         0
listed_count             0
lang                     0
dtype: int64

RUSSIAN TWEETS
tweet_id                 0
text                     0
user_id                  0
in_reply_to_status_id    0
retweeted_status_id      0
retweet_count            0
favorite_count           0
num_hashtags             0
num_urls                 0
num_mentions             0
target                   0
statuses_count           0
followers_count          0
friends_count            0
favourites_count         0
listed_count             0
lang                     0
dtype: int64


In [37]:
df_alltweets_users.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2991009 entries, 0 to 3003480
Data columns (total 17 columns):
tweet_id                 int64
text                     object
user_id                  int64
in_reply_to_status_id    float64
retweeted_status_id      float64
retweet_count            float32
favorite_count           float32
num_hashtags             float32
num_urls                 float32
num_mentions             float64
target                   int8
statuses_count           float64
followers_count          float64
friends_count            float64
favourites_count         float64
listed_count             float64
lang                     object
dtypes: float32(4), float64(8), int64(2), int8(1), object(2)
memory usage: 900.5 MB


# Pickle!

We'll save this file to disk, and reload it in our next data pre processing notebook.  Doing so will improve processing time for the next bit of preprocessing (and it might help clear any unwanted memory objects).

In [38]:
#Write to a pkl file and beginning a new notebook
df_alltweets_users.to_pickle('data/df_alltweets.pkl')

#Interesting, ~600MB memory usage in Jupyter but only 400MB on disk (with only tweet data, no user data joined).
#Interesting, ~1.1GB memory usage in Jupyter but only 550MB on disk (with user data joined).
    #This could indicate compression when writing to disk, because of the large number of similar user entries.

In [39]:
!ls -lah

total 2.8M
drwxr-xr-x  4 brandon_cummings brandon_cummings 4.0K Aug 10 12:56 .
drwxr-xr-x 18 brandon_cummings brandon_cummings 4.0K Jul 23 00:30 ..
-rw-r--r--  1 brandon_cummings brandon_cummings  12K Jul 23 00:13 BC_preprocessing.ipynb
-rw-r--r--  1 brandon_cummings brandon_cummings  13K Jul 23 00:13 BC_w266Proj_LSTM.ipynb
-rw-r--r--  1 brandon_cummings brandon_cummings 9.4K Aug  1 11:08 big-query-example.ipynb
-rw-r--r--  1 brandon_cummings brandon_cummings 3.0K Jul 23 00:13 Brandon-LSTM.ipynb
-rw-r--r--  1 brandon_cummings brandon_cummings 149K Jul 25 19:42 CK_w266Proj_LSTM.ipynb
-rw-r--r--  1 brandon_cummings brandon_cummings 234K Aug  5 14:20 CK_w266Proj_LSTM_Plus_Meta.ipynb
-rw-r--r--  1 brandon_cummings brandon_cummings  54K Aug  5 13:00 CK_w266Proj_PreProcess_Part1.ipynb
-rw-r--r--  1 brandon_cummings brandon_cummings 234K Aug  3 16:24 CK_w266Proj_PreProcess_Part2.ipynb
drwxr-xr-x  3 brandon_cummings brandon_cummings 4.0K Aug  9 21:43 data
drwxr-xr-x  2 brandon_cumm

In [40]:
!ls ./data -lah

total 3.0G
drwxr-xr-x 3 brandon_cummings brandon_cummings 4.0K Aug  9 21:43 .
drwxr-xr-x 4 brandon_cummings brandon_cummings 4.0K Aug 10 12:56 ..
-rw-r--r-- 1 brandon_cummings brandon_cummings 229M Aug  9 21:43 arr_metadata.npy
-rw-r--r-- 1 brandon_cummings brandon_cummings  92M Aug  9 21:43 arr_otherdata.npy
-rw-r--r-- 1 brandon_cummings brandon_cummings 2.9M Aug  9 21:43 arr_targetdata.npy
-rw-r--r-- 1 brandon_cummings brandon_cummings 537M Aug 10 12:57 df_alltweets.pkl
-rw-r--r-- 1 brandon_cummings brandon_cummings 1.3G Aug  3 16:23 df_mini_alltweets_with_embed.pkl
drwxr-xr-x 2 brandon_cummings brandon_cummings 4.0K Jul 19 12:42 GLoVE
-rw-r--r-- 1 brandon_cummings brandon_cummings  528 Aug  3 16:23 GloVe_Unknown_50.npy
-rw-r--r-- 1 brandon_cummings brandon_cummings 243M Aug  9 21:43 text.pkl
-rw-r--r-- 1 brandon_cummings brandon_cummings 575M Aug  9 21:43 token.pkl
