# W266 Russian Troll Detection Project

# Data Pre-Processing Notebook, Part 2 of 2

This notebook takes our initial data and performs all the preprocessing steps required for it to be trained in an ML algorithm (e.g., an LSTM neural network).

In [1]:
#Import stuff
import numpy as np
import pandas as pd
import sys
import regex as re
from csv import reader
from collections import defaultdict
import itertools
import pickle

In [2]:
#Set Global Options
pd.set_option('display.max_columns', 500)

# Load Data

Here we'll load the data that we saved as the final output of Pre-Processing notebook #1.

In [3]:
#Read from pkl file
df_alltweets = pd.read_pickle('data/df_alltweets.pkl')
df_alltweets.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2991009 entries, 0 to 3003480
Data columns (total 17 columns):
tweet_id                 int64
text                     object
user_id                  int64
in_reply_to_status_id    float64
retweeted_status_id      float64
retweet_count            float32
favorite_count           float32
num_hashtags             float32
num_urls                 float32
num_mentions             float64
target                   int8
statuses_count           float64
followers_count          float64
friends_count            float64
favourites_count         float64
listed_count             float64
lang                     object
dtypes: float32(4), float64(8), int64(2), int8(1), object(2)
memory usage: 900.5 MB


In [4]:
#Counts
def count_tweets_by_target(df):
    print('Genuine Tweets: {:,};  IRA Tweets: {:,}'.format(sum(df['target'] == 0),
                                                            sum(df['target'] == 1)))
    
count_tweets_by_target(df_alltweets)

Genuine Tweets: 2,787,527;  IRA Tweets: 203,482


In [5]:
#Drop for now, maybe add back later
df_alltweets = df_alltweets.drop(columns=['lang'])
df_alltweets.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2991009 entries, 0 to 3003480
Data columns (total 16 columns):
tweet_id                 int64
text                     object
user_id                  int64
in_reply_to_status_id    float64
retweeted_status_id      float64
retweet_count            float32
favorite_count           float32
num_hashtags             float32
num_urls                 float32
num_mentions             float64
target                   int8
statuses_count           float64
followers_count          float64
friends_count            float64
favourites_count         float64
listed_count             float64
dtypes: float32(4), float64(8), int64(2), int8(1), object(1)
memory usage: 731.9 MB


In [7]:
#Get out of pandas

#Step 1. Text id to list
text_list = list(df_alltweets['text'])

#Step 2. Metadata columns to array (we don't yet have token lengh ... need to add later)
meta_cols = ['retweet_count','favorite_count','num_hashtags','num_urls','num_mentions',
             'statuses_count','followers_count','friends_count','favourites_count','listed_count']
arr_metadata = np.array(df_alltweets[meta_cols])

#Step 3. Metadata columns to array (we don't yet have token lengh ... need to add later)
target_cols = ['target']
arr_targetdata = np.array(df_alltweets[target_cols])

#Step 4. Save other data (might use it later)
other_data = ['tweet_id','user_id','in_reply_to_status_id','retweeted_status_id']
arr_otherdata = np.array(df_alltweets[other_data])

#Delete pandas dataframe
del df_alltweets

# Tokenize and Canonicalize Tweet Text

In [8]:
# Functions to Tokenize and Canonicalize Tweet Text

"""
Source:  https://gist.github.com/tokestermw/cb87a97113da12acb388

preprocess-twitter.py
python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"
Script for preprocessing tweets by Romain Paulus
with small modifications by Jeffrey Pennington
with translation to Python by Motoki Wu
Translation of Ruby script to create features for GloVe vectors for Twitter data.
http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
"""

FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = " {} ".format(hashtag_body.lower())
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result

def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"


def tokenize(text):
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

    ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
    # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
    text = re_sub(r"([A-Z]){2,}", allcaps)

    ### ORIGINAL ###
    #return text.lower().split()  #CK added the .split() on the end.
    #return text.lower()
    
    ### NEW ###
    output = text.lower().split()  #CK added the .split() on the end.
    #return list(itertools.chain(*[re.split('(\W)', x) for x in output]))  #Fails because it splits < and > in tags
    output = list(itertools.chain(*[re.split(r'([^\w<>])', x) for x in output]))  #Splits punctuation, keeping < and >
    return [item for item in output if item != '']  #Removes blank strings from list


In [9]:
#Test our tokenizer
a = 'abc d.ef #blah <new> word: user: <user>:'
c = tokenize(a)
c


['abc',
 'd',
 '.',
 'ef',
 '<hashtag>',
 'blah',
 '<new>',
 'wor<smile>',
 'user',
 ':',
 '<user>',
 ':']

In [10]:
#Make new column with tokenized, canonicalized text
token_list = list(map(tokenize, text_list))

print('Text Size: {:,} bytes'.format(sys.getsizeof(text_list)))
print('Token Size: {:,} bytes'.format(sys.getsizeof(token_list)))

Text Size: 26,919,192 bytes
Token Size: 26,791,768 bytes


# Pickle / Save Time!

In [19]:
print(type(text_list))
print(type(token_list))
#print(type(embed_arr))
print(type(arr_metadata))
print(type(arr_targetdata))
print(type(arr_otherdata))

#Save Orig Text
with open('data/text.pkl', 'wb') as fp:
    pickle.dump(text_list, fp)

#Save Tokens    
with open('data/token.pkl', 'wb') as fp:
    pickle.dump(token_list, fp)

# #Save Embeddings
# np.save('data/embed_arr.npy', embed_arr)

#Save Metadata
np.save('data/arr_metadata.npy', arr_metadata)

#Save Targets
np.save('data/arr_targetdata.npy', arr_targetdata)

#Save Other Data
np.save('data/arr_otherdata.npy', arr_otherdata)

<class 'list'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [22]:
#Check to see that our .pkl file is there (and note its size)
!ls ./data -lah

total 3.0G
drwxr-xr-x 3 brandon_cummings brandon_cummings 4.0K Aug  9 21:43 .
drwxr-xr-x 4 brandon_cummings brandon_cummings 4.0K Aug 10 13:22 ..
-rw-r--r-- 1 brandon_cummings brandon_cummings 229M Aug 10 13:23 arr_metadata.npy
-rw-r--r-- 1 brandon_cummings brandon_cummings  92M Aug 10 13:23 arr_otherdata.npy
-rw-r--r-- 1 brandon_cummings brandon_cummings 2.9M Aug 10 13:23 arr_targetdata.npy
-rw-r--r-- 1 brandon_cummings brandon_cummings 537M Aug 10 12:57 df_alltweets.pkl
-rw-r--r-- 1 brandon_cummings brandon_cummings 1.3G Aug 10 13:14 df_mini_alltweets_with_embed.pkl
drwxr-xr-x 2 brandon_cummings brandon_cummings 4.0K Jul 19 12:42 GLoVE
-rw-r--r-- 1 brandon_cummings brandon_cummings  528 Aug 10 13:14 GloVe_Unknown_50.npy
-rw-r--r-- 1 brandon_cummings brandon_cummings 243M Aug 10 13:22 text.pkl
-rw-r--r-- 1 brandon_cummings brandon_cummings 575M Aug 10 13:23 token.pkl
