# W266 Russian Troll Detection Project

# Data Pre-Processing Notebook, Part 2 of 2

This notebook takes our initial data and performs all the preprocessing steps required for it to be trained in an ML algorithm (e.g., an LSTM neural network).

In [1]:
#Import stuff
import numpy as np
import pandas as pd
import sys
import regex as re
from csv import reader
from collections import defaultdict
import itertools

In [2]:
#Set Global Options
pd.set_option('display.max_columns', 500)

# Load Data

Here we'll load the data that we saved as the final output of Pre-Processing notebook #1.

In [3]:
#Read from pkl file
df_alltweets = pd.read_pickle('data/df_alltweets.pkl')
df_alltweets.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2991009 entries, 0 to 3003480
Data columns (total 17 columns):
tweet_id                 int64
text                     object
user_id                  int64
in_reply_to_status_id    float64
retweeted_status_id      float64
retweet_count            float32
favorite_count           float32
num_hashtags             float32
num_urls                 float32
num_mentions             float64
target                   int8
statuses_count           float64
followers_count          float64
friends_count            float64
favourites_count         float64
listed_count             float64
lang                     object
dtypes: float32(4), float64(8), int64(2), int8(1), object(2)
memory usage: 900.5 MB


In [4]:
#Counts
def count_tweets_by_target(df):
    print('Genuine Tweets: {:,}\nIRA Tweets: {:,}'.format(sum(df['target'] == 0),
                                                            sum(df['target'] == 1)))
    
count_tweets_by_target(df_alltweets)

Genuine Tweets: 2,787,527
IRA Tweets: 203,482


In [5]:
#Drop for now, maybe add back later
df_alltweets = df_alltweets.drop(columns=['lang'])
df_alltweets.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2991009 entries, 0 to 3003480
Data columns (total 16 columns):
tweet_id                 int64
text                     object
user_id                  int64
in_reply_to_status_id    float64
retweeted_status_id      float64
retweet_count            float32
favorite_count           float32
num_hashtags             float32
num_urls                 float32
num_mentions             float64
target                   int8
statuses_count           float64
followers_count          float64
friends_count            float64
favourites_count         float64
listed_count             float64
dtypes: float32(4), float64(8), int64(2), int8(1), object(1)
memory usage: 731.9 MB


In [6]:
#Reduce size for initial model building
df_mini_alltweets = df_alltweets[df_alltweets['target'] == 1]. \
    append(df_alltweets[df_alltweets['target'] == 0]. \
    sample(frac=0.05, random_state=77), ignore_index=True)

count_tweets_by_target(df_mini_alltweets)
    
df_mini_alltweets.info(memory_usage='deep', verbose=True)

Genuine Tweets: 139,376
IRA Tweets: 203,482
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 342858 entries, 0 to 342857
Data columns (total 16 columns):
tweet_id                 342858 non-null int64
text                     342858 non-null object
user_id                  342858 non-null int64
in_reply_to_status_id    342858 non-null float64
retweeted_status_id      342858 non-null float64
retweet_count            342858 non-null float32
favorite_count           342858 non-null float32
num_hashtags             342858 non-null float32
num_urls                 342858 non-null float32
num_mentions             342858 non-null float64
target                   342858 non-null int8
statuses_count           342858 non-null float64
followers_count          342858 non-null float64
friends_count            342858 non-null float64
favourites_count         342858 non-null float64
listed_count             342858 non-null float64
dtypes: float32(4), float64(8), int64(2), int8(1), object(1)
memory u

# Tokenize and Canonicalize Tweet Text

In [7]:
# Functions to Tokenize and Canonicalize Tweet Text

"""
Source:  https://gist.github.com/tokestermw/cb87a97113da12acb388

preprocess-twitter.py
python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"
Script for preprocessing tweets by Romain Paulus
with small modifications by Jeffrey Pennington
with translation to Python by Motoki Wu
Translation of Ruby script to create features for GloVe vectors for Twitter data.
http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
"""

FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = " {} ".format(hashtag_body.lower())
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result

def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"


def tokenize(text):
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

    ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
    # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
    text = re_sub(r"([A-Z]){2,}", allcaps)

    ### ORIGINAL ###
    #return text.lower().split()  #CK added the .split() on the end.
    #return text.lower()
    
    ### NEW: FIXES TOKENIZER PUNCTUATION SPLITTING ERRORS ###
    output = text.lower().split()  #CK added the .split() on the end.
    output = list(itertools.chain(*[re.split(r'([^\w<>])', x) for x in output]))  #Splits punctuation, keeping < and >
    return [item for item in output if item != '']  #Removes blank strings from list

In [8]:
#Test tokenizer
tokenize('abc d.ef #blah <new> word: user: <user>:')

['abc',
 'd',
 '.',
 'ef',
 '<hashtag>',
 'blah',
 '<new>',
 'wor<smile>',
 'user',
 ':',
 '<user>',
 ':']

In [9]:
#Make new column with tokenized, canonicalized text
df_mini_alltweets['text_cantok'] = df_mini_alltweets['text'].apply(tokenize)

#Note, this roughly doubles (trippled it online) its size

In [10]:
#Check out the text_cantok column at the end
df_mini_alltweets

Unnamed: 0,tweet_id,text,user_id,in_reply_to_status_id,retweeted_status_id,retweet_count,favorite_count,num_hashtags,num_urls,num_mentions,target,statuses_count,followers_count,friends_count,favourites_count,listed_count,text_cantok
0,564839233957818369,"Either, we heal, now, as a team, or we will di...",0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[either, ,, we, heal, ,, now, ,, as, a, team, ..."
1,676409305944469504,RT @AmyMek: Shaker Aamer is not even a citizen...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[rt, <allcaps>, <user>, :, shaker, aamer, is, ..."
2,564822564556382208,https://t.co/5jtxjrQrKa just watch it),0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[<url>, just, watch, it, )]"
3,614985782739087360,"RT @_wintergirl93: Dude, seriously? Go away. h...",0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[rt, <allcaps>, <user>, :, dude, ,, seriously,..."
4,578247211655434240,What is he doing?? http://t.co/w19JHah4pV,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[what, is, he, doing, ?, <repeat>, <url>]"
5,689503880091021312,RT @BringTheFlag: Iran to Begin Construction o...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[rt, <allcaps>, <user>, :, iran, to, begin, co..."
6,564830562670501888,"We turn, not older with years, but newer every...",0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[we, turn, ,, not, older, with, years, ,, but,..."
7,564821530824015873,To be able to say how much you love is to love...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[to, be, able, to, say, how, much, you, love, ..."
8,673889392025600001,RT @keriqbaum: Mondays be like... https://t.co...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[rt, <allcaps>, <user>, :, mondays, be, like, ..."
9,554292771658268674,Literally why people voted for him http://t.co...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[literally, why, people, voted, for, him, <url>]"


In [11]:
def remove_tokens(text_cantok, tokens_to_remove):
    return [item for item in text_cantok if item not in tokens_to_remove]

In [12]:
#Set to True only to test model performance without these tokens; normal functionality is False
remove_common_tokens = False
token_list = ['<allcaps>', '<user>', ':', 'rt', '<url>', '<hashtag>', '…']
if remove_common_tokens:
    df_mini_alltweets['text_cantok'] = df_mini_alltweets['text_cantok'].apply(remove_tokens, args=([token_list]))    

In [13]:
df_mini_alltweets

Unnamed: 0,tweet_id,text,user_id,in_reply_to_status_id,retweeted_status_id,retweet_count,favorite_count,num_hashtags,num_urls,num_mentions,target,statuses_count,followers_count,friends_count,favourites_count,listed_count,text_cantok
0,564839233957818369,"Either, we heal, now, as a team, or we will di...",0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[either, ,, we, heal, ,, now, ,, as, a, team, ..."
1,676409305944469504,RT @AmyMek: Shaker Aamer is not even a citizen...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[rt, <allcaps>, <user>, :, shaker, aamer, is, ..."
2,564822564556382208,https://t.co/5jtxjrQrKa just watch it),0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[<url>, just, watch, it, )]"
3,614985782739087360,"RT @_wintergirl93: Dude, seriously? Go away. h...",0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[rt, <allcaps>, <user>, :, dude, ,, seriously,..."
4,578247211655434240,What is he doing?? http://t.co/w19JHah4pV,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[what, is, he, doing, ?, <repeat>, <url>]"
5,689503880091021312,RT @BringTheFlag: Iran to Begin Construction o...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[rt, <allcaps>, <user>, :, iran, to, begin, co..."
6,564830562670501888,"We turn, not older with years, but newer every...",0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[we, turn, ,, not, older, with, years, ,, but,..."
7,564821530824015873,To be able to say how much you love is to love...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[to, be, able, to, say, how, much, you, love, ..."
8,673889392025600001,RT @keriqbaum: Mondays be like... https://t.co...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[rt, <allcaps>, <user>, :, mondays, be, like, ..."
9,554292771658268674,Literally why people voted for him http://t.co...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[literally, why, people, voted, for, him, <url>]"


In [14]:
df_mini_alltweets.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 342858 entries, 0 to 342857
Data columns (total 17 columns):
tweet_id                 342858 non-null int64
text                     342858 non-null object
user_id                  342858 non-null int64
in_reply_to_status_id    342858 non-null float64
retweeted_status_id      342858 non-null float64
retweet_count            342858 non-null float32
favorite_count           342858 non-null float32
num_hashtags             342858 non-null float32
num_urls                 342858 non-null float32
num_mentions             342858 non-null float64
target                   342858 non-null int8
statuses_count           342858 non-null float64
followers_count          342858 non-null float64
friends_count            342858 non-null float64
favourites_count         342858 non-null float64
listed_count             342858 non-null float64
text_cantok              342858 non-null object
dtypes: float32(4), float64(8), int64(2), int8(1), object(2)
memo

# Load the GloVe model

In [15]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile, 'r', encoding='utf-8')
    model = {}
    status_every = 100000
    
    for i, line in enumerate(f):
        if i%status_every == 0:
            print('Processing line {:,}'.format(i))
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",'{:,}'.format(len(model))," words loaded!")
    return model

In [16]:
#Load the GloVE model into a defaultdict

glove_dir = './data/GLoVE/'
glove_filename = 'glove.twitter.27B.50d.txt'
glove_fullpath = glove_dir + glove_filename
glove_dd = loadGloveModel(glove_fullpath)

#Converts to a default dict
#Returns the embedding for 'unk' if unknown word found 
print('\nDict Size: {:,} bytes'.format(sys.getsizeof(glove_dd)))
glove_dd = defaultdict(lambda: glove_dd['unk'], glove_dd)
print('DefaultDict Size: {:,} bytes'.format(sys.getsizeof(glove_dd)))

#Note, defaultdict this uses 2x memory; not sure why or if it will matter

Loading Glove Model
Processing line 0
Processing line 100,000
Processing line 200,000
Processing line 300,000
Processing line 400,000
Processing line 500,000
Processing line 600,000
Processing line 700,000
Processing line 800,000
Processing line 900,000
Processing line 1,000,000
Processing line 1,100,000
Done. 1,193,514  words loaded!

Dict Size: 50,331,744 bytes
DefaultDict Size: 100,663,392 bytes


# Create Embeddings

In [17]:
def embed_text(cantok_tweet):
    return np.array([glove_dd[word] for word in cantok_tweet]).astype(np.float32)

In [18]:
#Quick test of our function above
embed_text(df_mini_alltweets['text_cantok'][5])

array([[ 6.8243e-01,  7.3589e-01,  1.7529e-02, -1.3763e-01,  3.6029e-01,
         5.1704e-01,  1.9540e-01,  7.5219e-01,  4.3029e-01,  1.0728e+00,
         5.7250e-02,  6.0043e-01, -2.4946e+00,  3.2148e-02, -4.4991e-01,
         9.7306e-03, -1.3653e-02, -5.1001e-01,  2.5079e-01, -3.4493e-01,
         3.3606e-01, -9.6185e-03, -5.1301e-01,  1.2760e-01,  8.5964e-01,
        -7.8568e-01, -1.9140e-01, -3.3234e-03,  8.9442e-01,  3.9503e-01,
        -9.5075e-01,  9.3803e-01, -3.1701e-01,  5.6457e-01,  1.0153e+00,
         8.6391e-01, -1.9971e-01,  4.7159e-01, -2.6864e-01,  1.2548e-01,
        -2.7490e+00, -5.5106e-01,  4.0409e-02,  2.9164e-01, -2.8792e-01,
        -2.2740e-02, -4.0295e-01,  1.4771e-01,  8.0503e-02, -6.8115e-01],
       [ 1.1266e+00, -6.1616e-01, -7.0097e-01, -7.1332e-01, -5.1856e-01,
         5.0459e-01,  6.6192e-01,  5.5820e-01,  2.8674e-01,  9.3260e-01,
         3.5756e-01,  4.8560e-01, -1.6895e+00, -3.1646e-02,  1.6962e-01,
        -8.4166e-01, -6.1023e-01,  3.3499e-01, -1.

In [19]:
#Embed our text!
df_mini_alltweets['embedded_text'] = df_mini_alltweets['text_cantok'].apply(embed_text)

In [20]:
#Check out new embedded_text column at the end!
df_mini_alltweets

Unnamed: 0,tweet_id,text,user_id,in_reply_to_status_id,retweeted_status_id,retweet_count,favorite_count,num_hashtags,num_urls,num_mentions,target,statuses_count,followers_count,friends_count,favourites_count,listed_count,text_cantok,embedded_text
0,564839233957818369,"Either, we heal, now, as a team, or we will di...",0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[either, ,, we, heal, ,, now, ,, as, a, team, ...","[[0.32019, 0.081182, -0.080413, -0.0066426, -0..."
1,676409305944469504,RT @AmyMek: Shaker Aamer is not even a citizen...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[rt, <allcaps>, <user>, :, shaker, aamer, is, ...","[[0.68243, 0.73589, 0.017529, -0.13763, 0.3602..."
2,564822564556382208,https://t.co/5jtxjrQrKa just watch it),0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[<url>, just, watch, it, )]","[[0.55283, -0.57581, -0.76596, -1.1371, 0.2205..."
3,614985782739087360,"RT @_wintergirl93: Dude, seriously? Go away. h...",0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[rt, <allcaps>, <user>, :, dude, ,, seriously,...","[[0.68243, 0.73589, 0.017529, -0.13763, 0.3602..."
4,578247211655434240,What is he doing?? http://t.co/w19JHah4pV,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[what, is, he, doing, ?, <repeat>, <url>]","[[0.71441, 0.46241, 0.028374, 0.07638, -0.0030..."
5,689503880091021312,RT @BringTheFlag: Iran to Begin Construction o...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[rt, <allcaps>, <user>, :, iran, to, begin, co...","[[0.68243, 0.73589, 0.017529, -0.13763, 0.3602..."
6,564830562670501888,"We turn, not older with years, but newer every...",0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[we, turn, ,, not, older, with, years, ,, but,...","[[0.70057, 1.2429, 0.14358, -0.047923, -0.2265..."
7,564821530824015873,To be able to say how much you love is to love...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[to, be, able, to, say, how, much, you, love, ...","[[0.54964, 0.032752, 0.19354, -0.51119, -0.394..."
8,673889392025600001,RT @keriqbaum: Mondays be like... https://t.co...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[rt, <allcaps>, <user>, :, mondays, be, like, ...","[[0.68243, 0.73589, 0.017529, -0.13763, 0.3602..."
9,554292771658268674,Literally why people voted for him http://t.co...,0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,"[literally, why, people, voted, for, him, <url>]","[[0.35106, 0.43278, 1.0263, -0.71624, -0.2855,..."


In [21]:
df_mini_alltweets.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 342858 entries, 0 to 342857
Data columns (total 18 columns):
tweet_id                 342858 non-null int64
text                     342858 non-null object
user_id                  342858 non-null int64
in_reply_to_status_id    342858 non-null float64
retweeted_status_id      342858 non-null float64
retweet_count            342858 non-null float32
favorite_count           342858 non-null float32
num_hashtags             342858 non-null float32
num_urls                 342858 non-null float32
num_mentions             342858 non-null float64
target                   342858 non-null int8
statuses_count           342858 non-null float64
followers_count          342858 non-null float64
friends_count            342858 non-null float64
favourites_count         342858 non-null float64
listed_count             342858 non-null float64
text_cantok              342858 non-null object
embedded_text            342858 non-null object
dtypes: float32(4

In [22]:
#For some reason, dropping rows with NA text still left some zero-length tweets
df_mini_alltweets[df_mini_alltweets['embedded_text'].apply(len) == 0]

Unnamed: 0,tweet_id,text,user_id,in_reply_to_status_id,retweeted_status_id,retweet_count,favorite_count,num_hashtags,num_urls,num_mentions,target,statuses_count,followers_count,friends_count,favourites_count,listed_count,text_cantok,embedded_text
65808,756993006394302464,,4209684039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,286.0,2960.0,1693.0,15.0,12.0,[],[]
65809,760221559172788224,,4209684039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,286.0,2960.0,1693.0,15.0,12.0,[],[]
65810,757305380510715904,,4209684039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,286.0,2960.0,1693.0,15.0,12.0,[],[]
65934,761637347234615297,,4301962823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1370.0,11518.0,6737.0,36.0,130.0,[],[]
105524,785949687060639744,,4217244274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1545.0,8339.0,6374.0,3.0,75.0,[],[]
105525,782024108250959872,,4217244274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1545.0,8339.0,6374.0,3.0,75.0,[],[]
160926,763757600936697856,,2882331822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,20576.0,61609.0,20600.0,8763.0,456.0,[],[]
160930,763758118530670592,,2882331822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,20576.0,61609.0,20600.0,8763.0,456.0,[],[]
161932,768368973859680256,,3899481526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,10162.0,26430.0,6100.0,93.0,273.0,[],[]
166230,769667497863835648,,4289431230,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1,1530.0,9767.0,8729.0,360.0,48.0,[],[]


In [23]:
#Removing zero-length tweets
df_mini_alltweets = df_mini_alltweets[df_mini_alltweets['embedded_text'].apply(len) != 0]

#Check (should produce headers with no rows)
df_mini_alltweets[df_mini_alltweets['embedded_text'].apply(len) == 0]

Unnamed: 0,tweet_id,text,user_id,in_reply_to_status_id,retweeted_status_id,retweet_count,favorite_count,num_hashtags,num_urls,num_mentions,target,statuses_count,followers_count,friends_count,favourites_count,listed_count,text_cantok,embedded_text


# Pickle / Save Time!

In [24]:
#Pickle the file
df_mini_alltweets.to_pickle('data/df_mini_alltweets_with_embed.pkl')

In [25]:
#Save the unknown sequence for analysis
np.save('data/GloVe_Unknown_50.npy', glove_dd['unk'])

In [26]:
#Check to see that our .pkl file is there (and note its size)
!ls ./data -lah

total 3.0G
drwxr-xr-x 3 brandon_cummings brandon_cummings 4.0K Aug  9 21:43 .
drwxr-xr-x 4 brandon_cummings brandon_cummings 4.0K Aug 10 13:14 ..
-rw-r--r-- 1 brandon_cummings brandon_cummings 229M Aug  9 21:43 arr_metadata.npy
-rw-r--r-- 1 brandon_cummings brandon_cummings  92M Aug  9 21:43 arr_otherdata.npy
-rw-r--r-- 1 brandon_cummings brandon_cummings 2.9M Aug  9 21:43 arr_targetdata.npy
-rw-r--r-- 1 brandon_cummings brandon_cummings 537M Aug 10 12:57 df_alltweets.pkl
-rw-r--r-- 1 brandon_cummings brandon_cummings 1.3G Aug 10 13:14 df_mini_alltweets_with_embed.pkl
drwxr-xr-x 2 brandon_cummings brandon_cummings 4.0K Jul 19 12:42 GLoVE
-rw-r--r-- 1 brandon_cummings brandon_cummings  528 Aug 10 13:14 GloVe_Unknown_50.npy
-rw-r--r-- 1 brandon_cummings brandon_cummings 243M Aug  9 21:43 text.pkl
-rw-r--r-- 1 brandon_cummings brandon_cummings 575M Aug  9 21:43 token.pkl
