In [88]:
import pandas as pd
import numpy as np
import sklearn.utils
import re
import functools
import contractions

from datetime import datetime
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from string import punctuation
from collections import Counter

stop = set(stopwords.words('english'))

np.random.seed(101)
rand_seed = 101

tqdm().pandas()

In [89]:
%%time
ROOT = "./csv"
# df_nbc = pd.read_csv(
#     ROOT + "/tweets.csv", 
#     encoding='utf-8', 
#     low_memory=False, 
#     parse_dates=False
# )

num_rows = 10000000

df = pd.read_csv(
    ROOT + "/scraped_tweets.csv", 
    encoding='utf-8', 
    nrows = num_rows,
    low_memory=False, 
    parse_dates=False
)

Wall time: 52.4 s


In [90]:
df[[
    'retweeted_status_id',
    'in_reply_to_status_id'
]] = df[[
    'retweeted_status_id',
    'in_reply_to_status_id'
]].fillna(0).astype(np.int64)

In [91]:
stop_words = []

f = open('./data/stopwords.txt', 'r')
for l in f.readlines():
    stop_words.append(l.replace('\n', ''))
    
additional_stop_words = ['t', 'will']
stop_words += additional_stop_words

print(len(stop_words))

668


In [92]:
def remove_non_ascii(s): 
    return "".join(i for i in s if ord(i)<128)

def clean_text(text):
    text = text.lower()
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r"what's", "what is ", text)
    text = text.replace(r'(ap)', '')
    text = re.sub(r"\'s", " is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"\\", "", text)
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)
    text = re.sub(r'[^a-zA-Z ?!]+', '', text)
    text = re.sub(r"(x[abcdef0-9]{0,2})?", '', text)
    text = re.sub(r"rt", '', text)
    text = re.sub(r"amp", '', text)
    text = remove_non_ascii(text)
    text = text.strip()
    return text

def tokenizer(text):
#     text = contractions.fix(text)
    text = clean_text(text)    
    tokens = [word_tokenize(sent) for sent in sent_tokenize(text)]
    tokens = list(functools.reduce(lambda x,y: x+y, tokens))
    tokens = list(filter(lambda token: token not in (stop_words + list(punctuation)) , tokens))
    return tokens

def stem_words(words):
    """Stem words from a list of tokenized words"""
    stemmer = PorterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_words(words):
    """Lemmatize verbs from a list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

In [95]:
# df['tokens'] = df['text'].progress_map(lambda d: tokenizer(d))

# df['stems'] = df['tokens'].progress_apply(stem_words)
df['lemmas'] = df['tokens'].progress_apply(lemmatize_words)

Exception ignored in: <bound method tqdm.__del__ of 45659/|/  0%|| 45659/19514378 [2:19:46<2:20:11, 2314.59it/s]>
Traceback (most recent call last):
  File "C:\Users\matt\Anaconda3\lib\site-packages\tqdm\_tqdm.py", line 857, in __del__
    self.close()
  File "C:\Users\matt\Anaconda3\lib\site-packages\tqdm\_tqdm_notebook.py", line 203, in close
    super(tqdm_notebook, self).close(*args, **kwargs)
  File "C:\Users\matt\Anaconda3\lib\site-packages\tqdm\_tqdm.py", line 1076, in close
    self._decr_instances(self)
  File "C:\Users\matt\Anaconda3\lib\site-packages\tqdm\_tqdm.py", line 428, in _decr_instances
    with cls._lock:
  File "C:\Users\matt\Anaconda3\lib\site-packages\tqdm\_tqdm.py", line 102, in __enter__
    self.acquire()
  File "C:\Users\matt\Anaconda3\lib\site-packages\tqdm\_tqdm.py", line 95, in acquire
    lock.acquire()
KeyboardInterrupt


In [96]:
print(df.dtypes)
pd.set_option('display.max_columns', None)
print(df.head(30))
print(df.tail())

user_id                   int64
user_key                 object
created_at                int64
created_str              object
retweet_count             int64
retweeted                  bool
favorite_count            int64
text                     object
tweet_id                  int64
source                   object
hashtags                 object
expanded_urls            object
mentions                 object
retweeted_status_id       int64
in_reply_to_status_id     int64
tokens                   object
stems                    object
lemmas                   object
dtype: object
               user_id         user_key     created_at  \
0   961701459900342272  Jeannie22757716  1518122470246   
1   961701477164105728        _p_body__  1518122474362   
2   961701487507312641   6728FixerUpper  1518122476828   
3   961701511289073665        LenAulett  1518122482498   
4   961701912880988161       Dsquared69  1518122578245   
5   961701916358135808  SlizewskiEdwar1  1518122579074   
6   

29  [george, bush, abu, dhabi, trash, trump, embra...  
                    user_id      user_key     created_at  \
9999995  965237446358085632  alison_rixon  1518965515113   
9999996  965237446618132480     100lthere  1518965515175   
9999997  965237446286901251         MDS46  1518965515096   
9999998  965237446039494663   tonya110104  1518965515037   
9999999  965237446278496256  MattProutUWO  1518965515094   

                            created_str  retweet_count  retweeted  \
9999995  Sun Feb 18 14:51:55 +0000 2018              0      False   
9999996  Sun Feb 18 14:51:55 +0000 2018              0      False   
9999997  Sun Feb 18 14:51:55 +0000 2018              0      False   
9999998  Sun Feb 18 14:51:55 +0000 2018              0      False   
9999999  Sun Feb 18 14:51:55 +0000 2018              0      False   

         favorite_count                                               text  \
9999995               0  b"Honestly, I'm nauseous. That tweet using a s...   
9999996     

In [97]:
df.to_csv(ROOT + "/propertweets.csv", encoding = "utf-8")

In [73]:
def keywords(df, field):
    tokens = df[field]
    alltokens = []
    for token_list in tokens:
        alltokens += token_list
    counter = Counter(alltokens)
    return counter.most_common(30)

for word, count in keywords(df, 'tokens'):
    print("Word: {}, Count: {}".format(word, count))

print("\n----\n")

for word, count in keywords(df, 'stems'):
    print("Word: {}, Count: {}".format(word, count))
    
print("\n----\n")
for word, count in keywords(df, 'lemmas'):
    print("Word: {}, Count: {}".format(word, count))


Word: obama, Count: 43681
Word: trump, Count: 25351
Word: maga, Count: 16565
Word: fbi, Count: 11558
Word: deficits, Count: 10832
Word: president, Count: 10543
Word: realdonaldtrump, Count: 10157
Word: clinton, Count: 10104
Word: hillary, Count: 9401
Word: media, Count: 7258
Word: mueller, Count: 7127
Word: rand, Count: 6740
Word: paul, Count: 6528
Word: ago, Count: 6457
Word: trillion, Count: 6441
Word: memo, Count: 6172
Word: nunes, Count: 5315
Word: uranium, Count: 5172
Word: weeks, Count: 5059
Word: russia, Count: 4744
Word: state, Count: 4490
Word: russian, Count: 4163
Word: dossier, Count: 4134
Word: republican, Count: 4012
Word: fuck, Count: 3965
Word: voted, Count: 3909
Word: ta, Count: 3836
Word: isn, Count: 3750
Word: bill, Count: 3593
Word: scandal, Count: 3537

----

Word: obama, Count: 43738
Word: trump, Count: 25484
Word: maga, Count: 16569
Word: presid, Count: 13958
Word: deficit, Count: 13779
Word: fbi, Count: 11558
Word: clinton, Count: 11142
Word: realdonaldtrump, Cou