# AMOD 5410 - Big Data
## Project: Bot Detection
### By: Matt Emmons (0221920)

## Introduction

Write stuff about this project. 3k+ words?

## Imports

In [3]:
import numpy as np
import pandas as pd
import sklearn.utils
from datetime import datetime

import matplotlib.pylab as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

import nltk
import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

np.random.seed(101)
rand_seed = 101

n_rows = 1000000

In [11]:
%%time
ROOT = "./csv"
df_nbc = pd.read_csv(
    ROOT + "/tweets.csv", 
    encoding='utf-8', 
    low_memory=False, 
    parse_dates=False
)

df_scraped = pd.read_csv(
    ROOT + "/scraped_tweets.csv", 
    nrows=n_rows,
#     converters = { 'text': lambda x: str(x.decode('utf-8')) },
    encoding='utf-8', 
    low_memory=False, 
    parse_dates=False
)

CPU times: user 7.54 s, sys: 954 ms, total: 8.49 s
Wall time: 8.6 s


## Preprocessing

Let's remove unneeded columns.

In [12]:
delcols = [
    'created_at',
    'created_str',
    'expanded_urls',
    'in_reply_to_status_id',
    'source'
]
for col in delcols:
    del df_nbc[col]
    del df_scraped[col]
    
del df_nbc['posted']

Need to ensure all columns have consistent datatypes between the two dataframes.

In [13]:
df_nbc[['user_id', 'retweet_count', 'favorite_count', 'tweet_id', 'retweeted_status_id']] = df_nbc[['user_id', 'retweet_count', 'favorite_count', 'tweet_id', 'retweeted_status_id']].fillna(0).astype(int)
df_nbc[['user_key', 'text']] = df_nbc[['user_key', 'text']].astype('str')
df_nbc[['retweeted']] = df_nbc[['retweeted']].astype('bool')
df_scraped[['retweeted_status_id']] = df_scraped[['retweeted_status_id']].fillna(0).astype(int)

In [14]:
print(df_nbc.dtypes)
print(df_scraped.dtypes)

print(list(df_nbc.dtypes) == list(df_scraped.dtypes))

user_id                 int64
user_key               object
retweet_count           int64
retweeted                bool
favorite_count          int64
text                   object
tweet_id                int64
hashtags               object
mentions               object
retweeted_status_id     int64
dtype: object
user_id                 int64
user_key               object
retweet_count           int64
retweeted                bool
favorite_count          int64
text                   object
tweet_id                int64
hashtags               object
mentions               object
retweeted_status_id     int64
dtype: object
True


In [15]:
print(df_nbc.head())
print("\n---------------------\n")
print(df_scraped.head())

      user_id         user_key  retweet_count  retweeted  favorite_count  \
0  2532611755        kathiemrr              0       True               0   
1  2531159968   traceyhappymom              0       True               0   
2           0    evewebster373              0       True               0   
3  4840551713      blacktolive             18      False              17   
4  1694026190  jacquelinisbest              0      False               0   

                                                text            tweet_id  \
0    #ThingsDoneByMistake kissing auntie in the lips  836227891897651200   
1  RT @mc_derpin: #TheOlderWeGet the more pessimi...  765198948239810560   
2  RT @dmataconis: Ready To Feel Like A Failure? ...  616002306572746752   
3    Amen! #blacklivesmatter https://t.co/wGffaOqgzl  776693302926147584   
4  RT @NahBabyNah: Twitchy: Chuck Todd caught out...  777594647875059712   

                  hashtags        mentions  retweeted_status_id  
0  ["ThingsDoneByMis

In [148]:
# df_nbc['newtext'] = df_nbc.text.str.decode(encoding = 'UTF-8')
# df_scraped['newtext'] = df_scraped.text.str.decode(encoding = 'UTF-8')

# df_nbc['newtext'] = df_nbc.text.apply( lambda x:  x.decode(encoding = "utf-8"))
# df_scraped['newtext'] = df_scraped.text.apply( lambda x:  x.decode(encoding = "utf-8"))


print(df_nbc.head())
print("\n---------------------\n")
print(df_scraped.head())

      user_id         user_key  retweet_count  retweeted  favorite_count  \
0  2532611755        kathiemrr              0       True               0   
1  2531159968   traceyhappymom              0       True               0   
2           0    evewebster373              0       True               0   
3  4840551713      blacktolive             18      False              17   
4  1694026190  jacquelinisbest              0      False               0   

                                                text            tweet_id  \
0    #ThingsDoneByMistake kissing auntie in the lips  836227891897651200   
1  RT @mc_derpin: #TheOlderWeGet the more pessimi...  765198948239810560   
2  RT @dmataconis: Ready To Feel Like A Failure? ...  616002306572746752   
3    Amen! #blacklivesmatter https://t.co/wGffaOqgzl  776693302926147584   
4  RT @NahBabyNah: Twitchy: Chuck Todd caught out...  777594647875059712   

                  hashtags        mentions  retweeted_status_id  
0  ["ThingsDoneByMis

### Prepare the data

What we need to do is this:

- Attach a class feature to the nbc dataset
- Since we do not know the class of the scraped dataset, we leave it for now
- Create a new dataset merged between a subset of nbc and scraped datasets to be our training set
- All rows left out of the merged subset will become the test subset

In [16]:
df_nbc['class'] = 1
df_test = df_scraped.copy()
df_test['class'] = np.nan

In [17]:
# Merge df_test and df_nbc
# train/test/val split
# apply classifiers
df = pd.concat([df_nbc, df_test], ignore_index = True)
print(df.head())
print(df.dtypes)
print(len(df.index))

      user_id         user_key  retweet_count  retweeted  favorite_count  \
0  2532611755        kathiemrr              0       True               0   
1  2531159968   traceyhappymom              0       True               0   
2           0    evewebster373              0       True               0   
3  4840551713      blacktolive             18      False              17   
4  1694026190  jacquelinisbest              0      False               0   

                                                text            tweet_id  \
0    #ThingsDoneByMistake kissing auntie in the lips  836227891897651200   
1  RT @mc_derpin: #TheOlderWeGet the more pessimi...  765198948239810560   
2  RT @dmataconis: Ready To Feel Like A Failure? ...  616002306572746752   
3    Amen! #blacklivesmatter https://t.co/wGffaOqgzl  776693302926147584   
4  RT @NahBabyNah: Twitchy: Chuck Todd caught out...  777594647875059712   

                  hashtags        mentions  retweeted_status_id  class  
0  ["ThingsDo

## Text Manipulation

In [80]:
import unicodedata, re

class NormalizationPipeline():
    """
    1. tokenize
    2. replace_contractions
    3. remove_non_ascii
    4. to_lowercase
    5. remove_punctuation
    6. replace_numbers
    7. remove_stopwords
    """
    def __init__(self, words):
        self.words = words
        self.process()

    def process(self):
#         self.words = self.replace_contractions()
#         self.words = nltk.word_tokenize(self.words)
#         tt.tokenize
        self.words = self.remove_non_ascii()
        self.words = self.to_lowercase()
        self.words = self.remove_b()
        self.words = self.remove_links()
        self.words = self.remove_punctuation()
        self.words = self.replace_numbers()
        self.words = self.remove_stopwords()
        self.stems = self.stem_words()
        self.lemmas = self.lemmatize_verbs()
        return self.stems, self.lemmas, self.words

    def replace_contractions(self):
        """Replace contractions in string of text"""
        self.words = contractions.fix(self.words)
        
        
    def remove_non_ascii(self):
        """Remove non-ASCII characters from list of tokenized words"""
        new_words = []
        for word in self.words:
            new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
            new_words.append(new_word)
        return new_words

    def to_lowercase(self):
        """Convert all characters to lowercase from list of tokenized words"""
        new_words = []
        for word in self.words:
            new_word = word.lower()
            new_words.append(new_word)
        return new_words

    def remove_b(self):
        """Remove the stupid byte string indicator"""
        new_words = []
        for word in self.words:
            if word == self.words[0]:
                if len(word) == 1 and word[0] == 'b':
                    continue
                elif len(word) > 1 and word[:2] == "b'":
                    word = word[2:]
                new_words.append(word)
            else:
                new_words.append(word)
        return new_words
    
    def remove_links(self):
        """Remove links from list of tokenized words"""
        new_words = []
        for word in self.words:
            new_word = re.sub(r'^https?:\/\/.*[\r\n]*', '', word)
            new_words.append(new_word)
        return new_words

    def remove_punctuation(self):
        """Remove punctuation from list of tokenized words"""
        new_words = []
        for word in self.words:
            new_word = re.sub(r'[^\w\s]', '', word)
            if new_word != '':
                new_words.append(new_word)
        return new_words

    def replace_numbers(self):
        """Replace all interger occurrences in list of tokenized words with textual representation"""
        p = inflect.engine()
        new_words = []
        for word in self.words:
            if word.isdigit():
                new_word = p.number_to_words(word)
                new_words.append(new_word)
            else:
                new_words.append(word)
        return new_words

    def remove_stopwords(self):
        """Remove stop words from list of tokenized words"""
        new_words = []
        for word in self.words:
            if word not in stopwords.words('english'):
                new_words.append(word)
        return new_words

    def stem_words(self):
        """Stem words in list of tokenized words"""
        stemmer = LancasterStemmer()
        stems = []
        for word in self.words:
            stem = stemmer.stem(word)
            stems.append(stem)
        return stems

    def lemmatize_verbs(self):
        """Lemmatize verbs in list of tokenized words"""
        lemmatizer = WordNetLemmatizer()
        lemmas = []
        for word in self.words:
            lemma = lemmatizer.lemmatize(word, pos='v')
            lemmas.append(lemma)
        return lemmas
    
# tt = TweetTokenizer()
# df['tokenized_text'] = df['text'].apply(tt.tokenize) 

# count = 0
# for idx, row in df.iterrows():
#     print(row['tokenized_text'])
#     count += 1
#     if count == 10:
#         break
samples = [1, 5, 205100, 500192, 991858]

for sample in samples:
    tokenized = df.ix[sample].tokenized_text
    text = df.ix[sample].text
    stems, lemmas, words = NormalizationPipeline(tokenized).process()
    print("Text: {}".format(text))
    print("Stems: {}".format(stems))
    print("Lemmas: {}".format(lemmas))
    print("Words: {}".format(words))
    print("\n----\n")
    
# sample = df.ix[991858].tokenized_text
# words = NormalizationPipeline(sample).process()
# print(words)

Text: RT @mc_derpin: #TheOlderWeGet the more pessimistic we are https://t.co/zS3jHZJl8P
Stems: ['rt', 'mc_derpin', 'theolderweget', 'pessim']
Lemmas: ['rt', 'mc_derpin', 'theolderweget', 'pessimistic']
Words: ['rt', 'mc_derpin', 'theolderweget', 'pessimistic']

----

Text: RT @mcicero10: #BernieSanders #Trump people should rally TOGETHER against the establishment who is 💩-ing on both choices #thefix
Stems: ['rt', 'mcicero10', 'berniesand', 'trump', 'peopl', 'ral', 'togeth', 'est', 'ing', 'cho', 'thefix']
Lemmas: ['rt', 'mcicero10', 'berniesanders', 'trump', 'people', 'rally', 'together', 'establishment', 'ing', 'choices', 'thefix']
Words: ['rt', 'mcicero10', 'berniesanders', 'trump', 'people', 'rally', 'together', 'establishment', 'ing', 'choices', 'thefix']

----

Text: b'BREAKING: OBAMA DIRECTLY IMPLICATED IN FBI COVER-UP OF THE HILLARY INVES... https://t.co/ICxTmFEKIv via @YouTube'
Stems: ['break', 'obam', 'direct', 'imply', 'fbi', 'coverup', 'hil', 'inv', 'via', 'youtub']
Lemmas: [

In [81]:
print(df.head())

      user_id         user_key  retweet_count  retweeted  favorite_count  \
0  2532611755        kathiemrr              0       True               0   
1  2531159968   traceyhappymom              0       True               0   
2           0    evewebster373              0       True               0   
3  4840551713      blacktolive             18      False              17   
4  1694026190  jacquelinisbest              0      False               0   

                                                text            tweet_id  \
0    #ThingsDoneByMistake kissing auntie in the lips  836227891897651200   
1  RT @mc_derpin: #TheOlderWeGet the more pessimi...  765198948239810560   
2  RT @dmataconis: Ready To Feel Like A Failure? ...  616002306572746752   
3    Amen! #blacklivesmatter https://t.co/wGffaOqgzl  776693302926147584   
4  RT @NahBabyNah: Twitchy: Chuck Todd caught out...  777594647875059712   

                  hashtags        mentions  retweeted_status_id  class  \
0  ["ThingsD

## Feature Engineering

Need to create features for classification

In [None]:
# Lexical Diversity
def lexical_diversity(text):
    if len(text) == 0:
        diversity = 0
    else: 
        diversity = float(len(set(text))) / len(text)
    return diversity



## Classification

In [16]:
df = sklearn.utils.shuffle(df)
X = df.iloc[:,0:7]
Y = df.iloc[:,8]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)

In [17]:
%%time
RFModel = RandomForestClassifier(
    n_estimators = 1000, 
    max_depth = 5, 
    max_features = 3, 
    oob_score=False
)

RFModel.fit(X_train, Y_train)
prediction = RFModel.predict_proba(X_test)
auc = roc_auc_score(Y_test, prediction[:,1:2])
print(auc)

RFModel.fit(X_test, Y_test)
prediction = RFModel.predict_proba(X_train)
auc = roc_auc_score(Y_train, prediction[:,1:2])
print(auc)


ValueError: could not convert string to float: 'b\'RT @mitchellvii: UNITE THE BASE! YourVoice\\xc2\\x99 America (2/8) "Uranium One Connects Obama!" https://t.co/c8ga2kr7Hy\''