# AMOD 5410 - Big Data
## Project: Bot Detection
### By: Matt Emmons (0221920)

## Introduction

Write stuff about this project. 3k+ words?

## Imports

In [13]:
import re
import numpy as np
import pandas as pd
import sklearn.utils

import matplotlib.pylab as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
import unicodedata

import nltk
import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

np.random.seed(101)
rand_seed = 101

n_rows = 1000000

In [None]:
%%time
ROOT = "./csv"
df_nbc = pd.read_csv(
    ROOT + "/tweets.csv", 
    encoding='utf-8', 
    low_memory=False, 
    parse_dates=False
)

df_scraped = pd.read_csv(
    ROOT + "/scraped_tweets.csv", 
    encoding='utf-8', 
    nrows=n_rows,
    low_memory=False, 
    parse_dates=False
)

In [None]:
# df['Column'] = df['Column'].str.decode('ascii')
# df_scraped['text'] = df_scraped['text'].astype(str).apply(decode(encoding = 'utf-8'))
# print(df_scraped['text'].head())
# print(df_nbc['text'].head())

## Preprocessing

Let's remove unneeded columns.

In [3]:
delcols = [
    'created_at',
    'created_str',
    'expanded_urls',
    'in_reply_to_status_id',
    'source'
]
for col in delcols:
    del df_nbc[col]
    del df_scraped[col]
    
del df_nbc['posted']

Need to ensure all columns have consistent datatypes between the two dataframes.

In [4]:
df_nbc[['user_id', 'retweet_count', 'favorite_count', 'tweet_id', 'retweeted_status_id']] = df_nbc[['user_id', 'retweet_count', 'favorite_count', 'tweet_id', 'retweeted_status_id']].fillna(0).astype(int)
df_nbc[['user_key', 'text']] = df_nbc[['user_key', 'text']].astype('str')
df_nbc[['retweeted']] = df_nbc[['retweeted']].astype('bool')
df_scraped[['retweeted_status_id']] = df_scraped[['retweeted_status_id']].fillna(0).astype(int)

In [5]:
print(df_nbc.dtypes)
print(df_scraped.dtypes)

print(list(df_nbc.dtypes) == list(df_scraped.dtypes))

user_id                 int64
user_key               object
retweet_count           int64
retweeted                bool
favorite_count          int64
text                   object
tweet_id                int64
hashtags               object
mentions               object
retweeted_status_id     int64
dtype: object
user_id                 int64
user_key               object
retweet_count           int64
retweeted                bool
favorite_count          int64
text                   object
tweet_id                int64
hashtags               object
mentions               object
retweeted_status_id     int64
dtype: object
True


In [6]:
print(df_nbc.head())
print("\n---------------------\n")
print(df_scraped.head())

      user_id         user_key  retweet_count  retweeted  favorite_count  \
0  2532611755        kathiemrr              0       True               0   
1  2531159968   traceyhappymom              0       True               0   
2           0    evewebster373              0       True               0   
3  4840551713      blacktolive             18      False              17   
4  1694026190  jacquelinisbest              0      False               0   

                                                text            tweet_id  \
0    #ThingsDoneByMistake kissing auntie in the lips  836227891897651200   
1  RT @mc_derpin: #TheOlderWeGet the more pessimi...  765198948239810560   
2  RT @dmataconis: Ready To Feel Like A Failure? ...  616002306572746752   
3    Amen! #blacklivesmatter https://t.co/wGffaOqgzl  776693302926147584   
4  RT @NahBabyNah: Twitchy: Chuck Todd caught out...  777594647875059712   

                  hashtags        mentions  retweeted_status_id  
0  ["ThingsDoneByMis

### Prepare the data

What we need to do is this:

- Attach a class feature to the nbc dataset
- Since we do not know the class of the scraped dataset, we leave it for now
- Create a new dataset merged between a subset of nbc and scraped datasets to be our training set
- All rows left out of the merged subset will become the test subset

In [7]:
df_nbc['class'] = 1
df_test = df_scraped.copy()
df_test['class'] = np.nan

In [8]:
# Merge df_test and df_nbc
# train/test/val split
# apply classifiers
df = pd.concat([df_nbc, df_test], ignore_index = True)
print(df.head())
print(df.dtypes)
print(len(df.index))

      user_id         user_key  retweet_count  retweeted  favorite_count  \
0  2532611755        kathiemrr              0       True               0   
1  2531159968   traceyhappymom              0       True               0   
2           0    evewebster373              0       True               0   
3  4840551713      blacktolive             18      False              17   
4  1694026190  jacquelinisbest              0      False               0   

                                                text            tweet_id  \
0    #ThingsDoneByMistake kissing auntie in the lips  836227891897651200   
1  RT @mc_derpin: #TheOlderWeGet the more pessimi...  765198948239810560   
2  RT @dmataconis: Ready To Feel Like A Failure? ...  616002306572746752   
3    Amen! #blacklivesmatter https://t.co/wGffaOqgzl  776693302926147584   
4  RT @NahBabyNah: Twitchy: Chuck Todd caught out...  777594647875059712   

                  hashtags        mentions  retweeted_status_id  class  
0  ["ThingsDo

## Text Manipulation

In [9]:
tt = TweetTokenizer()
df['tokenized_text'] = df['text'].apply(tt.tokenize)

In [80]:
def normalize(words):
    """
    Pipeline for normalizing the text field
    Only issue is detecting certain UTF-8 encoded symbols
    """
    words = remove_b(words)
    words = remove_unicode_symbols(words)
    words = replace_contractions(words)
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_links(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    stems = stem_words(words)
    lemmas = lemmatize_words(words)
    return (stems, lemmas)

def get_stems(words):
    words = remove_b(words)
    words = remove_unicode_symbols(words)
    words = replace_contractions(words)
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_links(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    stems = stem_words(words)    
    return stems

def get_lemmas(words):
    words = remove_b(words)
    words = remove_unicode_symbols(words)
    words = replace_contractions(words)
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_links(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    lemmas = lemmatize_words(words)
    return lemmas

def remove_b(words):
    """Remove the stupid byte string indicator"""
    new_words = []
    for word in words:
        if word == words[0]:
            if len(word) == 1 and word[0] == 'b':
                continue
            elif len(word) > 1 and word[:2] == "b'":
                word = word[2:]
            new_words.append(word)
        else:
            new_words.append(word)
    return new_words

def remove_unicode_symbols(words):
    """Remove errant unicode symbols"""
    new_words = []
    for word in words:
        new_word = re.sub(r"(x[abcdef0-9]{0,2})?", '', word)
        # new_word = re.sub(r'x([a-z]|[0-9]){1,1}([a-z]|[0-9])', '', word)
        new_words.append(new_word)
    return new_words
        
def replace_contractions(words):
    """Replace contractions in string of text"""
    new_words = []
    for word in words:
        new_word = contractions.fix(word)
        new_words.append(new_word)
    return new_words

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_links(words):
    """Remove links from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'^https?:\/\/.*[\r\n]*', '', word)
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_words(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

In [81]:
samples = [991858, 991500, 991858, 991918]

for sample in samples:
    tokenized = df.iloc[sample].tokenized_text
    text = df.iloc[sample].text
    stems, lemmas = normalize(tokenized)
    print("Text: {}".format(text))
    print("Lemmas: {}".format(lemmas))
    print("Stems: {}".format(stems))
    print("\n----\n")
    
# df['stem_text'] = df['tokenized_text'].apply(get_stems)
# df['lemma_text'] = df['tokenized_text'].apply(get_lemmas)

# df.head()

Text: b'CRAZZZZY TOWN !!\n\n1ST SCHIFF AND WARREN FOILED  \xe2\x80\x94 NOW OBAMA\xe2\x80\x99S FBI BUSTED TRYING TO PAY RUSSIANS 1 MILLION FOR DIRT ON TRUMP !!\n\n\xe2\x80\x9cWhat is clear is the Obama Administration spied on a rival political campaign.\xe2\x80\x9d \xe2\x80\x94Tucker\n\nLOCK THEM UP !!! https://t.co/EQkZclNcbK'
Lemmas: ['crazzzzy', 'town', 'n', 'n1st', 'schiff', 'warren', 'foil', 'obama', 'fbi', 'bust', 'try', 'pay', 'russians', 'one', 'million', 'dirt', 'trump', 'n', 'n', 'clear', 'obama', 'administration', 'spy', 'rival', 'political', 'campaign', 'tucker', 'n', 'nlock']
Stems: ['crazzzzy', 'town', 'n', 'n1st', 'schiff', 'war', 'foil', 'obam', 'fbi', 'bust', 'try', 'pay', 'russ', 'on', 'mil', 'dirt', 'trump', 'n', 'n', 'clear', 'obam', 'admin', 'spi', 'riv', 'polit', 'campaign', 'tuck', 'n', 'nlock']

----

Text: b'Trump blocks release of Russia memo drafted by Democrats https://t.co/T4W4BbicfQ\n#ReleaseTheMemo\n#ReleaseTheDemMemo\n#ReleaseTheDemsMemo https://t.co/1mBh

In [81]:
print(df.head())

      user_id         user_key  retweet_count  retweeted  favorite_count  \
0  2532611755        kathiemrr              0       True               0   
1  2531159968   traceyhappymom              0       True               0   
2           0    evewebster373              0       True               0   
3  4840551713      blacktolive             18      False              17   
4  1694026190  jacquelinisbest              0      False               0   

                                                text            tweet_id  \
0    #ThingsDoneByMistake kissing auntie in the lips  836227891897651200   
1  RT @mc_derpin: #TheOlderWeGet the more pessimi...  765198948239810560   
2  RT @dmataconis: Ready To Feel Like A Failure? ...  616002306572746752   
3    Amen! #blacklivesmatter https://t.co/wGffaOqgzl  776693302926147584   
4  RT @NahBabyNah: Twitchy: Chuck Todd caught out...  777594647875059712   

                  hashtags        mentions  retweeted_status_id  class  \
0  ["ThingsD

## Feature Engineering

Need to create features for classification

In [None]:
# Lexical Diversity
def lexical_diversity(text):
    if len(text) == 0:
        diversity = 0
    else: 
        diversity = float(len(set(text))) / len(text)
    return diversity

## Classification

In [16]:
df = sklearn.utils.shuffle(df)
X = df.iloc[:,0:7]
Y = df.iloc[:,8]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)

In [17]:
%%time
RFModel = RandomForestClassifier(
    n_estimators = 1000, 
    max_depth = 5, 
    max_features = 3, 
    oob_score=False
)

RFModel.fit(X_train, Y_train)
prediction = RFModel.predict_proba(X_test)
auc = roc_auc_score(Y_test, prediction[:,1:2])
print(auc)

RFModel.fit(X_test, Y_test)
prediction = RFModel.predict_proba(X_train)
auc = roc_auc_score(Y_train, prediction[:,1:2])
print(auc)


ValueError: could not convert string to float: 'b\'RT @mitchellvii: UNITE THE BASE! YourVoice\\xc2\\x99 America (2/8) "Uranium One Connects Obama!" https://t.co/c8ga2kr7Hy\''