In [2]:
import pandas as pd
import nltk
import string
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.naive_bayes import MultinomialNB

In [6]:
# Load the tweet CSV (path is relative to the notebook's working directory).
df = pd.read_csv('data/IRAhandle_tweets_1.csv')

In [7]:
# Size of the raw frame as (rows, columns).
df.shape

(381016, 15)

In [8]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
0,9.06e+17,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,253,,Right,0,0,RightTroll
1,9.06e+17,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,254,,Right,0,0,RightTroll
2,9.06e+17,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,10/1/2017 22:50,10/1/2017 22:51,1054,9637,255,RETWEET,Right,0,1,RightTroll
3,9.06e+17,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,10/1/2017 23:52,10/1/2017 23:52,1062,9642,256,,Right,0,0,RightTroll
4,9.06e+17,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,10/1/2017 2:13,10/1/2017 2:13,1050,9645,246,RETWEET,Right,0,1,RightTroll


In [9]:
# Distinct account categories; this column is used as the label (y) further down.
df['account_category'].unique()

array(['RightTroll', 'NonEnglish', 'Fearmonger', 'LeftTroll', 'Unknown',
       'HashtagGamer', 'NewsFeed', 'Commercial'], dtype=object)

In [10]:
# Peek at a few 'NewsFeed' rows only; the unbounded filter would hand the
# whole matching subset to the notebook renderer.
df[df['account_category'] == 'NewsFeed'].head(10)

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
52727,8.026730e+17,ALBUQUERQUEON,Albuquerque men arrested for trying to sell st...,Unknown,English,1/1/2017 0:21,1/1/2017 0:21,60,10,822,,local,1,0,NewsFeed
52728,8.026730e+17,ALBUQUERQUEON,Albuquerque Pokemon Go players here to stay an...,Unknown,English,1/1/2017 14:51,1/1/2017 14:51,60,10,830,,local,1,0,NewsFeed
52729,8.026730e+17,ALBUQUERQUEON,Queen Elizabeth II misses church due to 'heavy...,Unknown,English,1/1/2017 14:53,1/1/2017 14:53,60,10,831,,local,1,0,NewsFeed
52730,8.026730e+17,ALBUQUERQUEON,New UN chief urges New Year's resolution: 'Put...,Unknown,English,1/1/2017 15:23,1/1/2017 15:23,60,10,832,,local,1,0,NewsFeed
52731,8.026730e+17,ALBUQUERQUEON,China says it will shut down ivory trade by en...,Unknown,English,1/1/2017 15:53,1/1/2017 15:53,60,10,833,,local,1,0,NewsFeed
52732,8.026730e+17,ALBUQUERQUEON,Rural New Mexico exports mentoring model for p...,Unknown,English,1/1/2017 16:23,1/1/2017 16:23,60,10,834,,local,1,0,NewsFeed
52733,8.026730e+17,ALBUQUERQUEON,County enters last weekend of Tavern Taxi prog...,Unknown,English,1/1/2017 1:53,1/1/2017 1:53,60,10,823,,local,1,0,NewsFeed
52734,8.026730e+17,ALBUQUERQUEON,Eye on New Mexico: Water in New Mexico https:/...,Unknown,English,1/1/2017 22:23,1/1/2017 22:23,60,10,835,,local,1,0,NewsFeed
52735,8.026730e+17,ALBUQUERQUEON,Family searching for missing woman from Cubero...,Unknown,English,1/1/2017 2:23,1/1/2017 2:23,60,10,824,,local,1,0,NewsFeed
52736,8.026730e+17,ALBUQUERQUEON,New Year’s Eve weather forecast https://t.co/9...,Unknown,English,1/1/2017 2:53,1/1/2017 2:53,60,10,825,,local,1,0,NewsFeed


In [11]:
# Keep every row whose account category is anything other than 'NonEnglish'.
df_english = df.loc[df['account_category'].ne('NonEnglish')]

In [12]:
# Size of the frame after excluding the 'NonEnglish' category.
df_english.shape

(292080, 15)

In [13]:
# One translation table that deletes digits and all ASCII punctuation except
# '#', which is deliberately left in the text.
_STRIP_TABLE = str.maketrans(
    '', '', string.punctuation.replace('#', '') + string.digits
)

# Filled on first use; see _nltk_resources().
_NLTK_RESOURCES = {}


def _nltk_resources():
    """Return (stop-word set, stemmer), building them on the first call only."""
    if not _NLTK_RESOURCES:
        # Build both objects before storing either, so a failure (e.g. the
        # stop-word corpus not being available) never leaves a half-filled cache.
        stop_set = frozenset(stopwords.words('english'))
        stemmer = SnowballStemmer('english')
        _NLTK_RESOURCES.update(stop_words=stop_set, stemmer=stemmer)
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['stemmer']


def custom_tokenizer(text):
    """Turn one tweet into a list of stemmed, stop-word-free tokens.

    Steps, in order: delete punctuation (except '#') and digits, lower-case,
    tokenize with ``word_tokenize``, drop English stop words, then stem each
    remaining token with the English Snowball stemmer.

    The stop-word list and the stemmer are created once and reused, and
    stop-word lookup uses a set; this function is called once per document by
    the vectorizer, so rebuilding them per call is pure overhead. The returned
    tokens are the same as with per-call construction.

    NOTE(review): punctuation is stripped before stop-word filtering, so any
    stop-list entries that contain an apostrophe can no longer match --
    confirm whether that is intended.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    stop_words, stemmer = _nltk_resources()

    # Deleting characters and lower-casing are independent, so one translate
    # pass followed by lower() matches the previous two-pass cleanup.
    cleaned = text.translate(_STRIP_TABLE).lower()

    return [
        stemmer.stem(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]

In [14]:
# Bag-of-words vectorizer that delegates tokenization to custom_tokenizer.
cv = CountVectorizer(tokenizer=custom_tokenizer)

In [11]:
# Work on the first 10,000 rows (by position) for a quick trial run.
test_df = df_english.iloc[:10000]

In [12]:
# Confirm the sample size, then preview its first rows.
print(test_df.shape)
test_df.head()

(10000, 15)


Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
0,9.06e+17,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,253,,Right,0,0,RightTroll
1,9.06e+17,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,254,,Right,0,0,RightTroll
2,9.06e+17,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,10/1/2017 22:50,10/1/2017 22:51,1054,9637,255,RETWEET,Right,0,1,RightTroll
3,9.06e+17,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,10/1/2017 23:52,10/1/2017 23:52,1062,9642,256,,Right,0,0,RightTroll
4,9.06e+17,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,10/1/2017 2:13,10/1/2017 2:13,1050,9645,246,RETWEET,Right,0,1,RightTroll


In [9]:
# Tweet text is the feature, account category is the label.
X = test_df['content']
y = test_df['account_category']

# Hold out 30% of the sample; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3
)

NameError: name 'test_df' is not defined

In [14]:
# Training split sizes: (features, labels).
X_train.shape, y_train.shape

((7000,), (7000,))

In [15]:
# Held-out split sizes: (features, labels).
X_test.shape, y_test.shape

((3000,), (3000,))

In [16]:
# Build bag-of-words features: the vocabulary is learned from the training
# tweets only, then that same vocabulary is used to encode the test tweets.
cv = CountVectorizer(tokenizer=custom_tokenizer)

X_train_cv = cv.fit_transform(X_train)
X_test_cv  = cv.transform(X_test)

In [17]:
# Model with 10,000 tweets, dropping hashtags.
# NOTE(review): custom_tokenizer as defined in this notebook keeps '#', so this
# label presumably refers to an earlier tokenizer version -- confirm.
nb = MultinomialNB().fit(X_train_cv, y_train)

# Mean accuracy on the held-out tweets (displayed as the cell output).
nb.score(X_test_cv, y_test)

0.823

In [10]:
# Tweet text is the feature, account category is the label.
X = df_english['content']
y = df_english['account_category']

# Hold out 30% of the filtered data; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3
)

In [19]:
# Training split sizes: (features, labels).
X_train.shape, y_train.shape

((204456,), (204456,))

In [11]:
# Build bag-of-words features: the vocabulary is learned from the training
# tweets only, then that same vocabulary is used to encode the test tweets.
cv = CountVectorizer(tokenizer=custom_tokenizer)

X_train_cv = cv.fit_transform(X_train)
X_test_cv  = cv.transform(X_test)

In [None]:
# Model with all tweets, dropping hashtags.
# NOTE(review): custom_tokenizer as defined in this notebook keeps '#', so this
# label presumably refers to an earlier tokenizer version -- confirm.
nb = MultinomialNB().fit(X_train_cv, y_train)

# Mean accuracy on the held-out tweets (displayed as the cell output).
nb.score(X_test_cv, y_test)

In [None]:
# Predicted account category for every held-out tweet.
preds = nb.predict(X_test_cv)

In [None]:
# First ten held-out tweets, to read alongside their labels and predictions.
X_test.head(10)

In [None]:
# True categories of the first ten held-out tweets.
y_test.head(10)

In [None]:
# Predicted categories for the first ten held-out tweets.
print(preds[:10])

In [None]:
# Full text of the held-out tweet at position 1 (the second one).
X_test.iloc[1]

In [13]:
# Tweet text is the feature, account category is the label.
X = df_english['content']
y = df_english['account_category']

# Hold out 30% of the filtered data; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3
)

# Learn the vocabulary from the training tweets only, then encode both splits
# with that same vocabulary.
cv = CountVectorizer(tokenizer=custom_tokenizer)

X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [14]:
# Model with all tweets, keeping hashtags.
nb = MultinomialNB().fit(X_train_cv, y_train)

# Mean accuracy on the held-out tweets (displayed as the cell output).
nb.score(X_test_cv, y_test)

0.7975554642563681

In [16]:
# Persist the trained classifier.
with open('nb_model.pkl', 'wb') as picklefile:
    pickle.dump(nb, picklefile)

# Persist the fitted vectorizer as well: the classifier only accepts features
# encoded with this exact vocabulary, so saving `nb` alone leaves it unusable
# in a fresh kernel.
# NOTE: the vectorizer references `custom_tokenizer` by name when pickled, so
# that function must be defined again before this file is loaded.
with open('nb_vectorizer.pkl', 'wb') as picklefile:
    pickle.dump(cv, picklefile)

In [3]:
# Reload the saved classifier under the name `nb_model`.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickles you produced yourself.
# NOTE: only the classifier is restored here. The fitted vectorizer `cv` is not,
# so text cannot be encoded for this model until `cv` exists in the kernel.
with open('nb_model.pkl', 'rb') as picklefile:
    nb_model = pickle.load(picklefile)

In [20]:
# Sanity check: the reloaded model predicts on the first encoded test row.
nb_model.predict(X_test_cv[0])

array(['NewsFeed'], dtype='<U12')

In [21]:
# Classify one hand-written tweet: encode it with the fitted vectorizer
# (transform expects a list of documents) and take the single prediction.
mytweet = 'Florida man feeds food to alligator'
#tokens = custom_tokenizer(mytweet)
tokens_cv = cv.transform([mytweet])
nb.predict(tokens_cv)[0]
# Disabled debugging aid: predict each token on its own. It needs the
# commented-out `tokens` line above to be re-enabled first.
# for word in tokens:
#     print(word)
#     print(nb.predict(cv.transform([word])))

'NewsFeed'

In [4]:
def russiafy_my_tweet(tweet, vectorizer=None, model=None):
    """Print the account category the classifier assigns to ``tweet``.

    Parameters
    ----------
    tweet : str
        Text to classify.
    vectorizer : optional
        Fitted object with ``transform(list_of_str)``. Defaults to the
        notebook-level ``cv``.
    model : optional
        Fitted object with ``predict(features)``. Defaults to the
        notebook-level ``nb``.

    Returns
    -------
    None
        The result is printed, not returned.

    Raises
    ------
    NameError
        If an argument is omitted and the corresponding notebook-level
        ``cv`` / ``nb`` has not been defined in the current kernel.
    """
    # The globals are looked up only when needed, so the function also works
    # with explicitly supplied objects (e.g. ones reloaded from disk) in a
    # kernel where `cv` / `nb` were never created.
    if vectorizer is None:
        vectorizer = cv
    if model is None:
        model = nb

    # transform() expects an iterable of documents, hence the one-item list.
    tokens_cv = vectorizer.transform([tweet])
    print('Your tweet is most similar to a Russian', model.predict(tokens_cv)[0])

In [5]:
# Example call; requires the fitted `cv` and `nb` to exist in the kernel.
russiafy_my_tweet('Florida man feeds food to alligator')

NameError: name 'cv' is not defined

In [27]:
# Example call with a tweet that includes a hashtag.
russiafy_my_tweet('Trump is the best forever, #MAGA')

Your tweet is most similar to a Russian RightTroll


In [28]:
# Example call with a second hand-written tweet.
russiafy_my_tweet('Why do cops hate people')

Your tweet is most similar to a Russian LeftTroll


### Now let's try with 2-grams

In [16]:
# Tweet text is the feature, account category is the label.
X = df_english['content']
y = df_english['account_category']

# Hold out 30% of the filtered data; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3
)

# This time count single tokens and adjacent token pairs (n-grams of size 1-2).
cv = CountVectorizer(ngram_range=(1, 2), tokenizer=custom_tokenizer)

X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [17]:
# Model with 2 grams.
nb = MultinomialNB().fit(X_train_cv, y_train)

# Mean accuracy on the held-out tweets (displayed as the cell output).
nb.score(X_test_cv, y_test)

0.8213274901853373