In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/IRAhandle_tweets_1.csv')
english = df[df['language'] == 'English']

In [15]:
english

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
0,9.060000e+17,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,253,,Right,0,0,RightTroll
1,9.060000e+17,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,254,,Right,0,0,RightTroll
2,9.060000e+17,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,10/1/2017 22:50,10/1/2017 22:51,1054,9637,255,RETWEET,Right,0,1,RightTroll
3,9.060000e+17,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,10/1/2017 23:52,10/1/2017 23:52,1062,9642,256,,Right,0,0,RightTroll
4,9.060000e+17,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,10/1/2017 2:13,10/1/2017 2:13,1050,9645,246,RETWEET,Right,0,1,RightTroll
5,9.060000e+17,10_GOP,"Dan Bongino: ""Nobody trolls liberals better th...",Unknown,English,10/1/2017 2:47,10/1/2017 2:47,1050,9644,247,,Right,0,0,RightTroll
6,9.060000e+17,10_GOP,🐝🐝🐝 https://t.co/MorL3AQW0z,Unknown,English,10/1/2017 2:48,10/1/2017 2:48,1050,9644,248,RETWEET,Right,0,1,RightTroll
7,9.060000e+17,10_GOP,'@SenatorMenendez @CarmenYulinCruz Doesn't mat...,Unknown,English,10/1/2017 2:52,10/1/2017 2:53,1050,9644,249,,Right,0,0,RightTroll
8,9.060000e+17,10_GOP,"As much as I hate promoting CNN article, here ...",Unknown,English,10/1/2017 3:47,10/1/2017 3:47,1050,9646,250,,Right,0,0,RightTroll
9,9.060000e+17,10_GOP,After the 'genocide' remark from San Juan Mayo...,Unknown,English,10/1/2017 3:51,10/1/2017 3:51,1050,9646,251,,Right,0,0,RightTroll


In [3]:
y = english['account_category'].values
X = english['content'].values

X_train, X_test, y_train, y_test = train_test_split(X, y)
X_sub, y_sub = X_train[:10000], y_train[:10000]

In [4]:
vectorizer = TfidfVectorizer()
classifier = MultinomialNB()

In [5]:
X_tfidf = vectorizer.fit_transform(X_sub)
classifier.fit(X_tfidf, y_sub)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [6]:
classifier.score(X_tfidf,y_sub)

0.6784

In [7]:
np.unique(y_sub, return_counts=True)

(array(['Commercial', 'Fearmonger', 'HashtagGamer', 'LeftTroll',
        'NewsFeed', 'NonEnglish', 'RightTroll', 'Unknown'], dtype=object),
 array([ 423,   30, 1139, 2791, 1132,  105, 4344,   36]))

In [8]:
pred = classifier.predict(X_tfidf)
np.unique(pred, return_counts=True)

(array(['Commercial', 'HashtagGamer', 'LeftTroll', 'NewsFeed',
        'RightTroll'], dtype='<U12'), array([  20,  110, 2142,  399, 7329]))

In [9]:
kmeans = KMeans(n_clusters=8)
kmeans.fit(X_tfidf)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [10]:
max_index = np.argsort(-kmeans.cluster_centers_)[:,30:]
inv_map = {v:k for k,v in vectorizer.vocabulary_.items()}
best_words = np.zeros(max_index.shape,dtype=object)
for i,index_array in enumerate(max_index):
    for j,index in enumerate(index_array):
        best_words[i,j] = inv_map[index]
        
best_words

array([['what', 'get', 'will', ..., 'fix', 'fiverights', 'ㅤㅤ'],
       ['your', 'at', 'like', ..., 'ifinditfunnyhow', 'ignore',
        'kevski07'],
       ['not', 'love', 'thank', ..., 'frequently', 'fridays', 'ㅤㅤ'],
       ...,
       ['now', 'out', 'amb', ..., 'hypes', 'hznp', 'ㅤㅤ'],
       ['can', 'at', 'you', ..., 'infinite', 'infiltrated', 'ㅤㅤ'],
       ['have', 'but', 'at', ..., 'goose', 'gops', 'ㅤㅤ']], dtype=object)

In [11]:
pd.DataFrame(best_words)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30313,30314,30315,30316,30317,30318,30319,30320,30321,30322
0,what,get,will,rt,than,think,their,about,stop,america,...,fizzles,fixing,fixes,fixer_guy,fixed,fixating,fixate,fix,fiverights,ㅤㅤ
1,your,at,like,have,but,his,what,get,who,lsu,...,ifttt,ift9mxqwkl,ift1jq8ggm,ifsqfyueny,iflpayments,iflopsets,4b975ihjov,ifinditfunnyhow,ignore,kevski07
2,not,love,thank,my,know,so,but,think,me,all,...,frfr,fretting,fresno,freshoutthe90s,freshly,fresh_flames1,fresh,frequently,fridays,ㅤㅤ
3,they,people,truth,that,mj4n1kuw75,usfreedomarmy,when,want,not,then,...,financially,financial,financed,finance,finals,finally,finale,final,findkatelin,ㅤㅤ
4,that,out,will,anti,new,white,watch,hillary,not,well,...,fulfillment,fulfill,fukushimaagain,fukushima,fukitimstarvin,fukin,fuentits_,fuels,fulton,ㅤㅤ
5,now,out,amb,woman,atlanta,shooting,shot,charlottesville,clinton,arrested,...,hzl41wj8ym,hzhckejwp8,hzgysxhvgo,hzeipv50ti,hytbfu6ut6,hysterical,hypothesis,hypes,hznp,ㅤㅤ
6,can,at,you,just,from,tcot,cops,need,was,wiunion,...,influences,influencers,influence,inflection,inflation,inflatable,infinity,infinite,infiltrated,ㅤㅤ
7,have,but,at,has,black,up,amp,what,about,our,...,gopoversight,gopnuk86od,gophers,gope,gopclowncar,gop2016,goosebumps,goose,gops,ㅤㅤ
