## CSC 177-02 Data Warehousing and Data Mining
### Mini-Project 1: Clustering
### 2016 US presedential election Twitter analysis

#### Group members: Aaron Enberg,

In [1]:
import pandas as pd
import sklearn.feature_extraction.text as sk_text
pd.set_option('display.max_colwidth', -1)

In [2]:
column_names = ['Name', 'screen_Name', 'User_ID', 
                'Followers_Count', 'Friends_Count', 
                'Location', 'Description', 'Created_At', 
                'Status_ID', 'Language', 'Place', 
                'Retweet_Count', 'Favorite_Count', 'Text']
tweets = pd.read_table('data/clinton_trump_tweets.txt', names=column_names, encoding='ISO-8859-1')
tweets.columns = tweets.columns.str.lower()

In [None]:
tweets.shape

In [None]:
tweets.dtypes

## Preprocessing

In [3]:
tweets.drop(['name', 'screen_name', 
            'followers_count', 
            'friends_count', 
            'location', 
            'description', 
            'created_at',
            'status_id', 
            'language', 
            'place', 
            'retweet_count', 
            'favorite_count'], axis=1, inplace=True)

In [None]:
tweets.head(n=10)

In [4]:
pattern = r'^RT\s'
 
# matches retweets and removes them
tweets = tweets[tweets.text.str.match(pattern) == False]

In [46]:
tweets.shape

(2416818, 2)

In [None]:
tweets.head(n=10)

In [5]:
# match all hashtags and mentions in a tweet, ignoring possible email addresses
pattern = r'(?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z_]+[A-Za-z0-9_]+)|(?<=^|(?<=[^a-zA-Z0-9-\.]))#([A-Za-z_]+[A-Za-z0-9_]+)'

""" returns a DataFrame with a MultiIndex:
    First index is our original index. Second index is "match" which is a running
    total of the number of occurences of hashtags and mentions for a particular 
    tweet. """
handles_hashtags = tweets.text.str.extractall(pattern)

In [6]:
handles_hashtags.columns = ['handles', 'hashtags']
handles_hashtags = handles_hashtags.reset_index().set_index('level_0')
del handles_hashtags.index.name
handles_hashtags.drop(['match'], axis=1, inplace=True)
# stack handles and hashtags into one column
handles_hashtags = pd.concat([handles_hashtags.handles, handles_hashtags.hashtags]).dropna().to_frame(name='handles_hashtags')

In [7]:
''' returns users along with all the hashtags/handles they've used 
    (each occurrence of a hashtag/handle will also show up in the list) '''
tweets = tweets.join(handles_hashtags, how='inner')
tweets.drop(['text'], axis=1, inplace=True)
handles_hashtags_all = tweets.reset_index().groupby('user_id')['handles_hashtags'].apply(list).to_frame()

In [53]:
handles_hashtags_all.head(n=10)

Unnamed: 0_level_0,handles_hashtags
user_id,Unnamed: 1_level_1
150,"[flangy, nelson, fjordinn, shinypb, fjordinn, fjordinn, iano, thatstacy, utilizer001, Carricohimself, fjordinn, raffi, d6, Sonikku_a, doctorow, gwestr]"
1437,"[overheardinlondon, whodeyintheUK, JimOwczarski, whodeyintheUK, AirlineFlyer, AirlineFlyer]"
1512,"[mozilla, david_bryant, SenFeinstein, marypcbuk]"
1644,"[MaryLovesBooks, BookNerdParadis, matt_hearnden, success, perspective, life, CodyBLister, Marketing, nick_eubanks, contentmarketing, promotionftw, contentmarketing, fredrivett, RobWormley, ContentMarketing, LArtra, BookNerdParadis, adamjayc, bloggingwizard, GuestBloggingStrategy, TamieDearen, BookNerdParadis, JRRTolkien, KatyHuthJones, BookNerdParadis, LorilynRoberts, BookNerdParadis, _awtozer, rahdieh, BookNerdParadis, TolkienQuote, EliseKova, BookNerdParadis, startrailsIV, BookNerdParadis, FebruaryGrace, BookNerdParadis, rtmixmktg, raecarson, BookNerdParadis, WillBluntAU, ContentMarketing, blog, LaurenLynneYA, BookNerdParadis, listbuilding, guestposting, motivation, success, emailmarketing, CRO]"
1668,[garrytan]
1737,"[nehanarkhede, ATO2016, erinscafe, MailChimp, ATO2016, rachaelmaddux, _raven_io, pwnela, GregU, PTC, Vuforia, ATO2016, nehanarkhede, erinscafe, salrelish, KyFaSt, nehanarkhede, MailChimp, ATO2016, SwiftOnSecurity, dcloues, nehanarkhede, jessfraz, erinscafe, nehanarkhede, solrac901, ApacheSpot, MailChimp, ATO2016, adickerson, ComfortablySmug, jdickerson, skamille, Michael_Tsunam1, samnesmith, bakins, JessicaMauerhan]"
2294,[qz]
2311,"[wp, ischafer, TeslaMotors, levie, SlackHQ, worldseries, hitlist_app, wandertab]"
2391,"[NathanFGao, bonaventuresoft, presserb, petshopboys, eldescanso, buzz, andymatic, petshopboys, mknepprath, dansinker, willsh, j3sse_pub, petshopboys, thelastwalt, tonx, MikeIsaac, fmanjoo, migurski, rtraister, atrubens, twitter, j3sse_pub, petshopboys, clearwriter, twitter, timbuckwalter, scarequotes, kathrynyu, summersumz, steveportigal, billder, NathanFGao, robynkanner, thelastwalt, clearwriter, hhavrilesky, TheCut, tonx, bjheinley, kowitz, yodamay, BenKennerly, michael, MaxTemkin, lmc, nczeitgeist, kathrynyu, dansays, tonx, Annaleen, TimGunn, atrubens, twitter, mnik]"
2424,"[max_hodak, ATT, twxcorp, ndustrialio, OneLineage, barryoreilly]"


### active users with at least 20 distinct hashtags/handles

In [8]:
# returns users along with the distinct hashtags/handles they've used
handles_hashtags_distinct = tweets.reset_index().groupby('user_id')['handles_hashtags'].unique().to_frame()
# retrieve only those who have used 20 or more distinct handles/hashtags
users_active = handles_hashtags_distinct[handles_hashtags_distinct.handles_hashtags.str.len() > 19]

In [9]:
users_active = users_active.join(handles_hashtags_all, lsuffix='_distinct', rsuffix='_all', how='inner')

In [79]:
users_active.head()

Unnamed: 0_level_0,handles_hashtags_distinct,handles_hashtags_all,handles_hashtags_stringified
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1644,"[MaryLovesBooks, BookNerdParadis, matt_hearnden, success, perspective, life, CodyBLister, Marketing, nick_eubanks, contentmarketing, promotionftw, fredrivett, RobWormley, ContentMarketing, LArtra, adamjayc, bloggingwizard, GuestBloggingStrategy, TamieDearen, JRRTolkien, KatyHuthJones, LorilynRoberts, _awtozer, rahdieh, TolkienQuote, EliseKova, startrailsIV, FebruaryGrace, rtmixmktg, raecarson, WillBluntAU, blog, LaurenLynneYA, listbuilding, guestposting, motivation, emailmarketing, CRO]","[MaryLovesBooks, BookNerdParadis, matt_hearnden, success, perspective, life, CodyBLister, Marketing, nick_eubanks, contentmarketing, promotionftw, contentmarketing, fredrivett, RobWormley, ContentMarketing, LArtra, BookNerdParadis, adamjayc, bloggingwizard, GuestBloggingStrategy, TamieDearen, BookNerdParadis, JRRTolkien, KatyHuthJones, BookNerdParadis, LorilynRoberts, BookNerdParadis, _awtozer, rahdieh, BookNerdParadis, TolkienQuote, EliseKova, BookNerdParadis, startrailsIV, BookNerdParadis, FebruaryGrace, BookNerdParadis, rtmixmktg, raecarson, BookNerdParadis, WillBluntAU, ContentMarketing, blog, LaurenLynneYA, BookNerdParadis, listbuilding, guestposting, motivation, success, emailmarketing, CRO]",MaryLovesBooks BookNerdParadis matt_hearnden success perspective life CodyBLister Marketing nick_eubanks contentmarketing promotionftw contentmarketing fredrivett RobWormley ContentMarketing LArtra BookNerdParadis adamjayc bloggingwizard GuestBloggingStrategy TamieDearen BookNerdParadis JRRTolkien KatyHuthJones BookNerdParadis LorilynRoberts BookNerdParadis _awtozer rahdieh BookNerdParadis TolkienQuote EliseKova BookNerdParadis startrailsIV BookNerdParadis FebruaryGrace BookNerdParadis rtmixmktg raecarson BookNerdParadis WillBluntAU ContentMarketing blog LaurenLynneYA BookNerdParadis listbuilding guestposting motivation success emailmarketing CRO
1737,"[nehanarkhede, ATO2016, erinscafe, MailChimp, rachaelmaddux, _raven_io, pwnela, GregU, PTC, Vuforia, salrelish, KyFaSt, SwiftOnSecurity, dcloues, jessfraz, solrac901, ApacheSpot, adickerson, ComfortablySmug, jdickerson, skamille, Michael_Tsunam1, samnesmith, bakins, JessicaMauerhan]","[nehanarkhede, ATO2016, erinscafe, MailChimp, ATO2016, rachaelmaddux, _raven_io, pwnela, GregU, PTC, Vuforia, ATO2016, nehanarkhede, erinscafe, salrelish, KyFaSt, nehanarkhede, MailChimp, ATO2016, SwiftOnSecurity, dcloues, nehanarkhede, jessfraz, erinscafe, nehanarkhede, solrac901, ApacheSpot, MailChimp, ATO2016, adickerson, ComfortablySmug, jdickerson, skamille, Michael_Tsunam1, samnesmith, bakins, JessicaMauerhan]",nehanarkhede ATO2016 erinscafe MailChimp ATO2016 rachaelmaddux _raven_io pwnela GregU PTC Vuforia ATO2016 nehanarkhede erinscafe salrelish KyFaSt nehanarkhede MailChimp ATO2016 SwiftOnSecurity dcloues nehanarkhede jessfraz erinscafe nehanarkhede solrac901 ApacheSpot MailChimp ATO2016 adickerson ComfortablySmug jdickerson skamille Michael_Tsunam1 samnesmith bakins JessicaMauerhan
2391,"[NathanFGao, bonaventuresoft, presserb, petshopboys, eldescanso, buzz, andymatic, mknepprath, dansinker, willsh, j3sse_pub, thelastwalt, tonx, MikeIsaac, fmanjoo, migurski, rtraister, atrubens, twitter, clearwriter, timbuckwalter, scarequotes, kathrynyu, summersumz, steveportigal, billder, robynkanner, hhavrilesky, TheCut, bjheinley, kowitz, yodamay, BenKennerly, michael, MaxTemkin, lmc, nczeitgeist, dansays, Annaleen, TimGunn, mnik]","[NathanFGao, bonaventuresoft, presserb, petshopboys, eldescanso, buzz, andymatic, petshopboys, mknepprath, dansinker, willsh, j3sse_pub, petshopboys, thelastwalt, tonx, MikeIsaac, fmanjoo, migurski, rtraister, atrubens, twitter, j3sse_pub, petshopboys, clearwriter, twitter, timbuckwalter, scarequotes, kathrynyu, summersumz, steveportigal, billder, NathanFGao, robynkanner, thelastwalt, clearwriter, hhavrilesky, TheCut, tonx, bjheinley, kowitz, yodamay, BenKennerly, michael, MaxTemkin, lmc, nczeitgeist, kathrynyu, dansays, tonx, Annaleen, TimGunn, atrubens, twitter, mnik]",NathanFGao bonaventuresoft presserb petshopboys eldescanso buzz andymatic petshopboys mknepprath dansinker willsh j3sse_pub petshopboys thelastwalt tonx MikeIsaac fmanjoo migurski rtraister atrubens twitter j3sse_pub petshopboys clearwriter twitter timbuckwalter scarequotes kathrynyu summersumz steveportigal billder NathanFGao robynkanner thelastwalt clearwriter hhavrilesky TheCut tonx bjheinley kowitz yodamay BenKennerly michael MaxTemkin lmc nczeitgeist kathrynyu dansays tonx Annaleen TimGunn atrubens twitter mnik
2426,"[MikeIsaac, thelancearthur, FFWglobal, SenFeinstein, shootatweet, NancyPelosi, abookapart, SenatorBoxer, GOP, united, designhawg, JoeGermuska, knightlab, Real_TJ_Thomas, gruber, parisvega, espiekermann, MikeHosier, ChappellTracker, drwave, beaucolburn, choad, halvorson, facebook, sjarvis, zeldman, natts, astronautpnguin, matt_timmons, austinkleon, de5igner, madebyfew, gunsonfacebookrightnow, romanmars, mat, sonia__harris, SherylCababa, dansinker, facebooks, jbsibley, blocktogether, ToddRoss, MxF2016]","[MikeIsaac, thelancearthur, FFWglobal, SenFeinstein, shootatweet, SenFeinstein, shootatweet, NancyPelosi, shootatweet, abookapart, SenatorBoxer, shootatweet, GOP, united, designhawg, JoeGermuska, knightlab, MikeIsaac, NancyPelosi, shootatweet, NancyPelosi, shootatweet, Real_TJ_Thomas, gruber, SenatorBoxer, shootatweet, parisvega, espiekermann, MikeHosier, ChappellTracker, drwave, beaucolburn, choad, halvorson, NancyPelosi, shootatweet, designhawg, SenFeinstein, shootatweet, SenatorBoxer, shootatweet, facebook, facebook, SenatorBoxer, shootatweet, sjarvis, GOP, united, NancyPelosi, shootatweet, zeldman, natts, astronautpnguin, facebook, matt_timmons, austinkleon, gruber, facebook, de5igner, madebyfew, gruber, facebook, gunsonfacebookrightnow, romanmars, SenatorBoxer, shootatweet, facebook, mat, sonia__harris, facebook, gunsonfacebookrightnow, SenFeinstein, shootatweet, SherylCababa, austinkleon, dansinker, SenFeinstein, shootatweet, facebooks, jbsibley, blocktogether, gruber, ToddRoss, MxF2016]",MikeIsaac thelancearthur FFWglobal SenFeinstein shootatweet SenFeinstein shootatweet NancyPelosi shootatweet abookapart SenatorBoxer shootatweet GOP united designhawg JoeGermuska knightlab MikeIsaac NancyPelosi shootatweet NancyPelosi shootatweet Real_TJ_Thomas gruber SenatorBoxer shootatweet parisvega espiekermann MikeHosier ChappellTracker drwave beaucolburn choad halvorson NancyPelosi shootatweet designhawg SenFeinstein shootatweet SenatorBoxer shootatweet facebook facebook SenatorBoxer shootatweet sjarvis GOP united NancyPelosi shootatweet zeldman natts astronautpnguin facebook matt_timmons austinkleon gruber facebook de5igner madebyfew gruber facebook gunsonfacebookrightnow romanmars SenatorBoxer shootatweet facebook mat sonia__harris facebook gunsonfacebookrightnow SenFeinstein shootatweet SherylCababa austinkleon dansinker SenFeinstein shootatweet facebooks jbsibley blocktogether gruber ToddRoss MxF2016
14763,"[mwhuss, zuhrisaifudin45, garrettdimon, treehouse, bencareynyt, kristina_basham, fall, nofilter, portland, beauty, wjgilmore, goruck, fitness, spartan, training, KatieMSmith, zapatoche, GORUCK, danielmall, EducateYourself, nikiforovalex18, AaronGustafson, davatron5000, hihellosm, codejake, effectiveness, productivity, timemanagement]","[mwhuss, zuhrisaifudin45, garrettdimon, treehouse, bencareynyt, kristina_basham, treehouse, fall, nofilter, portland, beauty, wjgilmore, goruck, fitness, spartan, training, KatieMSmith, zapatoche, GORUCK, danielmall, EducateYourself, nikiforovalex18, danielmall, AaronGustafson, davatron5000, hihellosm, codejake, wjgilmore, effectiveness, productivity, timemanagement]",mwhuss zuhrisaifudin45 garrettdimon treehouse bencareynyt kristina_basham treehouse fall nofilter portland beauty wjgilmore goruck fitness spartan training KatieMSmith zapatoche GORUCK danielmall EducateYourself nikiforovalex18 danielmall AaronGustafson davatron5000 hihellosm codejake wjgilmore effectiveness productivity timemanagement


In [10]:
users_active['handles_hashtags_stringified'] = users_active['handles_hashtags_all'].apply(lambda x: ' '.join(map(str, x)))

In [None]:
users_active['handles_hashtags_stringified']

In [11]:
''' each string in the list contains all the hashtags/handles for one user 
    (contains duplicates if they used a hashtag/handle more than once) '''
corpus = list(users_active['handles_hashtags_stringified'])

In [None]:
corpus

### hashtags/handles that have been used by at least 20 distinct users (min_df=20)

In [12]:
# constructs a dataframe with only the handles/hashtags used by 20 users 
vectorizer = sk_text.CountVectorizer(min_df=20, lowercase=True, encoding='ISO-8859-1')
matrix = vectorizer.fit_transform(corpus)
df = pd.DataFrame(matrix.toarray(), index=users_active.index, columns=vectorizer.get_feature_names())

In [15]:
df

Unnamed: 0_level_0,_altright_anew,_carja,_cfj_,_makada_,_proud_american,_realvalentina_,a_miller48,abbydphillip,abbymartin,abbymartinm,...,zachhaller,zaibatsunews,zaidjilani,zekejmiller,zerohedge,zhaabowekwe,zigmanfreud,zika,zimmermanrob,zip90210
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1644,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1737,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2391,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14763,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1026541,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1134681,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1201691,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1253141,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1329901,0,0,0,0,0,0,0,0,0,0,...,5,0,0,0,0,0,0,0,0,0


In [None]:
for r in users_active.index:
    print(r)

In [60]:
df.var(axis=1).mean()

1.1951974818713689

Clustering Part 2: Clustering Users to Clinton and Trump Followers

In [14]:
from sklearn.feature_selection import VarianceThreshold
import sklearn.preprocessing as sk_p
import sklearn.cluster as sk
temp = df.drop([col for col, val in df.sum().iteritems() if val > 4000], axis=1, inplace=False)

In [23]:
df.max(axis=0).sort_values(ascending=False).head(n=10)

download      3016
spotify       3016
news          2654
lgbt          2320
now           1717
nowplaying    1492
trump         1349
cosproject    1292
pjnet         1020
biz           993 
dtype: int64

In [17]:
#std_scaler = 
df_std = sk_p.normalize(df, norm='l2', axis=1)

select = VarianceThreshold(threshold=0.0002)
df_select = select.fit_transform(df_std)
#df_std = std_scaler.fit_transform(df)

#df_std = std_scaler.transform(df)

#min_max_scaler = preprocessing.MinMaxScaler()

#min_max_scaler.fit(X)

#X_minmax = min_max_scaler.transform(X)

#max_abs_scaler = preprocessing.MaxAbsScaler()
#X_maxabs = max_abs_scaler.fit_transform(X)

In [18]:
df_std.var(axis=1).mean()

0.00023835302983602955

In [19]:
df_select.shape

(17066, 1079)

In [24]:
kmeans = sk.KMeans(init='k-means++', n_clusters=2, n_init=20)  

means = kmeans.fit_predict(df_select)

print ('\n Cluster Centroids')
centroids = kmeans.cluster_centers_
print (centroids)

print ('\nCluster labels')
kmeans_labels = kmeans.labels_
print(kmeans_labels)


print('\n')
error = kmeans.inertia_       #SSE; Sum of squared distances of samples to their closest cluster center.
print ("The total error of the clustering is: ", error)


 Cluster Centroids
[[0.00567812 0.0012242  0.00078956 ... 0.04701323 0.0009099  0.00069385]
 [0.02004345 0.00123158 0.0054746  ... 0.03655021 0.00125211 0.00016135]]

Cluster labels
[0 0 0 ... 0 0 0]


The total error of the clustering is:  11627.092722698546


In [25]:
true_values = pd.read_table('.\data\clinton_trump_user_classes.txt', names=['user_id', 'label'])
true_values.set_index('user_id', inplace=True)

In [157]:
true_values.shape

(200000, 1)

In [26]:
t_values = df.join(true_values, how='inner')

In [27]:
t_labels = pd.DataFrame(t_values['label'], index = t_values.index)

In [28]:
t_values['label'].values

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [29]:
means

array([0, 0, 0, ..., 0, 0, 0])

In [30]:
users = df.index
labels = pd.DataFrame(kmeans_labels, index=users)
mean_labels_0 = labels[labels.iloc[:,0]==0]
mean_labels_1 = labels[labels.iloc[:, 0]==1]

In [31]:
mean_cluster_0 = df.drop(mean_labels_1.index)
mean_cluster_1 = df.drop(mean_labels_0.index)

In [33]:
#Clinton
mean_cluster_0.sum(axis=0).sort_values(ascending=False).head(n=30)

youtube            13132
trump              6308 
news               5546 
worldseries        5250 
pjnet              4100 
hillaryclinton     3769 
hillary            3546 
cubs               3504 
now                3458 
realdonaldtrump    3374 
quote              3348 
cosproject         3208 
spotify            3094 
download           3059 
cnn                2902 
maga               2745 
tcot               2711 
nowplaying         2661 
lgbt               2490 
halloween          2440 
ebay               2346 
wikileaks          2143 
imwithher          2141 
rallytogether      2136 
fbi                1905 
clinton            1865 
gop                1858 
business           1809 
indians            1793 
reinindc           1714 
dtype: int64

In [34]:
#Trump
mean_cluster_1.sum(axis=0).sort_values(ascending=False).head(n=30)

realdonaldtrump    21177
hillaryclinton     17852
foxnews            8353 
cnn                5426 
megynkelly         5317 
fbi                3518 
wikileaks          3208 
maga               3184 
newtgingrich       3052 
seanhannity        2868 
kellyannepolls     2588 
msnbc              2430 
trump              2398 
youtube            2386 
potus              2161 
gop                2091 
draintheswamp      1853 
abc                1662 
washingtonpost     1630 
imwithher          1619 
jasoninthehouse    1555 
cnnpolitics        1500 
nytimes            1472 
mike_pence         1396 
mitchellvii        1361 
speakerryan        1285 
wdfx2eu7           1157 
evan_mcmullin      1142 
hillary            1129 
joyannreid         1119 
dtype: int64

In [43]:
import sklearn.metrics as metrics

In [44]:
mean_c = metrics.confusion_matrix(t_values['label'].values, means, labels=[0, 1])

In [45]:
mean_c

array([[5348, 2327],
       [8047, 1344]], dtype=int64)

In [51]:
#mean_sum = mean_c.sum(axis=0)
#print(mean_sum, mean_c[0,1], mean_sum[0], mean_c[1,0], mean_sum[1])
#mean_percision = ((mean_c[1,0]/mean_sum[0])+(mean_c[0,1]/mean_sum[1]))/2
mean_percision = metrics.precision_score(t_values['label'].values, means, labels=[0,1])
mean_percision

0.3661127758104059

In [52]:
#mean_sum = [mean_c[0].sum(), mean_c[1].sum()]
#print(mean_sum, mean_c[1,0], mean_c[0,1])
#mean_recall = ((mean_c[1,0]/mean_sum[1])+(mean_c[0,1]/mean_sum[0]))/2
mean_recall = metrics.recall_score(t_values['label'], means, labels=[0,1])
mean_recall

0.1431157491214993

In [50]:
#mean_f_score = (2*mean_percision*mean_recall)/(mean_percision+mean_recall)
mean_f_score = metrics.f1_score(t_values['label'].values, means, labels=[0,1])
mean_f_score

0.2057877813504823

cluster 0 is Hilary while cluster 1 is Trump

In [41]:
ag = sk.AgglomerativeClustering(n_clusters = 2, linkage = 'ward')   

#complete: The maximum distances
#n_clusters: The number of clusters to find.

ag_sse = ag.fit_predict(df_select)
#ag_sse.labels_

In [53]:
ag_sse_labels = pd.DataFrame(ag_sse, index=users)
sse_labels_0 = ag_sse_labels[labels.iloc[:,0]==0]
sse_labels_1 = ag_sse_labels[labels.iloc[:, 0]==1] 

In [54]:
sse_cluster_0 = df.drop(sse_labels_1.index)
sse_cluster_1 = df.drop(sse_labels_0.index)

In [56]:
#Clinton
sse_cluster_0.max(axis=0).sort_values(ascending=False).head(n=30)

download        3016
spotify         3016
news            2654
lgbt            2320
now             1717
nowplaying      1492
trump           1349
cosproject      1292
pjnet           1020
biz             993 
vip             957 
music           897 
sales           758 
borisjohnson    721 
youtube         705 
florida         680 
leadership      667 
maine           629 
hillary         616 
us              579 
tcot            573 
markets         570 
gop             561 
watch           551 
clinton         544 
finance         535 
jewelry         523 
law             515 
nypost          506 
newyork         506 
dtype: int64

In [57]:
#Trump
sse_cluster_1.max(axis=0).sort_values(ascending=False).head(n=30)

gop                 527
evan_mcmullin       407
speakerryan         370
mittromney          364
megynkelly          281
shepnewsteam        273
danaperino          273
newtgingrich        232
hillaryclinton      218
realdonaldtrump     181
lorettalynch        177
braveheart_usa      152
ingrahamangle       151
maga                149
seanhannity         148
kellyannepolls      143
foxnews             129
cnn                 125
msnbc               123
samsteinhp          121
steph93065          106
therickwilson       102
peggy7172           100
wikileaks           97 
trump               95 
fbi                 93 
mike_pence          88 
guilty              87 
hillaryforprison    86 
sykescharlie        84 
dtype: int64

In [58]:
sse_c = metrics.confusion_matrix(t_values['label'].values, ag_sse, labels=[0, 1])

In [59]:
sse_c

array([[4258, 3417],
       [6601, 2790]], dtype=int64)

From this confusion matrix, cluster 0 represents Hilary followers while cluster 1 represents Trump followers

In [61]:
#sse_sum = mean_c.sum(axis=0)
#print(sse_sum, sse_c[0,1], sse_sum[0], sse_c[1,0], sse_sum[1])
#sse_percision = ((sse_c[1,0]/sse_sum[0])+(sse_c[0,1]/sse_sum[1]))/2
sse_precision = metrics.precision_score(t_values['label'].values, ag_sse, labels=[0,1])
sse_precision

0.4494925084581924

In [62]:
#sse_sum = [sse_c[0].sum(), sse_c[1].sum()]
#print(sse_sum, sse_c[1,0], sse_c[0,1])
#sse_recall = ((sse_c[1,0]/sse_sum[1])+(sse_c[0,1]/sse_sum[0]))/2
sse_recall = metrics.recall_score(t_values['label'].values, ag_sse, labels=[0,1])
sse_recall

0.2970929613459695

In [76]:
#sse_f_score = (2*sse_percision*sse_recall)/(sse_percision+sse_recall)
sse_f_score = metrics.f1_score(t_values['label'].values, ag_sse)
sse_f_score

0.35773817156045645

In [65]:
ag = sk.AgglomerativeClustering(n_clusters = 2, linkage = 'complete')
ag_max = ag.fit_predict(df_select)
#ag_max.labels_

In [66]:
ag_max_labels = pd.DataFrame(ag_max, index=users)
max_labels_0 = ag_max_labels[labels.iloc[:,0]==0]
max_labels_1 = ag_max_labels[labels.iloc[:, 0]==1] 

In [67]:
max_cluster_0 = df.drop(max_labels_1.index)
max_cluster_1 = df.drop(max_labels_0.index)

In [68]:
#Trump
max_cluster_0.max(axis=0).sort_values(ascending=False).head(n=30)

download        3016
spotify         3016
news            2654
lgbt            2320
now             1717
nowplaying      1492
trump           1349
cosproject      1292
pjnet           1020
biz             993 
vip             957 
music           897 
sales           758 
borisjohnson    721 
youtube         705 
florida         680 
leadership      667 
maine           629 
hillary         616 
us              579 
tcot            573 
markets         570 
gop             561 
watch           551 
clinton         544 
finance         535 
jewelry         523 
law             515 
nypost          506 
newyork         506 
dtype: int64

In [69]:
#Clinton
max_cluster_1.max(axis=0).sort_values(ascending=False).head(n=30)

gop                 527
evan_mcmullin       407
speakerryan         370
mittromney          364
megynkelly          281
shepnewsteam        273
danaperino          273
newtgingrich        232
hillaryclinton      218
realdonaldtrump     181
lorettalynch        177
braveheart_usa      152
ingrahamangle       151
maga                149
seanhannity         148
kellyannepolls      143
foxnews             129
cnn                 125
msnbc               123
samsteinhp          121
steph93065          106
therickwilson       102
peggy7172           100
wikileaks           97 
trump               95 
fbi                 93 
mike_pence          88 
guilty              87 
hillaryforprison    86 
sykescharlie        84 
dtype: int64

In [70]:
max_c = confusion_matrix(t_values['label'].values, ag_max, labels=[0, 1])

In [71]:
max_c

array([[3228, 4447],
       [3218, 6173]], dtype=int64)

cluster 0 is Trump and cluster 1 is Hilary

In [72]:
#max_sum = max_c.sum(axis=0)
#print(max_sum, max_c[0,0], max_sum[0], max_c[1,1], max_sum[1])
#max_percision = ((max_c[0,0]/max_sum[0])+(max_c[1,1]/max_sum[1]))/2
max_precision = metrics.precision_score(t_values['label'].values, ag_max, labels=[0,1])
max_precision

0.5812617702448211

In [73]:
#max_sum = [max_c[0].sum(), max_c[1].sum()]
#print(max_sum, max_c[0,0], max_c[1,1])
#max_recall = ((max_c[0,0]/max_sum[0])+(max_c[1,1]/sse_sum[1]))/2
max_recall = metrics.recall_score(t_values['label'].values, ag_max, labels=[0,1])
max_recall

0.6573314875945053

In [78]:
#max_f_score = (2*max_percision*max_recall)/(max_percision+max_recall)
max_f_score = metrics.f1_score(t_values['label'].values, ag_max)
max_f_score

0.6169606716306031

In [None]:
# same matrix just swapped rows and columns
df.transpose().head()