In [None]:
import pandas as pd
import sklearn.feature_extraction.text as sk_text
import sklearn.cluster as sk_cluster
import sklearn.metrics as metrics
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# Show full cell contents instead of truncating long strings.
# `None` is the supported "no limit" value; the old sentinel -1 was deprecated
# in pandas 1.0 and is rejected by current releases. Releases older than 1.0
# only accept an integer, hence the fallback.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
import datetime
# Record and display the wall-clock time at which this run started.
startTime = datetime.datetime.now()
print(str(startTime))

In [None]:
# Load the cleaned tweets and lower-case the hashtag column in one step so the
# later grouping and vectorising treat differently-cased tags as the same token.
tweets = pd.read_csv('clean_data.csv').assign(
    FrequencyOver20=lambda frame: frame['FrequencyOver20'].str.lower()
)
# Preview the first rows of the loaded data.
tweets.head()

In [None]:
# Build one space-separated "document" of hashtags per user.
aggregateTweetsHashtags = (
    tweets
    .groupby('UserID')['FrequencyOver20']
    .apply(lambda tags: tags.str.cat(sep=' '))
)

# Same data as a two-column frame (User_id, All_hashtags) with a fresh integer index.
tweetsPrepareSKText = (
    aggregateTweetsHashtags
    .rename('All_hashtags')
    .rename_axis('User_id')
    .reset_index()
)

# Bag-of-words counter limited to the 100 most frequent terms, with English
# stop words removed; min_df=1 keeps every term that appears at least once.
vectorizer = sk_text.CountVectorizer(
    stop_words='english',
    min_df=1,
    max_features=100,
)


In [None]:
# Learn the vocabulary and build the user-by-term count matrix.
matrix = vectorizer.fit_transform(tweetsPrepareSKText.All_hashtags.values)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense, labelled view of the counts: one row per user, one column per term.
# toarray() yields a plain ndarray rather than the legacy np.matrix from todense().
df_text = pd.DataFrame(matrix.toarray(), index=aggregateTweetsHashtags.index, columns=feature_names)
df_text.head()

In [None]:
ground_cols = ['UserID', 'team']
# Column names are supplied here rather than read from the file. Everything is
# read as text first and UserID is converted afterwards so it can be matched
# against the integer user ids of the feature table.
ground_truth = pd.read_table('clinton_trump_user_classes.txt', encoding="ISO-8859-1", dtype=str, names=ground_cols)
ground_truth['UserID'] = ground_truth['UserID'].astype(int)

# Attach each user's team. Any 'team' column left over from a previous run of
# this cell is dropped first; otherwise re-running would produce team_x/team_y
# and the `df_text.team` lookup below would fail.
df_text = pd.merge(df_text.drop(columns='team', errors='ignore'), ground_truth, on='UserID')

# Ground-truth labels as an integer array.
true = df_text.team.astype(int).values

# The merge is an inner join: users without a ground-truth row are dropped and
# duplicated ids are repeated. Either case would leave `true` out of step with
# the rows that get clustered, so fail here with a clear message.
assert len(true) == matrix.shape[0], (
    "ground truth covers %d rows but %d users were vectorised"
    % (len(true), matrix.shape[0])
)

In [None]:
# Dense copy of the user-by-term matrix, used as input by the clustering cells.
# NOTE: despite the variable name, these are raw term counts - `vectorizer` is
# a CountVectorizer, so no TF-IDF weighting has been applied.
tdidf = matrix.toarray()

**K-means Clustering**

In [None]:
# Two clusters, 10 random restarts, at most 100 iterations per restart.
# random_state pins the initialisation so that the clustering - and the
# arbitrary numbering of the clusters - is the same on every run; without it
# the metric and top-term cells can change between executions.
kmeans = sk_cluster.KMeans(n_clusters=2, n_init=10, max_iter=100, random_state=42)

# Fit on the dense count matrix and keep the per-user cluster assignment.
km_labels = kmeans.fit_predict(tdidf)

print ('\n Cluster Centroids')
centroids = kmeans.cluster_centers_
print (centroids)

# inertia_ is the SSE: sum of squared distances of samples to their closest cluster center.
error = kmeans.inertia_
print ("\nThe total error of the clustering is: ", error)

In [None]:
# Cross-tabulate ground-truth team (rows) against K-means cluster id (columns),
# restricted to the label values 0 and 1.
# NOTE(review): K-means numbers its clusters arbitrarily, so cluster 0 need not
# correspond to team 0 - read both diagonals before judging the clustering.
metrics.confusion_matrix(true, kmeans.labels_, labels=[0, 1])

In [None]:
# Precision of the cluster ids treated as predicted team labels.
# average='weighted': precision is computed per class and the per-class values
# are averaged, weighted by the number of true samples in each class.
# NOTE(review): nothing visible here maps cluster ids onto team labels - confirm
# the numbering lines up, otherwise this score reflects a flipped labelling.
metrics.precision_score(true, kmeans.labels_, average='weighted')


In [None]:
# Recall of the cluster ids treated as predicted team labels.
# average='weighted': recall is computed per class and the per-class values
# are averaged, weighted by the number of true samples in each class.
# NOTE(review): nothing visible here maps cluster ids onto team labels - confirm
# the numbering lines up, otherwise this score reflects a flipped labelling.
metrics.recall_score(true, kmeans.labels_, average='weighted')


In [None]:
# F1 of the cluster ids treated as predicted team labels.
# average='weighted': F1 is computed per class and the per-class values
# are averaged, weighted by the number of true samples in each class.
# NOTE(review): nothing visible here maps cluster ids onto team labels - confirm
# the numbering lines up, otherwise this score reflects a flipped labelling.
metrics.f1_score(true, kmeans.labels_, average= 'weighted')


In [None]:
# Rank the feature indices within each centroid (one row per cluster).
# argsort along the last axis gives, per row, the indices from smallest to
# largest centroid weight.
asc_order_centroids = kmeans.cluster_centers_.argsort(axis=-1)

# Reverse every row so the heaviest features come first.
des_order_centroids = asc_order_centroids[:, ::-1]

In [None]:
# Vocabulary learned by the vectorizer, as a list of plain strings.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available (it returns an array, so convert to str to keep the printed list
# readable) and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = [str(term) for term in vectorizer.get_feature_names_out()]
else:
    terms = vectorizer.get_feature_names()
print ("All the terms:")
print(terms, '\n')

# For each cluster, list the terms from heaviest to lightest centroid weight.
# Iterating the rows of des_order_centroids follows however many clusters were
# fitted instead of assuming exactly two.
for i, ranked_indices in enumerate(des_order_centroids):
    print ("Cluster:", i)
    for ind in ranked_indices:
        print (terms[ind])
    print()

**Complete-linkage (MAX) Agglomerative Clustering**

In [None]:
# Agglomerative clustering into two groups using complete linkage, i.e. the
# distance between two clusters is the maximum distance between their members.
ag = sk_cluster.AgglomerativeClustering(n_clusters=2, linkage='complete')

# fit() returns the estimator itself; labels_ holds one cluster id per user.
ag_labels = ag.fit(tdidf).labels_

print ('\nPrinting cluster assignment:')
ag_labels

**Ward-linkage (SSE) Agglomerative Clustering**

In [None]:
# Agglomerative clustering with Ward linkage: at each step merge the pair of
# clusters that gives the smallest increase in within-cluster variance.
# n_clusters: the number of clusters to find.
# NOTE(review): three clusters are requested here while the other clustering
# cells use two - confirm this is intentional.
ag = sk_cluster.AgglomerativeClustering(n_clusters=3, linkage='ward')

# Fit on the dense count matrix, then read the per-user cluster ids.
ag.fit(tdidf)
ag_labels = ag.labels_

print ('\nPrinting cluster assignment:')
ag_labels

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Hierarchical clustering with complete ("MAX") linkage: the distance between
# two clusters is the largest distance between any pair of their members.
# NOTE: `ag` now holds SciPy's linkage matrix, not the scikit-learn estimator
# that the earlier cells stored under the same name.
ag = linkage(tdidf, method='complete')

# Draw on an explicit axes with a title and axis label so the figure stands on
# its own. Ending the cell with plt.show() instead of the bare dendrogram()
# call keeps its large return dictionary out of the cell output.
fig, ax = plt.subplots(figsize=(12, 6))
dendrogram(ag, ax=ax)
ax.set_title('Complete-linkage dendrogram of users')
ax.set_ylabel('Merge distance')
plt.show()