In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as sk_text
from sklearn import metrics
from sklearn import tree    # Decision Tree
from sklearn.linear_model import LogisticRegression # LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC # Support Vector Machine


## File Prep ##

In [None]:
%%time
column_names=['Name', 'ScreenName', 'UserID', 'FollowersCount', 'FriendsCount', 'Location', 'Description', 'CreatedAt', 'StatusID', 'Language', 'Place', 'RetweetCount', 'FavoriteCount', 'Text']
tweets = pd.read_csv('clinton_trump_tweets.txt', sep="\t",  encoding="ISO-8859-1", header=None, names=column_names)
#print initial data
tweets = tweets.drop(['ScreenName', 'FollowersCount', 'FriendsCount', 'CreatedAt', 'StatusID','Language', 'Place', 'RetweetCount', 'FavoriteCount'], axis=1)
tweets.Location = tweets.Location.astype(str)
tweets.Text = tweets.Text.astype(str)

ground_truth = pd.read_table('clinton_trump_user_classes.txt', encoding ="ISO-8859-1", dtype=int, names=["UserID", "TrumpOrClinton"])



### Task 1.1 (10 pts): Remove all retweets first. Remove all users that have less than 20 tweets. You may want to keep the entire tweet content, including hashtags/handles. ###

In [None]:
# Task 1.1: a tweet whose text begins with "RT" is treated as a retweet and removed.
is_retweet = tweets.Text.str.startswith('RT')
tweets = tweets.loc[~is_retweet]

In [None]:
# %%time
# def keepHashMentions(text):
#     hashMentions = []
#     for word in text:
#         hashMentions.extend(word)
#     return " ".join(hashMentions).strip()
# ## keeping tweet including hashtags and mentions here ## 
# tweets['HashMentions'] = tweets.Text.str.findall('(@\w+)|(#\w+)').apply(keepHashMentions)


### Remove all users that have less than 20 tweets. ###

In [None]:
# 1.2: keep only the tweets of users whose UserID appears 20 or more times.
# transform("size") broadcasts each user's tweet count back onto that user's rows,
# so the filter is a single vectorised boolean mask instead of one Python lambda
# call per user; it keeps the same rows in the same order as groupby().filter().
tweets_per_user = tweets.groupby("UserID")["UserID"].transform("size")
tweets = tweets[tweets_per_user >= 20]

In [None]:
# %%timeit
# # Generate a list of mention/hashes that have a frequency of 20+ #
# top_hash = pd.Series(tweets['HashMentions'].str.cat(sep=' ').split()).value_counts()
# top20 = top_hash[top_hash>=20]
# top20List = top20.index.tolist()

# # Convert list to set so it will have quick lookup
# top20Set = set(top20List)


# # generate list of mention/hashes that occur 20+ times from our orginal list 
# def removeUnder20Mentions(hashMentions):
#     mentions =  hashMentions.split()
#     mentionsOver20 = []
#     for mention in mentions:
#         if(mention in top20Set):
#             mentionsOver20.append(mention)
#     return " ".join(mentionsOver20)
# tweets['HashMentions'] = tweets.HashMentions.apply(removeUnder20Mentions)

## Create new df merging hashmentions and UserID  ##

In [None]:
# hashMentionsMerged = tweets.groupby('UserID')['Text'].apply(' '.join).reset_index()
# tweetsWithNoHashMentions = tweets.drop('Text', axis=1)
# tweets = pd.merge(tweetsWithNoHashMentions, hashMentionsMerged, on='UserID')
# tweets = tweets.drop_duplicates('UserID')


### Remove users that have no hashtags/handles ###

In [None]:
#tweets = tweets[tweets.Text.apply(lambda x:  bool(x and x.strip()))]
#print(tweets.shape)

In [None]:
# (rows, columns) of the tweets frame after the retweet and 20-tweet filters above.
tweets.shape

In [None]:
# Preview the filtered tweets.
# This cell previously called .head() on dataFrameWithHashTagHandlesAndTruths, which is
# only created by a later cell, so it raised NameError on a fresh Restart & Run All.
tweets.head()

In [None]:
%%time
##NEW
tweets['InputData'] =  tweets['Location']+" "+tweets['Description'] +" "+tweets['Text']
aggregateTweetsHashtags = tweets.groupby('UserID')['InputData'].apply(lambda x: x.str.cat(sep=' '))
tweetsPrepareSKText = pd.DataFrame({'User_id': aggregateTweetsHashtags.index, 'All_hashtags': aggregateTweetsHashtags.values})

vectorizerInput = sk_text.TfidfVectorizer(max_features = 50000,
                             #min_df=.001, 
                             #max_df=.99,
                             stop_words = ['UserID','10','100','11','12','14','15','16','17','18','19','2016',"00","000","000s","001","007","00am","00pm","01","02"]
                            )

                             
matrix = vectorizerInput.fit_transform(tweetsPrepareSKText.All_hashtags.values)
tdidf = matrix.toarray()
df_text = pd.DataFrame(matrix.todense(), index=aggregateTweetsHashtags.index, columns=vectorizerInput.get_feature_names())

dataFrameWithHashTagHandlesAndTruths = pd.merge(df_text,ground_truth, on = 'UserID')


In [None]:
# Number of TF-IDF features actually learned (at most max_features).
# vocabulary_ maps each term to its column index, so its length is the feature count;
# unlike get_feature_names(), which was removed in scikit-learn 1.2, it exists in every version.
len(vectorizerInput.vocabulary_)

In [None]:
# Remove locations that are null 
#tweets['LocationCleaned'] = tweets['Location'].apply(lambda x: x.split(',')[0])

# tweetsLocations = tweets[tweets['Location'].notnull() & tweets['Description'].notnull() & tweets['HashMentions'].notnull()]

# tweetsLocations = tweetsLocations[(tweetsLocations['Location'] != 'nan') & (tweetsLocations['Location'] != ' ') & (tweetsLocations['Location'] != '')]
# print(tweets.shape)
# tweets = tweetsLocations[tweetsLocations['Location'].isin(tweetsLocations['Location'].value_counts().nlargest(300).index.tolist())]
# tweets['Location'] = tweetsLocations['Location'].map({'NYC':'New York City','New York':'New York City', 'NY':'New York City', 'SF':'San Francisco', 'Hollywood':'Los Angeles'}).fillna(tweetsLocations['Location'])

# tweets['Location'].value_counts()

In [None]:
# aggregateTweetsHashtags = tweets.groupby('UserID')['HashMentions'].apply(lambda x: x.str.cat(sep=' '))
# tweetsPrepareSKText = pd.DataFrame({'User_id': aggregateTweetsHashtags.index, 'All_hashtags': aggregateTweetsHashtags.values})
# vectorizer = sk_text.TfidfVectorizer(max_features = 1000,
#                              #min_df=100, 
#                              #max_df=.8,
#                              stop_words = ['UserID']
#                              )
# matrix = vectorizer.fit_transform(tweetsPrepareSKText.All_hashtags.values)
# tdidf = matrix.toarray()
# df_text = pd.DataFrame(matrix.todense(), index=aggregateTweetsHashtags.index, columns=vectorizer.get_feature_names())
# df_text.head()

### Vectorize Description ###

In [None]:
# vectorizerDescription = sk_text.TfidfVectorizer(max_features = 500,
#                              #min_df=100, 
#                              #max_df=.8,
#                              stop_words = ['UserID','10','100','11','12','14','15','16','17','18','19','2016']
#                             )

# matrixDescription = vectorizerDescription.fit_transform(tweets.Description.values)
# tdidfDescription = matrixDescription.toarray()
# df_text_Description = pd.DataFrame(matrixDescription.todense(), index=tweets.UserID, columns=vectorizerDescription.get_feature_names())
# df_text_Description.head()

### Vectorize Location ###

In [None]:
# vectorizerLocation = sk_text.TfidfVectorizer(#max_features = 1000,
#                              #min_df=100, 
#                              #max_df=.8,
#                              stop_words = ['UserID']                
#                             )
# matrixLocation = vectorizerLocation.fit_transform(tweets.Location.values)
# tdidfLocation = matrixDescription.toarray()
# df_text_Location = pd.DataFrame(matrixLocation.todense(), index=tweets.UserID, columns=vectorizerLocation.get_feature_names())
# df_text_Location.head()



## Apply ground truths column to UserID of tweets Dataframe ##

In [None]:
# #mergeGroundTruths = pd.merge(tweets, ground_truth, on = 'UserID')
# #ground truth
# dataFrameWithHashTagHandlesAndTruths = pd.merge(df_text,ground_truth, on = 'UserID')


# ##merge Description and Location into out dataframe 
# dataFrameWithHashTagHandlesAndTruths = pd.merge(dataFrameWithHashTagHandlesAndTruths,df_text_Description,on = 'UserID')

# dataFrameWithHashTagHandlesAndTruths = pd.merge(dataFrameWithHashTagHandlesAndTruths,df_text_Location,on = 'UserID')

## Implementing Logistic Regression on Location column, rest of models are below ##

### 1.1: Use train_test_split() to split data into training and test sets, where 20 percent of the records go to test set. ###

In [None]:
# Features: every column of the merged frame except the user id and the label.
X = dataFrameWithHashTagHandlesAndTruths.drop(columns=['UserID', 'TrumpOrClinton'])
# Target: the TrumpOrClinton class of each user, as a plain array.
y = dataFrameWithHashTagHandlesAndTruths['TrumpOrClinton'].values

# Hold out 20 percent of the users for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

# Task 1.2 (20 pts): Train Decision Tree, SVM, Logistic Regression, and Neural Networks #

## Decision Tree ##

In [None]:
## Decision tree baseline
# random_state is fixed (same seed as the train/test split) because the tree permutes the
# candidate features at every split; without it the scores change from run to run.
clf = tree.DecisionTreeClassifier(random_state=4)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Class-weighted precision, recall and F1 on the held-out set, then the confusion matrix.
print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred))

## SVM ##

In [None]:
# Support vector classifier with C=1.0 and gamma='auto'; the kernel is left at SVC's default.
clf = SVC(gamma='auto', C=1.0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Class-weighted precision, recall and F1 (in that order), then the confusion matrix.
for score in (metrics.precision_score, metrics.recall_score, metrics.f1_score):
    print(score(y_test, y_pred, average='weighted'))
print(metrics.confusion_matrix(y_test, y_pred))

## LogisticRegression ##

In [None]:
# Logistic regression fitted with the L-BFGS solver on the training split.
logreg = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Every score below is the class-weighted average.
weighted = {'average': 'weighted'}
print(metrics.precision_score(y_test, y_pred, **weighted))
print(metrics.recall_score(y_test, y_pred, **weighted))
print(metrics.f1_score(y_test, y_pred, **weighted))
print(metrics.confusion_matrix(y_test, y_pred))

In [None]:
# Standardise the features for the cells that follow.
# StandardScaler and MLPClassifier are imported in the notebook's import cell rather than
# here, so all dependencies are declared in one place.
#
# The scaler is fitted on the training split only and then applied to both splits, so no
# statistics from the test set leak into the transformation.
scaler = StandardScaler().fit(X_train)

# NOTE: X_train / X_test are overwritten with their scaled versions. Every cell executed
# after this one (including an earlier model cell that is re-run later) sees scaled data.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Neural Networks ##

In [None]:
# Multi-layer perceptron: three hidden layers of 1000 units, Adam solver, at most 1000 iterations.
# random_state is fixed (same seed as the train/test split) so the weight initialisation,
# and therefore the reported scores, are reproducible.
mlp = MLPClassifier(hidden_layer_sizes=(1000,1000,1000), solver='adam', max_iter=1000, random_state=4)

mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

# Per-class report first, then the class-weighted summary scores and the confusion matrix.
print(metrics.classification_report(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
# labels=[0, 1] pins the row/column order of the matrix to class 0 then class 1.
print(metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]))


# Task 1.3 (20 pts): Train k-NN model. In your report describe the features that you used for k-NN. Perform parameter tuning on k-NN model. Apply 5-fold cross validation and use grid search to find the best K value for k-NN model. Set scoring metric to F1 score (F-measure). Use the best K value identified from grid search to train your k-NN model. Plot the F1 score against K value based on the results you achieved from grid search. #

In [None]:
%%time
param_grid = dict(n_neighbors=range(1,21))
knn = KNeighborsClassifier(n_neighbors=1)
grid = GridSearchCV(knn, param_grid, cv=5, scoring='f1_weighted',n_jobs=2)
grid.fit(X_train, y_train)
# view the complete results
means = grid.cv_results_['mean_test_score']

In [None]:
# Plot the mean cross-validated F1 score against each K evaluated by the grid search.
# matplotlib.pyplot is imported (as plt) in the notebook's import cell.
# The x values are read from param_grid rather than a second hard-coded range(1, 21),
# so the axis cannot drift out of sync with the grid that produced `means`.
k_values = list(param_grid['n_neighbors'])

fig, ax = plt.subplots()
ax.plot(k_values, means, marker='o')
ax.set_xticks(k_values)  # K is an integer: one tick per candidate
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('F1 score based on Cross-Validation')
ax.set_title('k-NN: cross-validated F1 score vs. K')
plt.show()

In [None]:
# Report the winning configuration: best mean CV score, the parameters that achieved it,
# and the corresponding estimator (one item per line).
print(grid.best_score_, grid.best_params_, grid.best_estimator_, sep='\n')

In [None]:
%%time
# train your model using all data and the best known parameters
#TODO: use best n_neighbors from grid search
knn = KNeighborsClassifier(n_neighbors=1, weights='uniform')
knn.fit(X_train, y_train)
y_pred =  knn.predict(X_test)

print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred))