In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import tree    # Decision Tree
from sklearn.svm import SVC # Support Vector Machine 
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier




## File Prep ##

In [None]:
%%time
column_names=['Name', 'ScreenName', 'UserID', 'FollowersCount', 'FriendsCount', 'Location', 'Description', 'CreatedAt', 'StatusID', 'Language', 'Place', 'RetweetCount', 'FavoriteCount', 'Text']
tweets = pd.read_csv('clinton_trump_tweets.txt', sep="\t",  encoding="ISO-8859-1", header=None, names=column_names)
#print initial data
tweets.drop(['ScreenName', 'FollowersCount', 'FriendsCount', 'CreatedAt', 'StatusID'], axis=1)
tweets.Location = tweets.Location.astype(str)
tweets.Text = tweets.Text.astype(str)

### Task 1.1 (10 pts): Remove all retweets first. Remove all users that have less than 20 tweets. You may want to keep the entire tweet content, including hashtags/handles. ###

In [None]:
# Drop retweets: every tweet whose text starts with 'RT'.
# .copy() makes the filtered result an independent frame, so adding columns to
# `tweets` in later cells does not raise SettingWithCopyWarning.
tweets = tweets[~tweets.Text.str.startswith('RT')].copy()

In [None]:
%%time
def keepHashMentions(text):
    """Join the hashtags/mentions found in one tweet into a single string.

    Parameters
    ----------
    text : iterable
        The matches produced by ``str.findall`` for one tweet. Each match is
        either a tuple of capture groups (unmatched groups are empty strings)
        or a plain string.

    Returns
    -------
    str
        The non-empty tokens in their original order, separated by exactly
        one space; an empty string when there are no tokens.
    """
    tokens = []
    for match in text:
        if isinstance(match, str):
            # A plain string is one token; extending with it would split it
            # into single characters.
            tokens.append(match)
        else:
            tokens.extend(match)
    # Skip the empty strings left by unmatched capture groups so they do not
    # turn into runs of spaces in the joined result.
    return " ".join(token for token in tokens if token)
## Collect the @mentions and #hashtags of every tweet into a 'HashMentions' column ##
# The pattern is a raw string: in a normal string literal '\w' is an invalid
# escape sequence, which Python reports as a warning. The regex is unchanged.
tweets['HashMentions'] = tweets.Text.str.findall(r'(@\w+)|(#\w+)').apply(keepHashMentions)


### Remove all users that have less than 20 tweets. ###

In [None]:
# Keep only the tweets of users (UserID) that have at least 20 tweets.
tweets_by_user = tweets.groupby(by="UserID")
tweets = tweets_by_user.filter(lambda user_rows: user_rows.shape[0] >= 20)

In [None]:
%%timeit
# Generate a list of mention/hashes that have a frequency of 20+ #
top_hash = pd.Series(tweets['HashMentions'].str.cat(sep=' ').split()).value_counts()
top20 = top_hash[top_hash>=20]
top20List = top20.index.tolist()

# Convert list to set so it will have quick lookup
top20Set = set(top20List)


# For one tweet, keep only the mentions/hashtags from its original list that occur 20+ times overall
def removeUnder20Mentions(hashMentions):
    """Return only the tokens of ``hashMentions`` that are in ``top20Set``.

    ``hashMentions`` is a whitespace-separated string of hashtags/mentions.
    The surviving tokens keep their order (duplicates included) and are
    joined with single spaces.
    """
    frequent_tokens = [token for token in hashMentions.split() if token in top20Set]
    return " ".join(frequent_tokens)
# Store, per tweet, only its frequently used hashtags/mentions, then preview the frame.
tweets['FrequencyOver20'] = tweets['HashMentions'].map(removeUnder20Mentions)
tweets.head(n=20)


## Apply ground truths column to UserID of tweets Dataframe ##

In [None]:
# Read the per-user labels: a tab-separated file without a header row whose two
# integer columns are named UserID and TrumpOrClinton here.
ground_truth = pd.read_csv(
    'clinton_trump_user_classes.txt',
    sep='\t',
    encoding ="ISO-8859-1",
    dtype=int,
    names=["UserID", "TrumpOrClinton"],
)
# Inner join on UserID: every tweet of a labelled user receives that user's label.
mergeGroundTruths = tweets.merge(ground_truth, on='UserID')

In [None]:
# Preview the first five merged rows.
mergeGroundTruths.head(n=5)

### 1.1: Use train_test_split() to split data into training and test sets, where 20 percent of the records go to test set. ###

In [None]:
# dummyDataFrame = pd.DataFrame({'top20List': top20List})
# tweets.HashMentions.str.get_dummies(sep=" ")
# pd.get_dummies(dummyDataFrame,prefix=['top20List'], drop_first=True)
# dummyDataFrame

# Testing models with dummy Iris data #

In [None]:
# Load the Iris data as a stand-in data set for trying out the models.
iris = load_iris()

# create X (features) and y (response)
# By convention the upper-case X is the input matrix, the lower-case y the output vector.
X = iris.data
y = iris.target

# print the shapes of X and y
print(X.shape)
print(y.shape)

# Split into training and test sets. The task statement asks for 20 percent of
# the records in the test set, hence test_size=0.2; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Task 1.2 (20 pts): Train Decision Tree, SVM, Logistic Regression, and Neural Networks #

## Decision Tree ##

In [None]:
## Decision tree
# Fit on the training split only: fitting on all of X, y would let the model
# see the test rows and make any evaluation on X_test meaningless.
# random_state makes the fitted tree reproducible between runs.
clf = tree.DecisionTreeClassifier(random_state=4)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Same weighted metrics as reported for the other models.
print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
y_pred


## SVM ##

In [None]:
# Train the support vector classifier on the training split and predict the test split.
clf = SVC(C=1.0, gamma='auto').fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Compare the true test labels (y_test) with the predictions (y_pred).
# average='weighted' is required because this is a multi-class problem.
for score_fn in (metrics.precision_score, metrics.recall_score, metrics.f1_score):
    print(score_fn(y_test, y_pred, average='weighted'))
### The three numbers printed above are precision, recall and F1 for the SVM

## LogisticRegression ##

In [None]:
from sklearn.linear_model import LogisticRegression # uses gradient descent 
from sklearn.model_selection import cross_val_score


# set a few optional parameter
# solver uses implementation of gradient descent \/
lr = LogisticRegression(solver='lbfgs', max_iter=200, multi_class='auto')      

print(cross_val_score(lr, X, y, cv=10, scoring='f1_weighted').mean())

# search for an optimal value of K for KNN
k_range = list(range(1, 31))
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='f1_weighted')
    k_scores.append(scores.mean())
print(k_scores)


### Plot the scores
import matplotlib.pyplot as plt
%matplotlib inline

# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('F1 Score based on Cross-Validation')
plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbour classifier on the training split, then predict the test split.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# The target is multi-class, so the metrics need average='weighted'.
for metric_name in ('precision_score', 'recall_score', 'f1_score'):
    print(getattr(metrics, metric_name)(y_test, y_pred, average='weighted'))

## Neural Networks ##

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits (X_train and X_test are overwritten).
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

### Multilayer Perceptron model: 30 30 30 is the number of neurons in each hidden layer.
### Note: each layer does not have to have the same number of neurons

In [None]:
### hidden_layer_sizes: 3 hidden layers, each has 30 neurons
### solver='adam' is a variation of (stochastic) gradient descent
### max_iter determines the maximum number of epochs
### random_state fixes the random weight initialisation so a re-run gives the same model

mlp = MLPClassifier(hidden_layer_sizes=(30,30,30), solver='adam', max_iter=1000, random_state=4)

mlp.fit(X_train, y_train)

In [None]:
## Predict the class of every test record with the fitted MLP.
## Don't forget to compare these predictions with the ground truth (y_test).

y_pred = mlp.predict(X_test)
y_pred

In [None]:
# Confusion Matrix
# IMPORTANT: first argument is true values, second argument is predicted values
# No `labels` argument is passed: a hard-coded labels=[0, 1] leaves every class
# other than 0 and 1 out of the matrix, and the target used here has three
# classes. By default all labels present in y_test / y_pred are used, in sorted order.
confusion = metrics.confusion_matrix(y_test, y_pred)

print(confusion)
print(metrics.classification_report(y_test, y_pred))


# Task 1.3 (20 pts): Train k-NN model. In your report describe the features that you used for k-NN. Perform parameter tuning on k-NN model. Apply 5-fold cross validation and use grid search to find the best K value for k-NN model. Set scoring metric to F1 score (F-measure). Use the best K value identified from grid search to train your k-NN model. Plot the F1 score against K value based on the results you achieved from grid search. #