In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree    # Decision Tree
from sklearn.linear_model import LogisticRegression # LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC # Support Vector Machine 
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import sklearn.feature_extraction.text as sk_text



## File Prep ##

In [2]:
%%time
column_names=['Name', 'ScreenName', 'UserID', 'FollowersCount', 'FriendsCount', 'Location', 'Description', 'CreatedAt', 'StatusID', 'Language', 'Place', 'RetweetCount', 'FavoriteCount', 'Text']
tweets = pd.read_csv('clinton_trump_tweets.txt', sep="\t",  encoding="ISO-8859-1", header=None, names=column_names)
#print initial data
tweets = tweets.drop(['ScreenName', 'FollowersCount', 'FriendsCount', 'CreatedAt', 'StatusID','Language', 'Place', 'RetweetCount', 'FavoriteCount'], axis=1)
tweets.Location = tweets.Location.astype(str)
tweets.Text = tweets.Text.astype(str)

ground_truth = pd.read_table('clinton_trump_user_classes.txt', encoding ="ISO-8859-1", dtype=int, names=["UserID", "TrumpOrClinton"])



CPU times: user 38.9 s, sys: 2.47 s, total: 41.4 s
Wall time: 41.2 s


### Task 1.1 (10 pts): Remove all retweets first. Remove all users that have less than 20 tweets. You may want to keep the entire tweet content, including hashtags/handles. ###

In [3]:
tweets = tweets[~tweets.Text.str.startswith('RT')]

In [4]:
%%time
def keepHashMentions(text):
    hashMentions = []
    for word in text:
        hashMentions.extend(word)
    return " ".join(hashMentions).strip()
## keeping tweet including hashtags and mentions here ## 
tweets['HashMentions'] = tweets.Text.str.findall('(@\w+)|(#\w+)').apply(keepHashMentions)


CPU times: user 13.9 s, sys: 301 ms, total: 14.2 s
Wall time: 14 s


### Remove all users that have less than 20 tweets. ###

In [5]:
#1.2 keep tweets where UID appears 20+ times
tweets = tweets.groupby("UserID").filter(lambda x: len(x) >= 20)

In [6]:
%%timeit
# Generate a list of mention/hashes that have a frequency of 20+ #
top_hash = pd.Series(tweets['HashMentions'].str.cat(sep=' ').split()).value_counts()
top20 = top_hash[top_hash>=20]
top20List = top20.index.tolist()

# Convert list to set so it will have quick lookup
top20Set = set(top20List)


# generate list of mention/hashes that occur 20+ times from our orginal list 
def removeUnder20Mentions(hashMentions):
    mentions =  hashMentions.split()
    mentionsOver20 = []
    for mention in mentions:
        if(mention in top20Set):
            mentionsOver20.append(mention)
    return " ".join(mentionsOver20)
tweets['HashMentions'] = tweets.HashMentions.apply(removeUnder20Mentions)

2.29 s ± 59.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
hashMentionsMerged = tweets.groupby('UserID')['HashMentions'].apply(' '.join).reset_index()
tweetsWithNoHashMentions = tweets.drop('HashMentions', axis=1)
tweets = pd.merge(tweetsWithNoHashMentions, hashMentionsMerged, on='UserID')
tweets = tweets.drop_duplicates('UserID')


In [8]:
#remove users that have no hashtag/handles
tweets = tweets[tweets.HashMentions.apply(lambda x:  bool(x and x.strip()))]


In [9]:
# Remove locations that are null 
tweets['LocationCleaned'] = tweets['Location'].apply(lambda x: x.split(',')[0])
tweetsLocations = tweets[tweets['Location'].notnull()]
## manual cleanup of bad location data, fix later
tweetsLocations = tweetsLocations[tweetsLocations['LocationCleaned'] != 'NAN']
tweetsLocations = tweetsLocations[tweetsLocations['LocationCleaned'] != 'nan']
tweetsLocations = tweetsLocations[tweetsLocations['LocationCleaned'] != ' ']
tweetsLocations = tweetsLocations[tweetsLocations['LocationCleaned'] != '']

tweets = tweetsLocations[tweetsLocations['LocationCleaned'].isin(tweetsLocations['LocationCleaned'].value_counts().nlargest(150).index.tolist())]

# TODO: maybe try this later on with other locations
# tweetsLocations['LocationCleaned'] = tweetsLocations.LocationCleaned.map({'NYC':'New York','New York City':'New York','United States':'USA', 'United States of America': 'USA'})

In [10]:
aggregateTweetsHashtags = tweets.groupby('UserID')['HashMentions'].apply(lambda x: x.str.cat(sep=' '))
tweetsPrepareSKText = pd.DataFrame({'User_id': aggregateTweetsHashtags.index, 'All_hashtags': aggregateTweetsHashtags.values})
vectorizer = sk_text.TfidfVectorizer(max_features = 2000,
                             #min_df=100, 
                             #max_df=.8,
                             stop_words = ['TrumpOrClinton','userid','UserID','039','0hour1__','100percfedup','1a','1shawnster','2000shp','2a','8216','8217','8220']
                             )


In [11]:
matrix = vectorizer.fit_transform(tweetsPrepareSKText.All_hashtags.values)
tdidf = matrix.toarray()
df_text = pd.DataFrame(matrix.todense(), index=aggregateTweetsHashtags.index, columns=vectorizer.get_feature_names())
df_text.head()

  'stop_words.' % sorted(inconsistent))


Unnamed: 0_level_0,0hour1___,100ktweets4safety,1uvote,20committee,2351onthelist,2alaw,2xshhhh,4evertruther,4la_volpe,4xalerts,...,yoga,youngcons,youngdems4trump,younggun2016,youranonnews,youtube,yuengling_beer,yusufdfi,zekejmiller,zerohedge
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
164833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1026541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Apply ground truths column to UserID of tweets Dataframe ##

In [13]:
mergeGroundTruths = pd.merge(tweets, ground_truth, on = 'UserID')
#ground truth
dataFrameWithHashTagHandlesAndTruths = pd.merge(df_text,ground_truth, on = 'UserID')

## Implementing Logistic Regression on Location column, rest of models are below ##

In [15]:
inputsV = dataFrameWithHashTagHandlesAndTruths.drop(['UserID','TrumpOrClinton'], axis=1)

### 1.1: Use train_test_split() to split data into training and test sets, where 20 percent of the records go to test set. ###

In [16]:
X = inputsV
# grabbing one of the location arrays: USA
y = dataFrameWithHashTagHandlesAndTruths.TrumpOrClinton.values


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Task 1.2 (20 pts): Train Decision Tree, SVM, Logistic Regression, and Neural Networks #

## Decision Tree ##

In [17]:
## Decision tree boilerplate
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
y_pred = clf.predict(X_test)

print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred))

0.93762607498625
0.9373297002724795
0.9369390442124098
[[503  63]
 [ 29 873]]


## SVM ##

In [20]:
clf = SVC(C=1.0, gamma='auto')  # train your model here
clf.fit(X_train, y_train) 
y_pred = clf.predict(X_test)

print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred))

0.37753825479437814
0.614441416893733
0.46770139918830983
[[  0 566]
 [  0 902]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## LogisticRegression ##

In [21]:
# set a few optional parameter
# solver uses implementation of gradient descent 
logreg = LogisticRegression(solver='lbfgs') 
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)


print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred))

0.7475912801074617
0.7438692098092643
0.7285046100293432
[[275 291]
 [ 85 817]]


## Nueral Networks ##

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

scaler = StandardScaler()
scaler.fit(X_train)
X_train= scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Multilayer Perception Model: 30 30 30 is number of nerurons etc.
### Note: each feature does not have to have same number of neurons

In [23]:
### hidden_layer_sizes: 3 hidden layers,  each has 30 neurons
### solver='adam' is a variation of gradient descent 
### max_iter determines the number of epochs

mlp = MLPClassifier(hidden_layer_sizes=(100,100,100), solver='adam', max_iter=1000)

mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [25]:
print(metrics.classification_report(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]))


              precision    recall  f1-score   support

           0       0.62      0.63      0.63       566
           1       0.77      0.76      0.76       902

   micro avg       0.71      0.71      0.71      1468
   macro avg       0.70      0.70      0.70      1468
weighted avg       0.71      0.71      0.71      1468

0.7113631547078413
0.7111716621253406
0.7112660787394985
[[355 211]
 [213 689]]


# Task 1.3 (20 pts): Train k-NN model. In your report describe the features that you used for k-NN. Perform parameter tuning on k-NN model. Apply 5-fold cross validation and use grid search to find the best K value for k-NN model. Set scoring metric to F1 score (F-measure). Use the best K value identified from grid search to train your k-NN model. Plot the F1 score against K value based on the results you achieved from grid search. #

In [None]:
%%timeit
# define the parameter values that should be searched
k_range = list(range(10, 21))
# uniform: uniform weights. All points in each neighborhood are weighted equally

# distance: weight points by the inverse of their distance.# instantiate and fit the grid  
# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(n_neighbors=k_range)
knn = KNeighborsClassifier(n_neighbors=1)
grid = GridSearchCV(knn, param_grid, cv=10, scoring='f1_weighted')
grid.fit(X, y)
# view the complete results
means = grid.cv_results_['mean_test_score']

In [None]:
# plot the results
plt.plot(k_range, means)
plt.xlabel('Value of K for KNN')
plt.ylabel('F1 score based on Cross-Validation')
plt.show()

In [None]:
# identify the best model
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

In [None]:
# train your model using all data and the best known parameters
#TODO: use best n_neighbors from grid search
knn = KNeighborsClassifier(n_neighbors=13, weights='uniform')
knn.fit(X, y)
y_pred =  knn.predict(X_test)

print(metrics.precision_score(y_test, y_pred, average= 'weighted'))
print(metrics.recall_score(y_test, y_pred, average= 'weighted'))
print(metrics.f1_score(y_test, y_pred, average= 'weighted'))
print(metrics.confusion_matrix(y_test, y_pred))