## CSUS - CSc 177-02 Data Warehousing and Data Mining - Project 1: Clustering  
### 2016 U.S. presidential election Twitter analysis  

**Group members: Aaron Enberg, Nima Sarrafzadeh, Kyne Liu**  
**Professor: Haiquan (Victor) Chen**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import (
    preprocessing,  
    cluster as sk_cluster,
    metrics as metrics
)
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import (
    cross_val_score,
    train_test_split,
    GridSearchCV
)
import sklearn.feature_extraction.text as sk_text
import gc

# Render matplotlib figures directly in the notebook output area.
%matplotlib inline
# Widen pandas' column display so long cell values are shown instead of truncated.
# NOTE(review): -1 is the "no limit" spelling of older pandas releases; newer
# releases expect None instead — confirm against the installed pandas version.
pd.set_option('display.max_colwidth', -1)

In [2]:
# Column names are supplied explicitly, in file order, because they are
# passed to the reader via `names=` rather than taken from the file.
column_names = [
    'name', 'screen_name', 'user_id',
    'followers_count', 'friends_count',
    'location', 'description', 'created_at',
    'status_id', 'language', 'place',
    'retweet_count', 'favorite_count', 'text',
]

# Load the raw tweet dump into a single DataFrame.
tweets = pd.read_table(
    'data/clinton_trump_tweets.txt',
    names=column_names,
    encoding='ISO-8859-1',
)

In [3]:
# Quick sanity check of the loaded frame: dimensions first, then per-column dtypes.
print(tweets.shape, tweets.dtypes, sep='\n')

(5250980, 14)
name               object
screen_name        object
user_id            int64 
followers_count    int64 
friends_count      int64 
location           object
description        object
created_at         object
status_id          int64 
language           object
place              object
retweet_count      int64 
favorite_count     int64 
text               object
dtype: object


## Preprocessing

In [4]:
# Everything except the author id and the tweet body is discarded here;
# the rest of the notebook only works with `user_id` and `text`.
unused_columns = [
    'name', 'screen_name',
    'followers_count', 'friends_count',
    'location', 'description', 'created_at',
    'status_id', 'language', 'place',
    'retweet_count', 'favorite_count',
]
tweets.drop(unused_columns, axis=1, inplace=True)
tweets.head()

Unnamed: 0,user_id,followers_count,friends_count,retweet_count,favorite_count,text
0,1519696717,132,263,0,1,@NWAJimmy I've read it now though brother. Was pretty spot on Lots of bright spots but a lot to work on. Exactly as an exhibition should be!
1,109945090,2154,2034,1937,0,RT @wikileaks: New poll puts Pirate Party on course to win Iceland's national elections on Saturday. https://t.co/edTqjeJaQ6
2,1450086582,797,1188,0,0,@gaystoner821 I think New Orleans spoiled me with food. I need to try and branch out in BR.
3,167177185,204,448,891,0,RT @LOLGOP: ACA needs fixes but know da facts: *70% can get covered in marketplaces for under $75/month *Hikes affect 3% *GOP will uninsu
4,1191022351,775,154,7,0,RT @FastCompany: Alphabet shares soar on better-than-expected earnings as mobile video strategy pays off https://t.co/bokbXngMJt https://t.


### Remove all retweets

In [5]:
# A retweet is any text that begins with "RT" followed by whitespace.
pattern = r'^RT\s'

# Flag retweets; rows whose text cannot be matched (missing values) are
# flagged as well so that they are filtered out together with the retweets.
is_retweet = tweets.text.str.match(pattern, na=True)
tweets = tweets[~is_retweet]

In [6]:
# (rows, columns) of `tweets` at this point in the notebook, i.e. after the retweet filter above.
tweets.shape

(2416818, 6)

### Extract all handles and hashtags

In [7]:
# match all hashtags and handles in a tweet, ignoring possible email addresses
pattern = r'(?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z_]+[A-Za-z0-9_]+)|(?<=^|(?<=[^a-zA-Z0-9-\.]))#([A-Za-z_]+[A-Za-z0-9_]+)'

# extractall returns a DataFrame with a MultiIndex:
#   - first level: the original tweet index
#   - second level ("match"): a running counter of the hashtag/handle
#     occurrences found within that particular tweet
# and one column per capture group (group 1 = handle, group 2 = hashtag).
handles_hashtags = tweets.text.str.extractall(pattern)
handles_hashtags.columns = ['handles', 'hashtags']

# Drop the per-tweet match counter so the frame is indexed by the original
# tweet index only. Doing this in one step also avoids `del index.name`,
# which newer pandas versions reject.
handles_hashtags = handles_hashtags.reset_index(level='match', drop=True)

# Stack handles and hashtags into one column. Each match fills exactly one of
# the two capture groups, so dropna() leaves one row per matched token.
handles_hashtags = (
    pd.concat([handles_hashtags.handles, handles_hashtags.hashtags])
    .dropna()
    .to_frame(name='handles_hashtags')
)

# One output row per (tweet, token); tweets without any token are dropped by
# the inner join. The tweet text itself is no longer needed afterwards.
tweets = tweets.join(handles_hashtags, how='inner').reset_index()
tweets.drop(['text'], axis=1, inplace=True)

# Release the intermediate frame before the next memory-heavy step.
del handles_hashtags
gc.collect()

119

### Keep active users with at least 20 distinct hashtags/handles

In [8]:
# For every user, gather the array of distinct hashtags/handles they have used.
handles_hashtags_distinct = (
    tweets.groupby('user_id')['handles_hashtags']
    .unique()
    .to_frame()
)

# A user counts as "active" when that array holds 20 or more entries.
distinct_counts = handles_hashtags_distinct.handles_hashtags.str.len()
users_active = handles_hashtags_distinct[distinct_counts >= 20]

# Free the full per-user frame; only the active subset is kept.
del handles_hashtags_distinct
gc.collect()

0

In [9]:
# For every user, build the list of ALL hashtags and handles they have used,
# duplicates included (each occurrence of a token shows up in the list).
# Grouping works directly on the `user_id` column, so no reset_index() copy of
# the full frame is needed first.
handles_hashtags_all = tweets.groupby('user_id')['handles_hashtags'].apply(list).to_frame()

# Both frames carry a `handles_hashtags` column; the suffixes turn them into
# `handles_hashtags_distinct` and `handles_hashtags_all`. The inner join keeps
# only the active users.
users_active = users_active.join(handles_hashtags_all, lsuffix='_distinct', rsuffix='_all', how='inner')

# Release the intermediate frame.
del handles_hashtags_all
gc.collect()

30

In [11]:
# Turn each user's token list into one space-separated string so that it can
# be fed to a text vectorizer as a "document".
users_active['handles_hashtags_stringified'] = [
    ' '.join(str(token) for token in tokens)
    for tokens in users_active['handles_hashtags_all']
]

# One document per active user, in the same order as users_active's rows.
corpus = users_active['handles_hashtags_stringified'].tolist()

### Hashtags/handles that have been used by at least 20 distinct users (min_df=20)

In [12]:
# Build a user x token count matrix. Only handles/hashtags used by at least
# 20 users are kept (min_df=20), and of those at most 100 are retained
# (max_features=100). Tokens are lower-cased before counting.
vectorizer = sk_text.CountVectorizer(min_df=20, max_features=100, lowercase=True, encoding='ISO-8859-1')
matrix = vectorizer.fit_transform(corpus)

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense frame: one row per active user, one column per retained token.
df_idx_id = pd.DataFrame(matrix.toarray(), index=users_active.index, columns=feature_names)

In [13]:
# Preview of the count matrix built above:
# rows are users, columns are hashtags/handles
# values are frequency of a handle/hashtag
df_idx_id.head()

Unnamed: 0_level_0,_altright_anew,_carja,_cfj_,_makada_,_proud_american,_realvalentina_,a_miller48,abbydphillip,abbymartin,abbymartinm,...,zaibatsunews,zaidjilani,zekejmiller,zerohedge,zhaabowekwe,zigmanfreud,zika,zimmermanrob,zip90210,retweet_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1644,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
1737,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
2391,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,18
2426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,199
14763,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9


In [14]:
# Represent each handle/hashtag as an integer: replace the token column names
# with 0..n-1. Relabelling the columns directly gives the same frame as the
# transpose / reset_index / transpose round trip, without materialising two
# transposed copies of the whole matrix.
train_df = df_idx_id.copy()
train_df.columns = pd.RangeIndex(train_df.shape[1])

# The token-labelled frame is no longer needed.
del df_idx_id
gc.collect()

55

In [15]:
# Read the per-user class labels and index them by user id so they can be
# joined against the feature matrix.
labels = pd.read_table(
    './data/clinton_trump_user_classes.txt',
    names=['user_id', 'label'],
).set_index('user_id')

# Keep only users present in both the label file and the feature matrix.
train_df = labels.join(train_df, how='inner')

# Split the target off: pop() returns the `label` column and removes it from
# train_df, leaving only feature columns behind.
y = train_df.pop('label')

del labels
gc.collect()

56

In [16]:
# Hold out 20% of the labelled users for final evaluation; the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_df, y, test_size=0.2, random_state=42
)

In [17]:
# Report the dimensions of each piece of the split, in the order
# X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(13652, 4036)
(3414, 4036)
(13652,)
(3414,)


In [18]:
# Standardise the features. The scaler's mean/variance are learned from the
# training split ONLY and then applied unchanged to the test split; re-fitting
# on the test data would scale the two splits with different statistics and
# leak test-set information into preprocessing.
std_scaler = preprocessing.StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

#### SVM Classification

In [None]:
# Hyper-parameters for the RBF-kernel support vector classifier, spelled out
# in full so the configuration is visible in one place.
svm_params = dict(
    C=1,
    kernel='rbf',
    degree=3,
    gamma='auto',
    coef0=0.0,
    shrinking=True,
    probability=False,
    tol=0.001,
    cache_size=200,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    decision_function_shape='ovr',
    random_state=42,
)
svm_clf = SVC(**svm_params)

# Train on the standardised training split.
svm_clf.fit(X_train_scaled, y_train)

In [None]:
# Predict the held-out users with the fitted SVM and report the F1 score
# (f1_score is called with its default averaging).
y_pred = svm_clf.predict(X_test_scaled)
svm_f1 = metrics.f1_score(y_test, y_pred)
print(svm_f1)

#### Decision Tree Classification

In [None]:
# Gini-based decision tree capped at 99 leaves, with a fixed seed for
# reproducibility. `min_impurity_split` and `presort` were previously passed
# at their default values; they are omitted because newer scikit-learn
# releases no longer accept them, and leaving them out does not change the
# model on releases that still do.
tree_clf = DecisionTreeClassifier(criterion='gini',
                                  splitter='best',
                                  max_depth=None,
                                  min_samples_split=2,
                                  min_samples_leaf=1,
                                  min_weight_fraction_leaf=0.0,
                                  max_features=None,
                                  random_state=42,
                                  max_leaf_nodes=99,
                                  min_impurity_decrease=0.0,
                                  class_weight=None)

# Train on the standardised training split.
tree_clf.fit(X_train_scaled, y_train)

In [None]:
# Predict the held-out users with the fitted tree and report the F1 score
# (f1_score is called with its default averaging).
y_pred = tree_clf.predict(X_test_scaled)
tree_f1 = metrics.f1_score(y_test, y_pred)
print(tree_f1)

#### kNN Classification with grid search for best k and 5-fold cross-validation

In [None]:
# Candidate neighbourhood sizes: k = 1 .. 19.
k_range = list(range(1, 20))
params = {'n_neighbors': k_range}

# Base estimator; its n_neighbors is overridden by each grid candidate.
knn = KNeighborsClassifier(n_neighbors=1)

# Exhaustive search over k with 5-fold cross-validation, scored by weighted
# F1, using all available cores.
knn_grid_search_cv = GridSearchCV(
    estimator=knn,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1,
    scoring='f1_weighted',
)

knn_grid_search_cv.fit(X_train_scaled, y_train)

In [None]:
# Mean cross-validated score for each candidate k, in the same order as
# k_range (the grid has a single parameter, so cv_results_ rows follow it).
# Defined here so the cell no longer depends on a `means` variable that is
# never created anywhere in the notebook.
means = knn_grid_search_cv.cv_results_['mean_test_score']

plt.plot(k_range, means)
plt.xlabel('Value of K for KNN')
plt.ylabel('F1 score based on Cross-Validation')
plt.show()

In [None]:
# Summary of the grid search: best cross-validated score, the k that achieved
# it, and the corresponding estimator.
print(
    knn_grid_search_cv.best_score_,
    knn_grid_search_cv.best_params_,
    knn_grid_search_cv.best_estimator_,
    sep='\n',
)

In [None]:
# Predict the held-out users through the fitted grid-search object and report
# the F1 score (f1_score is called with its default averaging).
y_pred = knn_grid_search_cv.predict(X_test_scaled)
knn_f1 = metrics.f1_score(y_test, y_pred)
print(knn_f1)