## CSUS - CSc 177-02 Data Warehousing and Data Mining - Final Project:   
### Spotify daily top-200 chart analysis  

**Group members: Aaron Enberg, Nima Sarrafzadeh, Kyne Liu**  
**Professor: Haiquan (Victor) Chen**

In [1]:
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import (
    preprocessing,  
    cluster as sk_cluster,
    metrics
)
from sklearn.metrics import classification_report
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import (
    RandomUnderSampler, 
    NearMiss, 
    EditedNearestNeighbours, 
    RepeatedEditedNearestNeighbours,
    CondensedNearestNeighbour,
    NeighbourhoodCleaningRule,
    OneSidedSelection,
    AllKNN,
    TomekLinks
)
from imblearn.combine import SMOTEENN
from imblearn.ensemble import EasyEnsemble, BalanceCascade
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import (
    cross_val_score,
    train_test_split,
    GridSearchCV,
    StratifiedKFold
)
import sklearn.feature_extraction.text as sk_text
import gc

%matplotlib inline

# Do not truncate long cell contents when frames are displayed (-1 is the legacy "no limit" value).
# NOTE(review): newer pandas releases expect None instead of -1 — confirm against the pinned version.
pd.set_option('display.max_colwidth', -1)

In [2]:
# Read the daily chart export and replace its header with seven snake_case column names.
chart_columns = [
    'position', 'track_name', 'artist', 'streams',
    'url', 'date', 'region',
]
daily_spotify = pd.read_csv('./data/data.csv')
daily_spotify.columns = chart_columns

In [3]:
# Row/column count followed by the dtype of every column.
print(daily_spotify.shape, daily_spotify.dtypes, sep='\n')

(3441197, 7)
position      int64 
track_name    object
artist        object
streams       int64 
url           object
date          object
region        object
dtype: object


In [4]:
# Preview the first rows of the raw chart data.
daily_spotify.head()

Unnamed: 0,position,track_name,artist,streams,url,date,region
0,1,Reggaetón Lento (Bailemos),CNCO,19272,https://open.spotify.com/track/3AEZUABDXNtecAOSC1qTfo,2017-01-01,ec
1,2,Chantaje,Shakira,19270,https://open.spotify.com/track/6mICuAdrwEjh6Y6lroV2Kg,2017-01-01,ec
2,3,Otra Vez (feat. J Balvin),Zion & Lennox,15761,https://open.spotify.com/track/3QwBODjSEzelZyVjxPOHdq,2017-01-01,ec
3,4,Vente Pa' Ca,Ricky Martin,14954,https://open.spotify.com/track/7DM4BPaS7uofFul3ywMe46,2017-01-01,ec
4,5,Safari,J Balvin,14269,https://open.spotify.com/track/6rQSrBHf7HlZjtcMZ4S4bO,2017-01-01,ec


In [5]:
# List the distinct region codes present in the data.
daily_spotify['region'].unique()

array(['ec', 'fr', 'ar', 'fi', 'no', 'it', 'lt', 'ph', 'tw', 'nz', 'ee',
       'tr', 'us', 'sv', 'cr', 'de', 'cl', 'jp', 'br', 'hn', 'gt', 'ch',
       'hu', 'ca', 'pe', 'be', 'my', 'dk', 'bo', 'pl', 'at', 'pt', 'se',
       'mx', 'pa', 'uy', 'is', 'es', 'cz', 'ie', 'nl', 'sk', 'co', 'sg',
       'id', 'do', 'lu', 'gb', 'global', 'py', 'au', 'lv', 'gr', 'hk'],
      dtype=object)

In [6]:
# Sum streams for every (track, artist, region, date, position) combination.
# The grouping keys stay as ordinary columns because as_index=False.
stream_group_keys = ['track_name', 'artist', 'region', 'date', 'position']
df_streams_by_country = (
    daily_spotify
    .groupby(stream_group_keys, as_index=False)['streams']
    .sum()
)

In [7]:
# Preview the first ten aggregated rows.
df_streams_by_country.head(n=10)

Unnamed: 0,track_name,artist,region,date,position,streams
0,"""All That Is or Ever Was or Ever Will Be""",Alan Silvestri,pl,2017-01-08,185,3547
1,"""All That Is or Ever Was or Ever Will Be""",Alan Silvestri,tr,2017-01-08,198,3764
2,"""Read All About It, Pt. III""",Emeli Sandé,be,2017-10-09,186,3075
3,"""Read All About It, Pt. III""",Emeli Sandé,be,2017-10-15,192,3053
4,"""Read All About It, Pt. III""",Emeli Sandé,be,2017-10-16,147,3330
5,"""Read All About It, Pt. III""",Emeli Sandé,be,2017-10-22,182,3179
6,"""Read All About It, Pt. III""",Emeli Sandé,be,2017-10-23,159,3334
7,"""Read All About It, Pt. III""",Emeli Sandé,cz,2017-10-09,184,1338
8,"""Read All About It, Pt. III""",Emeli Sandé,cz,2017-10-15,190,1222
9,"""Read All About It, Pt. III""",Emeli Sandé,cz,2017-10-16,163,1527


In [8]:
# The first day a song hits the top 200, which countries' charts does it appear on?
# Keep only the first row seen for each (track, artist, region) triple.
song_region_key = ['track_name', 'artist', 'region']
df_country_initial_appear = df_streams_by_country.drop_duplicates(
    subset=song_region_key,
    keep='first',
)

In [9]:
# Preview the first-appearance rows.
df_country_initial_appear.head()

Unnamed: 0,track_name,artist,region,date,position,streams
0,"""All That Is or Ever Was or Ever Will Be""",Alan Silvestri,pl,2017-01-08,185,3547
1,"""All That Is or Ever Was or Ever Will Be""",Alan Silvestri,tr,2017-01-08,198,3764
2,"""Read All About It, Pt. III""",Emeli Sandé,be,2017-10-09,186,3075
7,"""Read All About It, Pt. III""",Emeli Sandé,cz,2017-10-09,184,1338
12,"""Read All About It, Pt. III""",Emeli Sandé,dk,2017-02-12,188,6967


In [29]:
# Earliest chart date per (track, artist, region), stored as a one-column frame
# indexed by those three keys.
earliest_dates = df_country_initial_appear.groupby(
    ['track_name', 'artist', 'region']
)['date'].min()
df_country_initial_appear = earliest_dates.to_frame()

In [44]:
# NOTE(review): this only builds a GroupBy object and shows its repr — nothing is
# aggregated or assigned, so later cells are unaffected by it.
# The traceback recorded under this cell mentions `.to_frame`, which this line does
# not call; presumably the output is stale — confirm and re-run or remove the cell.
df_country_initial_appear.reset_index().groupby(['track_name', 'artist', 'date'])

AttributeError: 'DataFrameGroupBy' object has no attribute 'to_frame'

In [None]:
# make the regions features: move the `region` index level into the columns,
# filling combinations that have no date with 0.
# unstack() returns a new frame, so the result has to be assigned back; without
# the assignment this cell had no effect and the frame stayed in long format.
df_country_initial_appear = (
    df_country_initial_appear
    .unstack(['region'], fill_value=0)
    .reset_index()
)

In [None]:
# Preview the first-appearance frame after the previous step.
df_country_initial_appear.head()

In [None]:
# instead of actual dates the song first appeared, just mark with 0 or 1
# Positions where the `date` comparison is True (value is not 0) are overwritten with 1.
# NOTE(review): the boolean mask is applied to the whole frame, not just `date` —
# confirm no other column is meant to survive at the selected positions.
df_country_initial_appear[df_country_initial_appear['date'] != 0] = 1

In [None]:
# One row per (track, artist), one column per region, holding the summed streams
# (0 where the song has no rows for that region).
df = df_streams_by_country.pivot_table(
    values='streams',
    index=['track_name', 'artist'],
    columns='region',
    aggfunc=sum,
    fill_value=0,
)

In [None]:
# Turn the two index levels (track_name, artist) back into ordinary columns.
df = df.reset_index(level=[0, 1])

In [None]:
# Single label per song: "<track_name> <artist>", joined with a space.
df['track_name_by_artist'] = df['track_name'].str.cat(df['artist'], sep=' ')

In [None]:
# The separate name columns are no longer needed once the combined label exists.
df = df.drop(['track_name', 'artist'], axis='columns')

In [None]:
# Flag each chart row: 1 when its position is 10 or better, otherwise 0.
is_top10 = df_streams_by_country['position'] <= 10
df_streams_by_country['top10'] = is_top10.astype(int)

In [None]:
# Count chart rows by top-10 flag.
df_streams_by_country['top10'].value_counts()

In [None]:
# At most one row per (track, artist, flag value): a song keeps a 0-row and/or a 1-row.
df_top10 = df_streams_by_country.drop_duplicates(
    subset=['track_name', 'artist', 'top10'],
)

In [None]:
# Order by song, with the flag ascending so that a song's 1-row (if any) comes last.
df_top10 = df_top10.sort_values(by=['track_name', 'artist', 'top10'])

In [None]:
# Collapse to one row per song, keeping the last one encountered.
df_top10 = df_top10.drop_duplicates(
    subset=['track_name', 'artist'],
    keep='last',
)

In [None]:
# Count songs by top-10 flag after reducing to one row per (track, artist).
df_top10['top10'].value_counts()

In [None]:
# Discard the old row labels and every column except the `top10` flag.
unused_columns = ['track_name', 'artist', 'region', 'position', 'date', 'streams']
df_top10 = df_top10.reset_index(level=0, drop=True)
df_top10 = df_top10.drop(unused_columns, axis=1)

In [None]:
# Preview the label frame.
df_top10.head()

In [None]:
# Preview the per-region stream totals.
df.head()

In [None]:
# Attach the first-appearance `date` data, with its labels prefixed `appear_`, to the
# stream totals, keeping only index labels present on both sides; then index the
# result by the combined song label.
# NOTE(review): the join matches on index labels — presumably both frames carry the
# same positional index in the same song order at this point; verify before relying on df_2.
df_2 = df.join(df_country_initial_appear.date.add_prefix('appear_'), how='inner')
df_2.set_index('track_name_by_artist', inplace=True)

In [None]:
# Preview the joined frame.
df_2.head()

In [None]:
# Use the combined song label as the row index so only numeric region columns remain.
df = df.set_index('track_name_by_artist')

In [None]:
# Preview the feature matrix.
df.head()

In [None]:
# Target vector: the per-song top-10 flag.
# NOTE(review): `y` and `df` carry different index labels, so they are paired by row
# position when split together — presumably both are in the same song order; verify.
y = df_top10['top10']

In [None]:
# Class balance of the target.
y.value_counts()

In [None]:
# Hold out 20% of the songs for testing with a fixed seed.
# stratify=y keeps the class proportions of the target the same in both splits,
# so the rarer class cannot end up under-represented in the test set by chance.
X_train, X_test, y_train, y_test = train_test_split(df, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)

In [None]:
# Shapes of the four split pieces, in the order train/test features, train/test labels.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
# Standardize the features using statistics learned from the training split only.
std_scaler = preprocessing.StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
# transform, not fit_transform: refitting on the test split would scale it with its
# own mean/variance, so train and test features would no longer be on the same scale.
X_test_scaled = std_scaler.transform(X_test)

In [None]:
# Pipeline: PCA to two components -> AllKNN under-sampling -> grid-searched k-NN.
pca = PCA(n_components=2)

# Sampler used in the pipeline. Alternatives left disabled in this cell:
# EditedNearestNeighbours, RepeatedEditedNearestNeighbours, OneSidedSelection,
# NeighbourhoodCleaningRule, NearMiss, RandomUnderSampler.
ak = AllKNN(ratio='auto', kind_sel='mode', n_neighbors=3)

# Candidate neighbourhood sizes for the search: k = 1 .. 19.
k_range = list(range(1, 20))
params = {'n_neighbors': k_range}

# weights='uniform': every neighbour gets an equal vote here
# (weights='distance' would instead weight points by the inverse of their distance).
# p=1 with the minkowski metric is Manhattan distance.
knn_settings = dict(
    n_neighbors=1, weights='uniform',
    algorithm='auto', leaf_size=30,
    p=1, metric='minkowski',
    metric_params=None, n_jobs=1,
)
knn = KNeighborsClassifier(**knn_settings)

knn_grid_search_cv = GridSearchCV(
    knn,
    params,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# One transformer (PCA) and one sampler (AllKNN) ahead of the grid search.
pipeline = make_pipeline(pca, ak, knn_grid_search_cv)
pipeline.fit(X_train_scaled, y_train)

y_pred = pipeline.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

In [None]:
# Base-2 logarithmic grids: 11 values each for C (2^-5 .. 2^15) and gamma (2^-15 .. 2^3).
Cs = np.logspace(-5, 15, num=11, base=2.0)
gammas = np.logspace(-15, 3, num=11, base=2.0)
for grid_label, grid_values in (("Tested Cs", Cs), ("Tested gammas", gammas)):
    print(grid_label, grid_values)
param_grid = dict(C=Cs, gamma=gammas)

# 10-fold grid search over an RBF-kernel SVM, scored by weighted F1.
svm_grid_search_cv = GridSearchCV(
    SVC(kernel='rbf'),
    param_grid,
    cv=10,
    scoring='f1_weighted',
)

# Reuses the `pca` transformer and `ak` sampler already defined in the notebook.
pipeline = make_pipeline(pca, ak, svm_grid_search_cv)
pipeline.fit(X_train_scaled, y_train)

y_pred = pipeline.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

In [None]:
# Fresh two-component PCA instance.
pca = PCA(n_components=2)

In [None]:
# Two-dimensional projection of the scaled training features.
# NOTE(review): `X_vis` is not referenced by any later cell visible here — presumably
# intended for a plot; confirm or remove.
X_vis = pca.fit_transform(X_train_scaled)

In [None]:
# Resample the scaled training split with an AllKNN sampler at its default settings.
# This rebinds `ak`, replacing any previously configured instance.
ak = AllKNN()
X_resampled, y_resampled = ak.fit_sample(X_train_scaled, y_train)

In [None]:
# Linear SVM (default settings) trained on the AllKNN-resampled training data.
# Despite the variable name, no SMOTE step feeds this classifier.
clf_smote = LinearSVC()
clf_smote = clf_smote.fit(X_resampled, y_resampled)

In [None]:
# Evaluate the resampled-data LinearSVC on the scaled test split.
y_pred = clf_smote.predict(X_test_scaled)
print("Support Vector Machine classification results")
report_rows = (
    ("\n F1-Score \n", metrics.f1_score),
    ("\n Precision \n", metrics.precision_score),
    ("\n Recall \n", metrics.recall_score),
    ("\n Confusion Matrix \n", metrics.confusion_matrix),
)
for heading, score_fn in report_rows:
    print(heading, score_fn(y_test, y_pred))

In [None]:
# RBF-kernel SVM with hand-picked C and gamma; class 1 is weighted twice as
# heavily as class 0.
svc_settings = dict(
    kernel='rbf', C=5, gamma=.005,
    degree=3, coef0=0.0,
    shrinking=True, probability=False,
    tol=0.001, cache_size=200,
    class_weight={0: 1, 1: 2},
    verbose=False, max_iter=-1,
    decision_function_shape='ovr',
    random_state=42,
)
svm_clf = SVC(**svc_settings)
svm_clf.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the RBF-kernel SVM on the scaled test split.
y_pred = svm_clf.predict(X_test_scaled)
print("Support Vector Machine classification results")
report_rows = (
    ("\n F1-Score \n", metrics.f1_score),
    ("\n Precision \n", metrics.precision_score),
    ("\n Recall \n", metrics.recall_score),
    ("\n Confusion Matrix \n", metrics.confusion_matrix),
)
for heading, score_fn in report_rows:
    print(heading, score_fn(y_test, y_pred))

In [None]:
# Linear SVM with every constructor argument spelled out, trained on the
# scaled (not resampled) training split.
linear_svc_settings = dict(
    penalty='l2', loss='squared_hinge',
    dual=True, tol=0.0001, C=1.0,
    multi_class='ovr', fit_intercept=True,
    intercept_scaling=1, class_weight=None,
    verbose=0, random_state=None, max_iter=1000,
)
linear_svm_clf = LinearSVC(**linear_svc_settings)
linear_svm_clf.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the linear SVM on the scaled test split.
y_pred = linear_svm_clf.predict(X_test_scaled)
print("Support Vector Machine classification results")
report_rows = (
    ("\n F1-Score \n", metrics.f1_score),
    ("\n Precision \n", metrics.precision_score),
    ("\n Recall \n", metrics.recall_score),
    ("\n Confusion Matrix \n", metrics.confusion_matrix),
)
for heading, score_fn in report_rows:
    print(heading, score_fn(y_test, y_pred))

In [None]:
# Entropy-based decision tree capped at 23 leaf nodes, with balanced class weights.
tree_settings = dict(
    criterion='entropy',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=42,
    max_leaf_nodes=23,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    class_weight='balanced',
    presort=False,
)
tree_clf = DecisionTreeClassifier(**tree_settings)
tree_clf.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the decision tree on the scaled test split.
y_pred = tree_clf.predict(X_test_scaled)
print("Decision Tree classification results")
report_rows = (
    ("\n F1-Score \n", metrics.f1_score),
    ("\n Precision \n", metrics.precision_score),
    ("\n Recall \n", metrics.recall_score),
    ("\n Confusion Matrix \n", metrics.confusion_matrix),
)
for heading, score_fn in report_rows:
    print(heading, score_fn(y_test, y_pred))

In [None]:
# L2-penalised logistic regression (liblinear solver) with a small C of .0094.
log_reg_settings = dict(
    penalty='l2', dual=False,
    tol=0.0001, C=.0094,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=42,
    solver='liblinear',
    max_iter=100,
    multi_class='ovr',
    verbose=0,
    warm_start=False, n_jobs=1,
)
log_reg = LogisticRegression(**log_reg_settings)
log_reg.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the logistic regression model on the scaled test split.
y_pred = log_reg.predict(X_test_scaled)
print("Logistic Regression classification results")
report_rows = (
    ("\n F1-Score \n", metrics.f1_score),
    ("\n Precision \n", metrics.precision_score),
    ("\n Recall \n", metrics.recall_score),
    ("\n Confusion Matrix \n", metrics.confusion_matrix),
)
for heading, score_fn in report_rows:
    print(heading, score_fn(y_test, y_pred))

In [None]:
# Candidate neighbourhood sizes for the search: k = 1 .. 19.
k_range = list(range(1, 20))
params = {'n_neighbors': k_range}

# weights='distance' weights points by the inverse of their distance, so closer
# neighbours of a query point have more influence than those further away.
# p=1 with the minkowski metric is Manhattan distance.
knn_settings = dict(
    n_neighbors=1, weights='distance',
    algorithm='auto', leaf_size=30,
    p=1, metric='minkowski',
    metric_params=None, n_jobs=1,
)
knn = KNeighborsClassifier(**knn_settings)

# 5-fold search over k, scored by weighted F1, using all available cores.
knn_grid_search_cv = GridSearchCV(
    knn,
    params,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

knn_grid_search_cv.fit(X_train_scaled, y_train)


In [None]:
# Mean cross-validated score for each candidate parameter setting.
means = knn_grid_search_cv.cv_results_['mean_test_score']
# The loop variable is deliberately not called `params`: that name holds the search
# grid, and rebinding it here would leave a single candidate dict behind, breaking
# any re-run of a cell that passes `params` to GridSearchCV.
for mean, candidate in zip(means, knn_grid_search_cv.cv_results_['params']):
    print(mean, candidate)

In [None]:
# Mean cross-validated F1 as a function of k.
fig, ax = plt.subplots()
ax.plot(k_range, means)
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('F1 score based on Cross-Validation')
plt.show()

In [None]:
# Best score, the parameters that achieved it, and the refitted estimator.
for fitted_attr in ('best_score_', 'best_params_', 'best_estimator_'):
    print(getattr(knn_grid_search_cv, fitted_attr))

In [None]:
# Evaluate the grid-searched k-NN on the scaled test split.
y_pred = knn_grid_search_cv.predict(X_test_scaled)
print("k-Nearest Neighbors classification results")
report_rows = (
    ("\n F1-Score \n", metrics.f1_score),
    ("\n Precision \n", metrics.precision_score),
    ("\n Recall \n", metrics.recall_score),
    ("\n Confusion Matrix \n", metrics.confusion_matrix),
)
for heading, score_fn in report_rows:
    print(heading, score_fn(y_test, y_pred))

In [None]:
# SMOTE instance with every argument spelled out, wrapped in SMOTEENN and
# placed in front of the existing `knn` classifier.
smote_settings = dict(
    ratio='auto', random_state=None,
    k=None, k_neighbors=5, m=None,
    m_neighbors=10, out_step=0.5,
    kind='regular', svm_estimator=None,
    n_jobs=1,
)
sm = SMOTE(**smote_settings)

smote_enn = SMOTEENN(smote=sm)

pipeline = make_pipeline(smote_enn, knn)

In [None]:
# The estimator being searched is a Pipeline, so the k-NN parameter must be addressed
# as '<step name>__<parameter>'; a bare 'n_neighbors' is rejected as an invalid
# Pipeline parameter at fit time. make_pipeline names each step after its lowercased
# class name, hence 'kneighborsclassifier'.
params = {'kneighborsclassifier__n_neighbors': k_range}
knn_grid_search_cv = GridSearchCV(pipeline, param_grid=params,
                                  cv=5, n_jobs=-1,
                                  verbose=1,
                                  scoring='f1_weighted')

In [None]:
# Run the grid search on the scaled training split.
knn_grid_search_cv.fit(X_train_scaled, y_train)

In [None]:
# Over-sample the scaled training split with SMOTE (ratio 1.0, fixed seed).
sm = SMOTE(ratio=1.0, random_state=12)
smote_output = sm.fit_sample(X_train_scaled, y_train)
x_train_res, y_train_res = smote_output

In [None]:
# 25-tree random forest trained on the SMOTE-resampled training data.
forest_settings = dict(n_estimators=25, random_state=12)
clf_rf = RandomForestClassifier(**forest_settings)
clf_rf.fit(x_train_res, y_train_res)

In [None]:
# Predict the scaled test split with the random forest.
y_pred = clf_rf.predict(X_test_scaled)

In [None]:
# Report metrics for the predictions currently held in `y_pred`.
print("Random Forest classification results")
report_rows = (
    ("\n F1-Score \n", metrics.f1_score),
    ("\n Precision \n", metrics.precision_score),
    ("\n Recall \n", metrics.recall_score),
    ("\n Confusion Matrix \n", metrics.confusion_matrix),
)
for heading, score_fn in report_rows:
    print(heading, score_fn(y_test, y_pred))

In [None]:
# SMOTE over-sampling followed by the existing `knn` classifier, with k tuned by
# a stratified cross-validated grid search scored on ROC AUC.
pipe = Pipeline([
    ('oversample', SMOTE(random_state=444)),
    ('clf', knn)
    ])

skf = StratifiedKFold()
param_grid = {'clf__n_neighbors': k_range}
grid = GridSearchCV(pipe, param_grid, return_train_score=False,
                    n_jobs=-1, scoring='roc_auc', cv=skf)
# Fit on the standardized features: this model is evaluated on X_test_scaled, so
# training on the raw X_train would put train and test inputs on different scales.
grid.fit(X_train_scaled, y_train)

In [None]:
# Best score, the parameters that achieved it, and the refitted pipeline.
for fitted_attr in ('best_score_', 'best_params_', 'best_estimator_'):
    print(getattr(grid, fitted_attr))

In [None]:
# Evaluate the SMOTE + k-NN grid search on the scaled test split.
y_pred = grid.predict(X_test_scaled)
print("k-Nearest Neighbors classification results")
report_rows = (
    ("\n F1-Score \n", metrics.f1_score),
    ("\n Precision \n", metrics.precision_score),
    ("\n Recall \n", metrics.recall_score),
    ("\n Confusion Matrix \n", metrics.confusion_matrix),
)
for heading, score_fn in report_rows:
    print(heading, score_fn(y_test, y_pred))