## CSUS - CSc 177-02 Data Warehousing and Data Mining - Final Project:   
### Spotify daily Top 200 charts analysis: predicting which songs reach the top 10  

**Group members: Aaron Enberg, Nima Sarrafzadeh, Kyne Liu**  
**Professor: Haiquan (Victor) Chen**

In [1]:
from collections import Counter
import pandas as pd
import numpy as np
from sklearn import (
    preprocessing,  
    metrics
)
from imblearn.under_sampling import AllKNN
import sklearn.feature_extraction.text as sk_text
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV
)
# Show full cell contents (e.g. long track URLs) instead of truncating them.
# Newer pandas releases spell "no limit" as None and reject negative values,
# while older releases only accept an int and used -1 for the same purpose,
# so try the current spelling first and fall back to the legacy sentinel.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [2]:
# Load the raw daily chart export. `header=0` consumes the file's own header
# row and `names` replaces it with the snake_case labels used in this notebook.
daily_spotify = pd.read_csv(
    './data/data.csv',
    header=0,
    names=['position', 'track_name', 'artist', 'streams', 'url', 'date', 'region'],
)

In [3]:
# Structural sanity check: (rows, columns) first, then the dtype of each column.
for summary in (daily_spotify.shape, daily_spotify.dtypes):
    print(summary)

(3441197, 7)
position      int64 
track_name    object
artist        object
streams       int64 
url           object
date          object
region        object
dtype: object


In [4]:
# Preview the first five chart rows.
daily_spotify.head(n=5)

Unnamed: 0,position,track_name,artist,streams,url,date,region
0,1,Reggaetón Lento (Bailemos),CNCO,19272,https://open.spotify.com/track/3AEZUABDXNtecAOSC1qTfo,2017-01-01,ec
1,2,Chantaje,Shakira,19270,https://open.spotify.com/track/6mICuAdrwEjh6Y6lroV2Kg,2017-01-01,ec
2,3,Otra Vez (feat. J Balvin),Zion & Lennox,15761,https://open.spotify.com/track/3QwBODjSEzelZyVjxPOHdq,2017-01-01,ec
3,4,Vente Pa' Ca,Ricky Martin,14954,https://open.spotify.com/track/7DM4BPaS7uofFul3ywMe46,2017-01-01,ec
4,5,Safari,J Balvin,14269,https://open.spotify.com/track/6rQSrBHf7HlZjtcMZ4S4bO,2017-01-01,ec


In [5]:
# Distinct chart regions present in the data, in order of first appearance.
pd.unique(daily_spotify['region'])

array(['ec', 'fr', 'ar', 'fi', 'no', 'it', 'lt', 'ph', 'tw', 'nz', 'ee',
       'tr', 'us', 'sv', 'cr', 'de', 'cl', 'jp', 'br', 'hn', 'gt', 'ch',
       'hu', 'ca', 'pe', 'be', 'my', 'dk', 'bo', 'pl', 'at', 'pt', 'se',
       'mx', 'pa', 'uy', 'is', 'es', 'cz', 'ie', 'nl', 'sk', 'co', 'sg',
       'id', 'do', 'lu', 'gb', 'global', 'py', 'au', 'lv', 'gr', 'hk'],
      dtype=object)

In [6]:
# Total streams for every (track, artist, region, date, position) combination.
# The grouping keys come back as ordinary columns on a fresh integer index.
df_streams_by_country = (
    daily_spotify
    .groupby(['track_name', 'artist', 'region', 'date', 'position'])['streams']
    .sum()
    .reset_index()
)

In [7]:
# Preview the first ten aggregated rows.
df_streams_by_country.iloc[:10]

Unnamed: 0,track_name,artist,region,date,position,streams
0,"""All That Is or Ever Was or Ever Will Be""",Alan Silvestri,pl,2017-01-08,185,3547
1,"""All That Is or Ever Was or Ever Will Be""",Alan Silvestri,tr,2017-01-08,198,3764
2,"""Read All About It, Pt. III""",Emeli Sandé,be,2017-10-09,186,3075
3,"""Read All About It, Pt. III""",Emeli Sandé,be,2017-10-15,192,3053
4,"""Read All About It, Pt. III""",Emeli Sandé,be,2017-10-16,147,3330
5,"""Read All About It, Pt. III""",Emeli Sandé,be,2017-10-22,182,3179
6,"""Read All About It, Pt. III""",Emeli Sandé,be,2017-10-23,159,3334
7,"""Read All About It, Pt. III""",Emeli Sandé,cz,2017-10-09,184,1338
8,"""Read All About It, Pt. III""",Emeli Sandé,cz,2017-10-15,190,1222
9,"""Read All About It, Pt. III""",Emeli Sandé,cz,2017-10-16,163,1527


In [8]:
# The first day a song hits the top 200, which countries' charts does it appear on?
#
# Step 1: keep one row per (track, artist, region). `df_streams_by_country` is
# the output of a groupby whose keys include `date` after `region`, and groupby
# sorts by its keys by default, so keep='first' retains the earliest date on
# which the song charted in that region.
#
# The unwanted columns are dropped with a non-mutating `.drop(...)` in the same
# chain: calling `drop(..., inplace=True)` on the frame returned by
# `drop_duplicates` is what raises pandas' SettingWithCopyWarning.
df_country_initial_appear = (
    df_streams_by_country
    .drop_duplicates(['track_name', 'artist', 'region'], keep='first')
    .drop(['position', 'streams'], axis=1)
)
df_country_initial_appear.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,track_name,artist,region,date
0,"""All That Is or Ever Was or Ever Will Be""",Alan Silvestri,pl,2017-01-08
1,"""All That Is or Ever Was or Ever Will Be""",Alan Silvestri,tr,2017-01-08
2,"""Read All About It, Pt. III""",Emeli Sandé,be,2017-10-09
7,"""Read All About It, Pt. III""",Emeli Sandé,cz,2017-10-09
12,"""Read All About It, Pt. III""",Emeli Sandé,dk,2017-02-12


In [9]:
# Collapse to one row per song: gather the regions that share each
# (track, artist, date) into a list, then keep only the first date per song.
# Kept as a single chain so no intermediate frame stays referenced.
df_country_initial_appear = (
    df_country_initial_appear
    .groupby(['track_name', 'artist', 'date'])['region']
    .apply(list)
    .reset_index()
    .drop_duplicates(subset=['track_name', 'artist'], keep='first')
)

In [10]:
# Turn each song's list of first-day regions into a space-separated "document"
# so the region codes can be counted like words.
df_country_initial_appear['regions_stringified'] = [
    ' '.join(str(code) for code in regions)
    for regions in df_country_initial_appear['region']
]
corpus = df_country_initial_appear['regions_stringified'].tolist()

# One count column per region token; one row per song, in corpus order.
vectorizer = sk_text.CountVectorizer()
matrix = vectorizer.fit_transform(corpus)

In [11]:
# Wide table of total streams: one row per (track, artist), one column per
# region, 0 where the song never charted in that region.
id_cols = ['track_name', 'artist']
streams_wide = df_streams_by_country.pivot_table(
    values='streams',
    index=id_cols,
    columns='region',
    aggfunc=sum,
    fill_value=0,
).reset_index()

# Replace the two identifying columns with a single "<track_name> <artist>" label.
df = (
    streams_wide
    .assign(track_name_by_artist=streams_wide['track_name'].str.cat(
        streams_wide['artist'], sep=' '))
    .drop(id_cols, axis=1)
)

In [12]:
# Flag every chart row whose position is 10 or better (1) versus the rest (0).
in_top10 = df_streams_by_country['position'] <= 10
df_streams_by_country['top10'] = in_top10.astype(int)
df_streams_by_country['top10'].value_counts()

0    3244032
1    196508 
Name: top10, dtype: int64

In [13]:
# Per-song label: 1 if ANY chart row of the song has top10 == 1, else 0.
# That is the maximum of the 0/1 row flag within each (track, artist) group,
# so compute it directly instead of de-duplicating, sorting and keeping the
# last row, which also needed several in-place mutations.
#
# groupby sorts by its keys by default, so rows come out ordered by
# (track_name, artist). The group keys are then discarded in favour of a plain
# 0..n-1 index, leaving a single-column frame named 'top10'.
df_top10 = (
    df_streams_by_country
    .groupby(['track_name', 'artist'])['top10']
    .max()
    .reset_index(drop=True)
    .to_frame()
)
df_top10['top10'].value_counts()

0    18576
1    1346 
Name: top10, dtype: int64

In [14]:
# Label legend:
#   0 -> the song did NOT reach chart positions 1-10
#   1 -> the song reached chart positions 1-10
df_top10.head(n=5)

Unnamed: 0,top10
0,0
1,0
2,0
3,0
4,0


In [15]:
# Use the combined "<track_name> <artist>" label as the row index.
df = df.set_index('track_name_by_artist')

In [16]:
# Row labels for the first-appearance matrix. They are built from
# `df_country_initial_appear`, the frame the matrix rows were generated from,
# using the same "<track_name> <artist>" recipe as `df`'s index. Borrowing
# `df.index` instead would only be correct if both frames happened to list the
# songs in exactly the same order, and nothing checks that.
song_labels = pd.Index(
    df_country_initial_appear['track_name'].str.cat(
        df_country_initial_appear['artist'], sep=' '),
    name='track_name_by_artist')

# Vocabulary lookup: newer scikit-learn exposes get_feature_names_out();
# older releases only provide get_feature_names().
if hasattr(vectorizer, 'get_feature_names_out'):
    region_tokens = list(vectorizer.get_feature_names_out())
else:
    region_tokens = vectorizer.get_feature_names()

# Dense song x region count matrix; the prefix keeps these columns distinct
# from the per-region stream-total columns that share the same region codes.
df_initial_regions = pd.DataFrame(matrix.toarray(),
                                  index=song_labels,
                                  columns=region_tokens)

df_initial_regions = df_initial_regions.add_prefix('first_appear_')
df_initial_regions.shape

(19922, 54)

In [17]:
# Feature table: per-region stream totals plus first-appearance indicators,
# matched on the shared row index; only labels present in both frames are kept.
df_2 = df.join(other=df_initial_regions, how='inner')

In [18]:
# Preview the first five rows of the combined feature table.
df_2.head(n=5)

Unnamed: 0_level_0,ar,at,au,be,bo,br,ca,ch,cl,co,...,first_appear_pt,first_appear_py,first_appear_se,first_appear_sg,first_appear_sk,first_appear_sv,first_appear_tr,first_appear_tw,first_appear_us,first_appear_uy
track_name_by_artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""All That Is or Ever Was or Ever Will Be"" Alan Silvestri",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
"""Read All About It, Pt. III"" Emeli Sandé",0,0,0,15971,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#99 JVG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#Askip Black M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#Biziz - feat. Lil Bege Reynmen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [19]:
# Target vector: the per-song top-10 flag, followed by its class balance.
y = df_top10.loc[:, 'top10']
y.value_counts()

0    18576
1    1346 
Name: top10, dtype: int64

In [20]:
# Hold out 20% of the songs for testing; the fixed seed makes the split repeatable.
# NOTE(review): `df_2` and `y` carry different indexes, so this presumably
# relies on both listing the songs in the same row order - confirm.
splits = train_test_split(df_2, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

In [21]:
# Shapes of the four split pieces, in the order: X train, X test, y train, y test.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(15937, 108)
(3985, 108)
(15937,)
(3985,)


In [22]:
# --- 1. Undersample the training set (prototype selection with cleaning heuristics) ---
# NOTE(review): the resampler sees the raw, unscaled features - confirm intended.
# NOTE(review): `ratio=` / `fit_sample` look like the older imbalanced-learn
# spellings; verify against the installed version before upgrading.
ak = AllKNN(ratio='auto', kind_sel='mode', n_neighbors=3)
X_train_resampled, y_train_resampled = ak.fit_sample(X_train, y_train)
print(sorted(Counter(y_train_resampled).items()))

# --- 2. Standardise: fit on the resampled training rows, reuse for the test rows ---
std_scaler = preprocessing.StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train_resampled)
X_test_scaled = std_scaler.transform(X_test)

# --- 3. k-NN hyper-parameter search ---
# Grid: k = 1..19, both weighting schemes, Manhattan (p=1) and Euclidean (p=2).
k_range = list(range(1, 20))
params = {
    'n_neighbors': k_range,
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

# Starting estimator; n_neighbors, weights and p are overridden by the grid.
knn = KNeighborsClassifier(
    n_neighbors=1,
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=1,
    metric='minkowski',
    metric_params=None,
    n_jobs=1,
)

# 5-fold CV over the grid, scored by weighted F1, using all available cores.
# NOTE(review): the scaler above was fit on the full resampled training set
# before the folds are made, so the folds share scaling statistics - confirm
# this is acceptable.
knn_grid_search_cv = GridSearchCV(
    estimator=knn,
    param_grid=params,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
knn_grid_search_cv.fit(X_train_scaled, y_train_resampled)

# --- 4. Report ---
# best_score_ is the cross-validated score on the resampled training data;
# the classification report is computed on the untouched held-out test rows.
y_pred = knn_grid_search_cv.predict(X_test_scaled)
for search_result in (knn_grid_search_cv.best_score_,
                      knn_grid_search_cv.best_params_,
                      knn_grid_search_cv.best_estimator_):
    print(search_result)
print(metrics.classification_report(y_test, y_pred))

[(0, 14289), (1, 1073)]
Fitting 5 folds for each of 76 candidates, totalling 380 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 380 out of 380 | elapsed: 32.2min finished


0.9691022036404104
{'p': 1, 'n_neighbors': 4, 'weights': 'distance'}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=1,
           weights='distance')
             precision    recall  f1-score   support

          0       0.97      0.97      0.97      3712
          1       0.63      0.61      0.62       273

avg / total       0.95      0.95      0.95      3985

