## CSUS - CSc 177-02 Data Warehousing and Data Mining - Final Project:   
### 2016 U.S. presidential election Twitter analysis  

**Group members: Aaron Enberg, Nima Sarrafzadeh, Kyne Liu**  
**Professor: Haiquan (Victor) Chen**

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import (
    preprocessing,  
    cluster as sk_cluster,
    metrics
)
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import (
    cross_val_score,
    train_test_split,
    GridSearchCV
)
import sklearn.feature_extraction.text as sk_text
import gc

%matplotlib inline

# Show full cell contents (e.g. long track URLs) instead of truncating them.
# None is the supported "no limit" value; the older -1 sentinel was deprecated
# in pandas 1.0 and is rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [2]:
# Column labels applied to the daily chart table in place of the CSV header names.
chart_columns = ['position', 'track_name', 'artist', 'streams', 'url', 'date', 'region']

# Read both input files: the daily chart table and the per-track feature table.
daily_spotify = pd.read_csv('./data/data.csv')
spotify_data = pd.read_csv('./data/featuresdf.csv')

daily_spotify.columns = chart_columns

In [3]:
# Print the dimensions and column dtypes of each table, feature table first.
for frame in (spotify_data, daily_spotify):
    print(frame.shape)
    print(frame.dtypes)

(100, 16)
id                   object
name                 object
artists              object
danceability        float64
energy              float64
key                 float64
loudness            float64
mode                float64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
duration_ms         float64
time_signature      float64
dtype: object
(3441197, 7)
position       int64
track_name    object
artist        object
streams        int64
url           object
date          object
region        object
dtype: object


In [4]:
# Preview the first five rows of the per-track feature table.
spotify_data.head(n=5)

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,7qiZfU4dY1lWllzX7mPBI,Shape of You,Ed Sheeran,0.825,0.652,1.0,-3.183,0.0,0.0802,0.581,0.0,0.0931,0.931,95.977,233713.0,4.0
1,5CtI0qwDJkDQGwXD1H1cL,Despacito - Remix,Luis Fonsi,0.694,0.815,2.0,-4.328,1.0,0.12,0.229,0.0,0.0924,0.813,88.931,228827.0,4.0
2,4aWmUDTfIPGksMNLV2rQP,Despacito (Featuring Daddy Yankee),Luis Fonsi,0.66,0.786,2.0,-4.757,1.0,0.17,0.209,0.0,0.112,0.846,177.833,228200.0,4.0
3,6RUKPb4LETWmmr3iAEQkt,Something Just Like This,The Chainsmokers,0.617,0.635,11.0,-6.769,0.0,0.0317,0.0498,1.4e-05,0.164,0.446,103.019,247160.0,4.0
4,3DXncPQOG4VBw3QHh3S81,I'm the One,DJ Khaled,0.609,0.668,7.0,-4.284,1.0,0.0367,0.0552,0.0,0.167,0.811,80.924,288600.0,4.0


In [5]:
# Preview the first five rows of the daily chart table.
daily_spotify.iloc[:5]

Unnamed: 0,position,track_name,artist,streams,url,date,region
0,1,Reggaetón Lento (Bailemos),CNCO,19272,https://open.spotify.com/track/3AEZUABDXNtecAO...,2017-01-01,ec
1,2,Chantaje,Shakira,19270,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,2017-01-01,ec
2,3,Otra Vez (feat. J Balvin),Zion & Lennox,15761,https://open.spotify.com/track/3QwBODjSEzelZyV...,2017-01-01,ec
3,4,Vente Pa' Ca,Ricky Martin,14954,https://open.spotify.com/track/7DM4BPaS7uofFul...,2017-01-01,ec
4,5,Safari,J Balvin,14269,https://open.spotify.com/track/6rQSrBHf7HlZjtc...,2017-01-01,ec


In [6]:
# Keep only chart rows dated before 2018-01-01. The dates are ISO-formatted
# strings (YYYY-MM-DD), so plain string comparison orders them chronologically.
is_before_2018 = daily_spotify['date'] < '2018-01-01'
daily_spotify = daily_spotify[is_before_2018]

In [7]:
# Save the filtered chart table; index=False keeps the row index out of the file.
daily_spotify.to_csv(path_or_buf='./data/global_tracks_2017.csv', index=False)

In [9]:
# Count the distinct track URLs present in the chart table.
track_urls = daily_spotify['url']
track_urls.nunique()

21380

In [10]:
# List every distinct region code, in order of first appearance.
daily_spotify.loc[:, 'region'].unique()

array(['ec', 'fr', 'ar', 'fi', 'no', 'it', 'lt', 'ph', 'tw', 'nz', 'ee',
       'tr', 'us', 'sv', 'cr', 'de', 'cl', 'jp', 'br', 'hn', 'gt', 'ch',
       'hu', 'ca', 'pe', 'be', 'my', 'dk', 'bo', 'pl', 'at', 'pt', 'se',
       'mx', 'pa', 'uy', 'is', 'es', 'cz', 'ie', 'nl', 'sk', 'co', 'sg',
       'id', 'do', 'lu', 'gb', 'global', 'py', 'au', 'lv', 'gr', 'hk'],
      dtype=object)

In [22]:
# For each (track_name, artist) pair, collect the distinct regions it appears in.
tracks_grouped = daily_spotify.groupby(['track_name', 'artist'])
df = tracks_grouped['region'].unique().to_frame()

In [23]:
# Turn each track's array of regions into one 0/1 indicator column per region.
mlb = MultiLabelBinarizer()

# pop() removes the 'region' column from df, so only the indicators remain after the join.
region_lists = df.pop('region')
region_flags = pd.DataFrame(
    mlb.fit_transform(region_lists),
    index=df.index,
    columns=mlb.classes_,  # read after fitting, when the region labels are known
)
df = df.join(region_flags)

In [24]:
# Preview the first five rows of the per-track region indicator table.
df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,ar,at,au,be,bo,br,ca,ch,cl,co,...,pt,py,se,sg,sk,sv,tr,tw,us,uy
track_name,artist,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"""All That Is or Ever Was or Ever Will Be""",Alan Silvestri,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
"""Read All About It, Pt. III""",Emeli Sandé,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
#99,JVG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#Askip,Black M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#Biziz - feat. Lil Bege,Reynmen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [28]:
# Show chart rows at positions 200 through 399 (positional slice, end exclusive).
daily_spotify.iloc[200:400]

Unnamed: 0,position,track_name,artist,streams,url,date,region
200,1,Reggaetón Lento (Bailemos),CNCO,16672,https://open.spotify.com/track/3AEZUABDXNtecAOSC1qTfo,2017-01-02,ec
201,2,Chantaje,Shakira,15594,https://open.spotify.com/track/6mICuAdrwEjh6Y6lroV2Kg,2017-01-02,ec
202,3,Otra Vez (feat. J Balvin),Zion & Lennox,13507,https://open.spotify.com/track/3QwBODjSEzelZyVjxPOHdq,2017-01-02,ec
203,4,Safari,J Balvin,11958,https://open.spotify.com/track/6rQSrBHf7HlZjtcMZ4S4bO,2017-01-02,ec
204,5,Vente Pa' Ca,Ricky Martin,11590,https://open.spotify.com/track/7DM4BPaS7uofFul3ywMe46,2017-01-02,ec
205,6,Let Me Love You,DJ Snake,9096,https://open.spotify.com/track/4pdPtRcBmOSQDlJ3Fk945m,2017-01-02,ec
206,7,Ay Mi Dios,IAmChino,9023,https://open.spotify.com/track/6stYbAJgTszHAHZMPxWWCY,2017-01-02,ec
207,8,Traicionera,Sebastian Yatra,8511,https://open.spotify.com/track/5J1c3M4EldCfNxXwrwt8mT,2017-01-02,ec
208,9,La Bicicleta,Carlos Vives,8288,https://open.spotify.com/track/0sXvAOmXgjR2QUqLK1MltU,2017-01-02,ec
209,10,Vacaciones,Wisin,8205,https://open.spotify.com/track/3dQDid3IUNhZy1OehIfYfE,2017-01-02,ec


In [31]:
# Total streams for each (track_name, artist) pair over all dates and regions.
# GroupBy.sum() is the vectorized aggregation; passing the Python builtin `sum`
# to apply() is slower, and newer pandas versions emit a FutureWarning for it.
daily_spotify.groupby(['track_name', 'artist'])['streams'].sum().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,streams
track_name,artist,Unnamed: 2_level_1
"""All That Is or Ever Was or Ever Will Be""",Alan Silvestri,7311
"""Read All About It, Pt. III""",Emeli Sandé,57025
#99,JVG,31826
#Askip,Black M,296862
#Biziz - feat. Lil Bege,Reynmen,403591
#CTZK,Sir Mich,669563
#Elämänpeli (feat. Touko),Mr. Elämänpeli,405771
#HEY!,Pase Libre,109035
#JM,Broederliefde,6291002
#LDL,Niro,51922


In [38]:
# Gather the stream counts into a list for every distinct combination of
# track, artist, region, chart position and date.
group_keys = ['track_name', 'artist', 'region', 'position', 'date']
streams_by_key = daily_spotify.groupby(group_keys)['streams']
streams_by_key.apply(list).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,streams
track_name,artist,region,position,date,Unnamed: 5_level_1
"""All That Is or Ever Was or Ever Will Be""",Alan Silvestri,pl,185,2017-01-08,[3547]
"""All That Is or Ever Was or Ever Will Be""",Alan Silvestri,tr,198,2017-01-08,[3764]
"""Read All About It, Pt. III""",Emeli Sandé,be,147,2017-10-16,[3330]
"""Read All About It, Pt. III""",Emeli Sandé,be,159,2017-10-23,[3334]
"""Read All About It, Pt. III""",Emeli Sandé,be,182,2017-10-22,[3179]
"""Read All About It, Pt. III""",Emeli Sandé,be,186,2017-10-09,[3075]
"""Read All About It, Pt. III""",Emeli Sandé,be,192,2017-10-15,[3053]
"""Read All About It, Pt. III""",Emeli Sandé,cz,163,2017-10-16,[1527]
"""Read All About It, Pt. III""",Emeli Sandé,cz,174,2017-10-23,[1458]
"""Read All About It, Pt. III""",Emeli Sandé,cz,175,2017-10-22,[1310]
