In [13]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import utils
import os

import seaborn as sns
%matplotlib inline 

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.metrics import r2_score, regression, precision_score, recall_score, accuracy_score, fbeta_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [5]:
# combo_df = pd.read_csv("../data/combined_features.csv") 
tracks = pd.read_csv("../data/mod_tracks.csv")
features = pd.read_csv("../data/mod_features.csv")
echonest = pd.read_csv("../data/mod_echonest.csv")

In [43]:
new_feat = pd.read_csv("../data/new_feat_large.csv")

new_feat.index = new_feat.track_id
new_feat.drop(["track_id"], inplace=True, axis=1)

In [44]:
vanilla_test = pd.read_csv("../data/vanilla_test_features.csv")
test_features = ["duration", "acousticness","danceability","energy","instrumentalness","liveness","speechiness","tempo","valence"]
vanilla_test.index = vanilla_test["track_id"]

true_y = vanilla_test["track_popularity"]
genre_test = vanilla_test["genre_top"]

genre_test = genre_test.map(lambda x: "Hip-Hop" if x == "Hip Hop" else x)
vanilla_test = vanilla_test[test_features]

In [45]:
hand_crafted_test = pd.read_csv("../data/hand_crafted_test.csv")
hand_crafted_test.index = hand_crafted_test["track_id"]
hand_crafted_test.drop(["track_id"], inplace=True, axis=1)

test = pd.merge(vanilla_test, hand_crafted_test, left_index=True, right_index=True)
test.head()

Unnamed: 0_level_0,duration,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,Ft1,...,Ft5,Ft6,Ft7,Ft8,Ft9,Ft10,Ft11,Ft12,Ft13,Ft14
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,216896,0.186,0.768,0.517,3.8e-05,0.104,0.0312,104.992,0.418,0,...,0,0,0,0,0,0,1,0,1,0.364964
4,263400,0.163,0.599,0.448,0.0,0.106,0.0232,95.05,0.168,0,...,0,0,0,0,1,0,0,0,1,0.355647
5,163870,0.047,0.643,0.783,0.0,0.083,0.0856,154.084,0.579,0,...,0,0,0,0,0,0,1,0,1,0.364176
7,213600,0.67,0.547,0.277,0.0,0.091,0.0429,153.974,0.684,0,...,0,0,0,1,0,0,0,0,1,0.361209
11,180973,0.469,0.565,0.649,0.0249,0.135,0.0842,91.704,0.803,0,...,0,0,0,0,0,0,1,0,1,0.354631


In [46]:
echonest_features = echonest[['echonest_audio_features_acousticness',
       'echonest_audio_features_danceability',
       'echonest_audio_features_energy',
       'echonest_audio_features_instrumentalness',
       'echonest_audio_features_liveness',
       'echonest_audio_features_speechiness', 'echonest_audio_features_tempo',
       'echonest_audio_features_valence', 'track_id']]

In [47]:
def get_df_exluding(df, except_cols):
    cols = [col for col in df.columns if col not in except_cols]
    return df[cols]

def get_df_with(df, cols_like):
    cols = []
    for col_part in cols_like:
        for col in df.columns:
            if col_part in col:
                cols.append(col)
    return df[cols]

In [48]:
def get_important_features(model, n, X):
    # For GBDT
    top_n = np.argsort(model.estimators_[0].feature_importances_)[::-1][:n]
    imp_feat = [X.columns[i] for i in top_n]
    print("Gradient Boosting Tree: {}".format(imp_feat))
    
    # For RF
    top_n = np.argsort(model.estimators_[1].feature_importances_)[::-1][:n]
    imp_feat = [X.columns[i] for i in top_n]
    print("Random Forest: {}".format(imp_feat))
    
    # For LR
    top_n = np.argsort(model.estimators_[2].coef_)[::-1][:n]
    imp_feat = [X.columns[i] for i in top_n]
#     print("Logistic Regression: {}".format(imp_feat))

In [49]:
tracks.groupby("track_genre_top").count()

Unnamed: 0_level_0,album_comments,album_date_created,album_date_released,album_engineer,album_favorites,album_id,album_information,album_listens,album_producer,album_tags,...,track_language_code,track_license,track_listens,track_lyricist,track_number,track_publisher,track_tags,track_title,track_id,popularity_index
track_genre_top,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Blues,110,105,78,41,110,110,85,110,38,110,...,58,110,110,1,110,0,110,110,110,110
Classical,1230,1209,564,67,1230,1230,1059,1230,108,1230,...,218,1230,1230,17,1230,16,1230,1230,1230,1230
Country,194,194,105,83,194,194,152,194,52,194,...,92,194,194,0,194,0,194,194,194,194
Easy Listening,24,24,17,0,24,24,19,24,1,24,...,1,24,24,0,24,0,24,24,24,24
Electronic,9372,9007,6288,516,9372,9372,6959,9372,767,9372,...,1378,9344,9372,12,9372,115,9372,9371,9372,9372
Experimental,10608,10394,7759,1028,10608,10608,8889,10608,1875,10608,...,1456,10608,10608,21,10608,44,10608,10608,10608,10608
Folk,2803,2726,1759,667,2803,2803,2076,2803,721,2803,...,739,2793,2803,20,2803,9,2803,2803,2803,2803
Hip-Hop,3552,3451,2558,199,3552,3552,2891,3552,312,3552,...,624,3552,3552,14,3552,3,3552,3552,3552,3552
Instrumental,2079,2042,1097,123,2079,2079,1599,2079,202,2079,...,138,2079,2079,13,2079,133,2079,2079,2079,2079
International,1389,1368,807,278,1389,1389,1151,1389,331,1389,...,202,1389,1389,10,1389,9,1389,1389,1389,1389


```
Genres
Popularity (Artist level popularity - calculated from all tracks by the artist)
Get other top tracks (Maybe use the number of top tracks by the artist)
Get artists albums (Album count of the artist)
Number of followers of the artist


Popularity (Album level popularity - calculated from all tracks by the artist)
Release date
```

In [50]:
genre_test.unique()

array(['Pop', 'Blues', 'Classical', 'Hip-Hop', 'Country', 'Electronic',
       'Rock'], dtype=object)

In [62]:
def generate_dataset(genre="ALL"):
    combo_df = pd.merge(tracks[["track_id","track_genre_top","popularity_index","track_duration"]], echonest_features, on="track_id")

    if genre != "ALL":
        combo_df = combo_df[combo_df.track_genre_top == genre]

    tracks.index = tracks.track_id
    features.index = features.track_id
    combo_df.index = combo_df.track_id
    combo_df = pd.merge(combo_df, new_feat, left_index=True, right_index=True)

    return combo_df
        
def prep_dataset(combo_df):
    X = get_df_exluding(combo_df, ["track_id","track_genre_top","popularity_index"])
#     print("Features used: {}".format(X.columns))
    y = combo_df.popularity_index 
    
    threshold = np.percentile(y, 90)
    
    y = y.map(lambda x: 1 if x > threshold else 0)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)
    
    return X_train, X_test, y_train, y_test

def print_cv_metrics(X, y, model):
    print("Accuracy: {:.2f} Precision: {:.2f} Recall: {:.2f} ROC_AUC: {:.2f}".format(np.mean(cross_val_score(model, X, y, scoring="accuracy", n_jobs=-1, cv=5)), np.mean(cross_val_score(model, X, y, scoring="precision", n_jobs=-1, cv=5)), np.mean(cross_val_score(model, X, y, scoring="recall", n_jobs=-1, cv=5)), np.mean(cross_val_score(model, X, y, scoring="roc_auc", n_jobs=-1, cv=10))))
    
    

In [64]:
genres = list(tracks.track_genre_top.unique()) + ["ALL"]

for genre in genres:
    pca = PCA(n_components=1)
    tracks["popularity_index"] = pca.fit_transform(tracks[["track_interest","track_listens","track_favorites"]].as_matrix())
    tracks["popularity_index"] = np.log(tracks["popularity_index"] + np.abs(np.min(tracks["popularity_index"])) + 1)
#     print(tracks["popularity_index"].describe())
    
    if pd.isnull(genre):
        continue
    
    print() 
    print("*"*10 + " " + genre + " " + "*"*10)
    
    ## Generate dataset for Genre 
    
    combo_df = generate_dataset(genre)
    
    pop_perc = np.mean(combo_df.popularity_index)
    
    if np.isnan(pop_perc):
        continue

    X_train, X_test, y_train, y_test = prep_dataset(combo_df)
    
    print("Number of Tracks for training: {}".format(X_train.shape[0]))
    
    if X_train.shape[0] < 100:
        continue
        
    
        
    print("% of popular songs: {:.2f}".format(pop_perc*100))

    model = VotingClassifier(estimators=[('gbdt',GradientBoostingClassifier()), ('rf',RandomForestClassifier()), ('lr',LogisticRegression())], voting='soft')
    
    model = model.fit(X_train, y_train)
#     y_pred = model.predict(X)
#     comp = (y, y_pred)
    # print(model.coef_)
#     print(r2_score(y, y_pred), accuracy_score(y, y_pred), precision_score(y, y_pred), recall_score(y, y_pred), roc_auc_score(y, y_pred))

    print_cv_metrics(X_train, y_train, model)
    
    get_important_features(model, 3, X_train)
    
#     if genre in genre_test.unique():
#         print()
#         if genre != "ALL":
#             test_genre = test[genre_test == genre]
#         else:
#             test_genre = test.copy()
#         print("Test Scores for {} tracks:".format(test_genre.index.size))
#         y_pred = model.predict(test_genre)
#         y_true_genre = true_y[true_y.index.isin(test_genre.index)]
#         print("Accuracy: {}, AUC_ROC: {}".format(accuracy_score(y_true_genre, y_pred), precision_score(y_true_genre, y_pred)))


# pca = PCA(n_components=1)
# tracks["popularity_index"] = pca.fit_transform(tracks[["track_interest","track_listens","track_favorites"]].as_matrix())
# tracks["popularity_index"] = np.log(tracks["popularity_index"] + np.abs(np.min(tracks["popularity_index"])) + 1)
# #     print(tracks["popularity_index"].describe())

# print()
# print("*"*10 + " " + "Total" + " " + "*"*10)
# combo_df = pd.merge(tracks[["track_id","track_genre_top","popularity_index","track_duration"]], echonest_features, on="track_id")
# # combo_df = combo_df[combo_df.track_genre_top == genre]

# tracks.index = tracks.track_id
# features.index = features.track_id
# combo_df.index = combo_df.track_id
# combo_df = pd.merge(combo_df, new_feat, left_index=True, right_index=True)

# X = get_df_exluding(combo_df, ["track_id","track_genre_top","popularity_index"])
# #     print("Features used: {}".format(X.columns))
# y = combo_df.popularity_index

# print("Number of Tracks: {}".format(X.shape[0]))
# # y = [1 if song_pop > 0 else 0 for song_pop in y]
# threshold = np.percentile(y, 90)
# y = y.map(lambda x: 1 if x > threshold else 0)

# pop_perc = np.mean(y)
# print("% of popular songs: {:.2f}".format(np.mean(y)*100))

# model = VotingClassifier(estimators=[('gbdt',GradientBoostingClassifier()), ('rf',RandomForestClassifier()), ('lr',LogisticRegression())], voting='soft')
# model = model.fit(X, y)
# y_pred = model.predict(X)
# comp = (y, y_pred)
# # print(model.coef_)
# # print(r2_score(y, y_pred), accuracy_score(y, y_pred), precision_score(y, y_pred), recall_score(y, y_pred), roc_auc_score(y, y_pred))

# print("Accuracy: {:.2f} Precision: {:.2f} Recall: {:.2f} ROC_AUC: {:.2f}".format(np.mean(cross_val_score(model, X, y, scoring="accuracy", n_jobs=-1, cv=5)), np.mean(cross_val_score(model, X, y, scoring="precision", n_jobs=-1, cv=5)), np.mean(cross_val_score(model, X, y, scoring="recall", n_jobs=-1, cv=5)), np.mean(cross_val_score(model, X, y, scoring="roc_auc", n_jobs=-1, cv=10))))


********** Hip-Hop **********
Number of Tracks for training: 819
% of popular songs: 717.21
Accuracy: 0.90 Precision: 0.47 Recall: 0.08 ROC_AUC: 0.73
Gradient Boosting Tree: ['echonest_audio_features_acousticness', 'echonest_audio_features_instrumentalness', 'echonest_audio_features_liveness']
Random Forest: ['echonest_audio_features_acousticness', 'dissonance', 'echonest_audio_features_instrumentalness']

********** Pop **********
Number of Tracks for training: 311
% of popular songs: 733.43
Accuracy: 0.90 Precision: 0.00 Recall: 0.00 ROC_AUC: 0.47
Gradient Boosting Tree: ['dissonance', 'echonest_audio_features_danceability', 'Ft25']
Random Forest: ['Ft26', 'dissonance', 'echonest_audio_features_energy']

********** Rock **********
Number of Tracks for training: 3502
% of popular songs: 680.55
Accuracy: 0.90 Precision: 0.00 Recall: 0.00 ROC_AUC: 0.54
Gradient Boosting Tree: ['echonest_audio_features_acousticness', 'echonest_audio_features_danceability', 'Ft22']
Random Forest: ['echon

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
model.estimators[0]

In [None]:
for genre in tracks.track_genre_top.unique():
    pca = PCA(n_components=1)
    tracks["popularity_index"] = pca.fit_transform(tracks[["track_interest","track_listens","track_favorites"]].as_matrix())
    tracks["popularity_index"] = np.log(tracks["popularity_index"] + np.abs(np.min(tracks["popularity_index"])) + 1)
#     print(tracks["popularity_index"].describe())
    
    if pd.isnull(genre):
        continue
    
    print() 
    print("*"*10 + " " + genre + " " + "*"*10)
    combo_df = pd.merge(tracks[["track_id","track_genre_top","popularity_index","track_duration"]], echonest_features, on="track_id")
    combo_df = combo_df[combo_df.track_genre_top == genre]
    tracks.index = tracks.track_id
    features.index = features.track_id
    combo_df.index = combo_df.track_id
#     combo_df = pd.merge(combo_df, get_df_with(features, cols_like=["mfcc_"]), left_index=True, right_index=True)

    X = get_df_exluding(combo_df, ["track_id","track_genre_top","popularity_index"])
#     print("Features used: {}".format(X.columns))
    y = combo_df.popularity_index
    
    print("Number of Tracks: {}".format(X.shape[0]))
    # y = [1 if song_pop > 0 else 0 for song_pop in y]
    if X.shape[0] < 100:
        continue
    threshold = np.percentile(y, 90)
    y = y.map(lambda x: 1 if x > threshold else 0)
    
    pop_perc = np.mean(y)
    if pop_perc > 0.98 or np.isnan(pop_perc):
        continue
    print("% of popular songs: {:.2f}".format(np.mean(y)*100))

    model = VotingClassifier(estimators=[('gbdt',GradientBoostingClassifier()), ('rf',RandomForestClassifier(class_weight={0:9, 1:1})), ('lr',LogisticRegression())], voting='soft')
    model = model.fit(X, y)
    y_pred = model.predict(X)
    comp = (y, y_pred)
    # print(model.coef_)
    print(r2_score(y, y_pred), accuracy_score(y, y_pred), precision_score(y, y_pred), recall_score(y, y_pred), roc_auc_score(y, y_pred))

    print("Accuracy: {:.2f} Precision: {:.2f} Recall: {:.2f} ROC_AUC: {:.2f}".format(np.mean(cross_val_score(model, X, y, scoring="accuracy", n_jobs=-1, cv=5)), np.mean(cross_val_score(model, X, y, scoring="precision", n_jobs=-1, cv=5)), np.mean(cross_val_score(model, X, y, scoring="recall", n_jobs=-1, cv=5)), np.mean(cross_val_score(model, X, y, scoring="roc_auc", n_jobs=-1, cv=10))))


pca = PCA(n_components=1)
tracks["popularity_index"] = pca.fit_transform(tracks[["track_interest","track_listens","track_favorites"]].as_matrix())
tracks["popularity_index"] = np.log(tracks["popularity_index"] + np.abs(np.min(tracks["popularity_index"])) + 1)
#     print(tracks["popularity_index"].describe())

print()
print("*"*10 + " " + "Total" + " " + "*"*10)
combo_df = pd.merge(tracks[["track_id","track_genre_top","popularity_index","track_duration"]], echonest_features, on="track_id")
# combo_df = combo_df[combo_df.track_genre_top == genre]

tracks.index = tracks.track_id
features.index = features.track_id
combo_df.index = combo_df.track_id
# combo_df = pd.merge(combo_df, get_df_with(features, cols_like=["mfcc_"]), left_index=True, right_index=True)

X = get_df_exluding(combo_df, ["track_id","track_genre_top","popularity_index"])
#     print("Features used: {}".format(X.columns))
y = combo_df.popularity_index

print("Number of Tracks: {}".format(X.shape[0]))
# y = [1 if song_pop > 0 else 0 for song_pop in y]
threshold = np.percentile(y, 90)
y = y.map(lambda x: 1 if x > threshold else 0)

pop_perc = np.mean(y)
print("% of popular songs: {:.2f}".format(np.mean(y)*100))

model = RandomForestClassifier(class_weight={0:9, 1:1}) #VotingClassifier(estimators=[('gbdt',GradientBoostingClassifier()), ('rf',RandomForestClassifier(class_weight={0:9, 1:1})), ('lr',LogisticRegression())], voting='soft')
model = model.fit(X, y)
y_pred = model.predict(X)
comp = (y, y_pred)
# print(model.coef_)
print(r2_score(y, y_pred), accuracy_score(y, y_pred), precision_score(y, y_pred), recall_score(y, y_pred), roc_auc_score(y, y_pred))

print("Accuracy: {:.2f} Precision: {:.2f} Recall: {:.2f} ROC_AUC: {:.2f}".format(np.mean(cross_val_score(model, X, y, scoring="accuracy", n_jobs=-1, cv=5)), np.mean(cross_val_score(model, X, y, scoring="precision", n_jobs=-1, cv=5)), np.mean(cross_val_score(model, X, y, scoring="recall", n_jobs=-1, cv=5)), np.mean(cross_val_score(model, X, y, scoring="roc_auc", n_jobs=-1, cv=10))))

In [None]:
ge

In [None]:
a = np.argsort(model.estimators_[0].feature_importances_)
for i in a:
    print(X.columns[i], model.estimators_[0].feature_importances_[i])

In [None]:
print(np.argsort(model.estimators_[1].feature_importances_))
model.estimators_[1].feature_importances_[np.argsort(model.estimators_[1].feature_importances_)]

In [None]:
cross_val_score(model, X, y, scoring="accuracy", n_jobs=-1)

In [None]:
X.head()

In [None]:
combo_df.track_listens.describe()

In [None]:
y.describe()

In [None]:
log_index = np.log(combo_df.popularity_index + np.abs(np.min(combo_df.popularity_index)) + 1)

In [None]:
sns.distplot(log_index)

In [None]:
log_index.describe()

In [None]:
combo_df.columns

In [None]:
for col in echonest.columns:
    print(col)