In [114]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import metrics 
from sklearn import svm

In [115]:
#read in the data using pandas

df = pd.read_csv(Path.cwd().parent / "data" / "02_tracks_from_spotify_with_word_count.csv")
to_drop = [col for col in df.columns if "Unnamed" in col.split(":")] + ["track_uri", "track_name", "album"]


# df = pd.read_csv(Path.cwd().parent / "data" / "final_dataset.csv")
#check data has been read in properly
# to_drop = [col for col in df.columns if "Unnamed" in col.split(":")] + ["track_uri", "artist_genres"]
# to_drop += [col for i, col in enumerate(df.columns) if i > 14]

to_drop

['Unnamed: 0', 'Unnamed: 0.1', 'track_uri', 'track_name', 'album']

In [116]:
df = df.drop(columns=to_drop)
df.shape

(8052, 12)

In [117]:
df.iloc[-1, :]

artist_name                                                         1349
artist_popularity                                                     33
artist_followers                                                   89380
artist_genres          ('black metal', 'dark black metal', 'metal', '...
track_popularity                                                      35
track_danceability                                                 0.176
track_energy                                                       0.963
track_loudness                                                    -4.817
track_tempo                                                      123.779
track_duration_ms                                                 250507
is_explicit_content                                                False
words_count                                                          179
Name: 8051, dtype: object

In [118]:
df.isnull().sum()

artist_name              0
artist_popularity        0
artist_followers         0
artist_genres            0
track_popularity         0
track_danceability       0
track_energy             0
track_loudness           0
track_tempo              0
track_duration_ms        0
is_explicit_content      0
words_count            985
dtype: int64

In [119]:
X = df.drop(columns=["is_explicit_content"])
y = df["is_explicit_content"]

In [120]:
df.dtypes

artist_name             object
artist_popularity        int64
artist_followers         int64
artist_genres           object
track_popularity         int64
track_danceability     float64
track_energy           float64
track_loudness         float64
track_tempo            float64
track_duration_ms        int64
is_explicit_content       bool
words_count            float64
dtype: object

In [121]:
# TEMP!

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["artist_name"] = le.fit_transform(df["artist_name"])

In [122]:
df.iloc[-1, :]

artist_name                                                           10
artist_popularity                                                     33
artist_followers                                                   89380
artist_genres          ('black metal', 'dark black metal', 'metal', '...
track_popularity                                                      35
track_danceability                                                 0.176
track_energy                                                       0.963
track_loudness                                                    -4.817
track_tempo                                                      123.779
track_duration_ms                                                 250507
is_explicit_content                                                False
words_count                                                          179
Name: 8051, dtype: object

# Remove Outliers

In [123]:
df.shape

(8052, 12)

In [124]:
df.dropna(inplace=True)

In [125]:
df.shape

(7067, 12)

In [126]:
df.isnull().sum()

artist_name            0
artist_popularity      0
artist_followers       0
artist_genres          0
track_popularity       0
track_danceability     0
track_energy           0
track_loudness         0
track_tempo            0
track_duration_ms      0
is_explicit_content    0
words_count            0
dtype: int64

In [127]:
df.iloc[-1, :]

artist_name                                                           10
artist_popularity                                                     33
artist_followers                                                   89380
artist_genres          ('black metal', 'dark black metal', 'metal', '...
track_popularity                                                      35
track_danceability                                                 0.176
track_energy                                                       0.963
track_loudness                                                    -4.817
track_tempo                                                      123.779
track_duration_ms                                                 250507
is_explicit_content                                                False
words_count                                                          179
Name: 8051, dtype: object

In [128]:
# df["country_code"] = df["country_code"].astype('category')
df["artist_name"] = df["artist_name"].astype('category')

In [129]:
# take all numerics
quatiative = df.select_dtypes(exclude=['category'])

# boundaries for chaking outliers
upper_bound = quatiative.mean() + 3 * quatiative.std()
lower_bound = quatiative.mean() - 3 * quatiative.std()

outliers = []

for num, row in quatiative.iterrows():
    outliers_in_row = [1 for col in quatiative.columns if row[col] > upper_bound[col] or row[col] < lower_bound[col]].count(1)    
    # add this index(row) if 30% of it's content is outlaiered
    if outliers_in_row > 0.1 * len(row):
        outliers.append(num)

print(f"Number of indexes with 30% outliers of it's content are: {len(outliers)}")

KeyError: 'artist_genres'

In [None]:
df.drop(outliers, inplace=True)

df.shape

In [130]:
df["is_explicit_content"] = df["is_explicit_content"].astype("int").astype("category")

In [131]:
df.iloc[-1, :]

artist_name                                                           10
artist_popularity                                                     33
artist_followers                                                   89380
artist_genres          ('black metal', 'dark black metal', 'metal', '...
track_popularity                                                      35
track_danceability                                                 0.176
track_energy                                                       0.963
track_loudness                                                    -4.817
track_tempo                                                      123.779
track_duration_ms                                                 250507
is_explicit_content                                                    0
words_count                                                          179
Name: 8051, dtype: object

In [132]:
df.dtypes

artist_name            category
artist_popularity         int64
artist_followers          int64
artist_genres            object
track_popularity          int64
track_danceability      float64
track_energy            float64
track_loudness          float64
track_tempo             float64
track_duration_ms         int64
is_explicit_content    category
words_count             float64
dtype: object

# Normalization

In [15]:
df.describe()

Unnamed: 0,artist_popularity,artist_followers,track_popularity,track_danceability,track_energy,track_loudness,track_tempo,track_duration_ms,words_count
count,7067.0,7067.0,7067.0,7067.0,7067.0,7067.0,7067.0,7067.0,7067.0
mean,63.492571,4086961.0,50.185227,0.605783,0.65587,-7.435617,120.875314,219265.3,1421.834159
std,17.056069,10584770.0,23.807716,0.166263,0.216246,3.653642,29.505605,68929.93,8409.516816
min,0.0,1.0,0.0,0.0667,0.0047,-30.928,38.006,15160.0,0.0
25%,52.0,54975.0,41.0,0.494,0.518,-8.904,96.407,177150.0,232.0
50%,65.0,485157.0,55.0,0.614,0.682,-6.589,120.01,208693.0,330.0
75%,76.0,2700366.0,67.0,0.728,0.831,-4.9455,140.0585,246539.0,527.5
max,100.0,91127160.0,100.0,0.981,1.0,0.763,216.194,1050893.0,154678.0


In [16]:
from sklearn.preprocessing import MinMaxScaler
cols_to_scale = ["artist_popularity","artist_followers","track_popularity", "track_tempo", "track_loudness", "track_duration_ms"]
for c in cols_to_scale:
    scaler = MinMaxScaler() 
    scaled_values = scaler.fit_transform(df[c].to_numpy().reshape(-1, 1)) 
    df.loc[:,c] = scaled_values

In [67]:
df.describe()

Unnamed: 0,artist_popularity,artist_followers,track_popularity,track_danceability,track_energy,track_loudness,track_tempo,track_duration_ms
count,8052.0,8052.0,8052.0,8052.0,8052.0,8052.0,8052.0,8052.0
mean,62.328738,3816801.0,49.513413,0.601675,0.637045,-7.929202,120.160684,222110.0
std,17.501853,10207610.0,23.536597,0.171643,0.233869,4.481629,29.810634,80429.11
min,0.0,1.0,0.0,0.0,0.0021,-43.738,0.0,15160.0
25%,50.0,38665.0,41.0,0.489,0.494,-9.3755,95.9705,175720.5
50%,64.0,398996.0,54.0,0.613,0.672,-6.7625,119.9565,208465.5
75%,75.0,2322952.0,66.0,0.729,0.823,-5.02875,140.00825,249483.2
max,100.0,91127160.0,100.0,0.981,1.0,0.763,216.194,1252322.0


In [133]:
X = df.drop(columns=["is_explicit_content", "artist_genres"])
y = df["is_explicit_content"]

In [36]:
df.dropna()

Unnamed: 0,artist_name,artist_popularity,artist_followers,artist_genres,track_popularity,track_danceability,track_energy,track_loudness,track_tempo,track_duration_ms,is_explicit_content,words_count
0,1308,0.86,0.025819,"('gauze pop', 'indietronica', 'shiver pop')",0.90,0.761,0.525,0.758196,0.240555,0.215929,0.0,411.0
1,1261,0.85,0.002405,(),1.00,0.695,0.540,0.796314,0.470997,0.148148,1.0,343.0
2,3336,0.90,0.035248,"('australian hip hop',)",0.97,0.591,0.764,0.802878,0.740353,0.122277,1.0,606.0
3,1067,0.90,0.094098,"('glam rock', 'mellow gold', 'piano rock')",0.96,0.796,0.798,0.776750,0.437886,0.181104,0.0,280.0
4,74,0.94,0.363534,"('british soul', 'pop', 'pop soul', 'uk pop')",0.97,0.604,0.366,0.738664,0.583513,0.202306,0.0,208.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6192,40,0.38,0.000036,(),0.32,0.792,0.873,0.703859,0.488192,0.213380,0.0,1547.0
6194,1892,0.76,0.015724,"('australian dance', 'australian pop', 'dance ...",0.57,0.767,0.643,0.780979,0.460048,0.239186,0.0,382.0
6195,1841,0.43,0.000185,"('vogue',)",0.37,0.803,0.888,0.748730,0.465677,0.303804,0.0,369.0
6196,1750,0.72,0.010305,"('escape room', 'indie soul', 'lgbtq+ hip hop')",0.69,0.794,0.757,0.766274,0.392754,0.165603,0.0,340.0


# All in one

In [134]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [135]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

clf1 = DecisionTreeClassifier()
clf2 = RandomForestClassifier(n_estimators=200)
clf3 = KNeighborsClassifier(n_neighbors=4)
clf4 = LogisticRegression()
alg_names = ["decision tree", "random forest", "KNN", "logistic regression"]

In [136]:
df.isnull().sum()

artist_name            0
artist_popularity      0
artist_followers       0
artist_genres          0
track_popularity       0
track_danceability     0
track_energy           0
track_loudness         0
track_tempo            0
track_duration_ms      0
is_explicit_content    0
words_count            0
dtype: int64

In [137]:
for idx, clf in enumerate([clf1, clf2, clf3, clf4]):
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred = clf.predict(X_test)
    
#     print(clf.feature_importances_)
    
    print(alg_names[idx])
    print("==========================")
#     print(f"\tTrain data")
#     print("\t==========================")
#     print(f"{metrics.classification_report(y_true = y_train, y_pred = y_pred_train)}")
#     print(f"\tTest data")
#     print("\t==========================")
#     print(f"{metrics.classification_report(y_true = y_test, y_pred = y_pred)}")


    print('f1 on Train data= ', metrics.f1_score(y_true = y_train, y_pred = y_pred_train))
    print('f1 on test data= ', metrics.f1_score(y_true = y_test, y_pred = y_pred))
#     print('Accuracy on Train data= ', metrics.accuracy_score(y_true = y_train, y_pred = y_pred_train))
#     print('Accuracy on test data= ', metrics.accuracy_score(y_true = y_test, y_pred = y_pred))
    print()

print(df.shape)

decision tree
f1 on Train data=  1.0
f1 on test data=  0.565365025466893

random forest
f1 on Train data=  1.0
f1 on test data=  0.666046511627907

KNN
f1 on Train data=  0.5453690644099671
f1 on test data=  0.357396449704142

logistic regression
f1 on Train data=  0.020242914979757085
f1 on test data=  0.032679738562091505

(7067, 12)


In [140]:
from sklearn.model_selection import GridSearchCV

params_decision_tree = {"max_depth": [7,9,15], "min_samples_split": [3,5,7,10]}
params_random_forest = {"n_estimators": [150,200, 250],"max_depth": [9,15,17]}
params_knn = {"n_neighbors": range(3, 20)}

clf_params = [(DecisionTreeClassifier(), params_decision_tree),
             (RandomForestClassifier(), params_random_forest), 
             (KNeighborsClassifier(), params_knn)]
             

for idx, (clf, params) in enumerate(clf_params):
    clf_cv = GridSearchCV(clf,params,cv=10)
    clf_cv.fit(X,y)
    print(alg_names[idx])
    print("==========================")
    print(f"best params are: {clf_cv.best_params_}")
    print(f"best score is: {clf_cv.best_score_}")

decision tree
best params are: {'max_depth': 9, 'min_samples_split': 7}
best score is: 0.7904427998445333
random forest
best params are: {'max_depth': 15, 'n_estimators': 250}
best score is: 0.8239737389360142
KNN
best params are: {'n_neighbors': 4}
best score is: 0.7218058187850351
