**Importing relevant libraries**

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import tensorflow as tf

**Importing Preprocessed Data**

In [3]:
df = pd.read_csv('data_preprocessed.csv')

In [4]:
df.head()

Unnamed: 0,year_41_60,year_61_80,year_81_00,year_01_20,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,speechiness,tempo,valence,popularity
0,0,0,0,0,0.995,0.708,158648,0.195,0,0.563,10,0.151,-12.428,0.0506,118.469,0.779,0
1,0,0,0,0,0.994,0.379,282133,0.0135,0,0.901,8,0.0763,-28.454,0.0462,83.972,0.0767,0
2,0,0,0,0,0.604,0.749,104300,0.22,0,0.0,5,0.119,-19.924,0.929,107.177,0.88,0
3,0,0,0,0,0.995,0.781,180760,0.13,0,0.887,1,0.111,-14.734,0.0926,108.003,0.72,0
4,0,0,0,0,0.99,0.21,687733,0.204,0,0.908,11,0.098,-16.829,0.0424,62.149,0.0693,1


**Let's segragate inputs and targets for regression and classfication**

In [5]:
inputs = df.drop(columns=['popularity'])
targets_regression = df.popularity
targets_classification = np.where(df.popularity>df.popularity.mean(),1,0)

In [6]:
inputs.shape, targets_regression.shape, targets_classification.shape

((169909, 16), (169909,), (169909,))

**Now we will scale our inputs.**

**There is no clear answer of should we scale our dummies or not, here we are scaling all inputs.**

In [7]:
scaler = StandardScaler()
scaler.fit(inputs)
scaled_inputs = scaler.transform(inputs)

In [8]:
x_train_regression, x_test_regression, y_train_regression, y_test_regression = train_test_split(
    scaled_inputs, targets_regression, train_size=0.8, random_state=20)
x_train_classification, x_test_classification, y_train_classification, y_test_classification = train_test_split(
    scaled_inputs, targets_classification, train_size=0.8, random_state=20)

**Linear Regression**

In [20]:
linear_reg = LinearRegression()
linear_reg.fit(x_train_regression,y_train_regression)
linear_reg.score(x_train_regression,y_train_regression)

0.7644935896889175

In [21]:
linear_reg.score(x_test_regression,y_test_regression)

0.7607981046568085

**Let's make a dataframe containing information about features and their predicted weights by different models.**

In [22]:
findings = pd.DataFrame(inputs.columns.values, columns=['features'])
findings['linear_regression_weights'] = linear_reg.coef_
findings

Unnamed: 0,features,linear_regression_weights
0,year_41_60,1.989786
1,year_61_80,11.245073
2,year_81_00,14.766021
3,year_01_20,20.281005
4,acousticness,-1.349986
5,danceability,1.322712
6,duration_ms,0.041982
7,energy,-0.325945
8,explicit,0.754893
9,instrumentalness,-0.822728


**Logistic Regression**

In [12]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [23]:
logistic_reg = LogisticRegression()
logistic_reg.fit(x_train_classification,y_train_classification)
logistic_reg.score(x_train_classification,y_train_classification)

0.8505962759422337

In [24]:
logistic_reg.score(x_test_classification,y_test_classification)

0.8490671531987523

In [25]:
findings['logistic_regression_weights'] = logistic_reg.coef_[0]
findings

Unnamed: 0,features,linear_regression_weights,logistic_regression_weights
0,year_41_60,1.989786,0.794487
1,year_61_80,11.245073,1.916374
2,year_81_00,14.766021,2.591455
3,year_01_20,20.281005,4.177414
4,acousticness,-1.349986,-0.205189
5,danceability,1.322712,0.140955
6,duration_ms,0.041982,0.074141
7,energy,-0.325945,-0.148301
8,explicit,0.754893,0.219512
9,instrumentalness,-0.822728,-0.085407


**KNN Classifications**

import math
math.sqrt(len(y_test_classification))

knn_classifier = KNeighborsClassifier(n_neighbors=183, p=2)
knn_classifier.fit(x_train_classification,y_train_classification)
knn_classifier.score(x_train_classification,y_train_classification)

note: As we noticed, KNN with this huge number of records is beyond computational abilities of our local machine.
For this, let's apply othe classification models suitable for large data.

**Decision Tree Classification**

In [26]:
dt_classifier = DecisionTreeClassifier(criterion='entropy', max_depth=10,random_state=100)
dt_classifier.fit(x_train_classification, y_train_classification)
dt_classifier.score(x_train_classification,y_train_classification)

0.8589242755302479

In [27]:
dt_classifier.score(x_test_classification,y_test_classification)

0.8513036313342358

In [29]:
findings['dt_classification']=dt_classifier.feature_importances_
findings

Unnamed: 0,features,linear_regression_weights,logistic_regression_weights,dt_classification
0,year_41_60,1.989786,0.794487,0.003208
1,year_61_80,11.245073,1.916374,0.169878
2,year_81_00,14.766021,2.591455,0.321434
3,year_01_20,20.281005,4.177414,0.427852
4,acousticness,-1.349986,-0.205189,0.018287
5,danceability,1.322712,0.140955,0.006333
6,duration_ms,0.041982,0.074141,0.008481
7,energy,-0.325945,-0.148301,0.006214
8,explicit,0.754893,0.219512,0.001965
9,instrumentalness,-0.822728,-0.085407,0.004759


**Random Forest Classification**

In [30]:
rf_classifier = RandomForestClassifier(criterion='entropy', max_depth=10, random_state=100, n_jobs=2)
rf_classifier.fit(x_train_classification,y_train_classification)
rf_classifier.score(x_train_classification,y_train_classification)

0.8590861271123471

In [31]:
rf_classifier.score(x_test_classification,y_test_classification)

0.854628921193573

In [32]:
findings['rf_classification']=rf_classifier.feature_importances_
findings

Unnamed: 0,features,linear_regression_weights,logistic_regression_weights,dt_classification,rf_classification
0,year_41_60,1.989786,0.794487,0.003208,0.166184
1,year_61_80,11.245073,1.916374,0.169878,0.063375
2,year_81_00,14.766021,2.591455,0.321434,0.134201
3,year_01_20,20.281005,4.177414,0.427852,0.287671
4,acousticness,-1.349986,-0.205189,0.018287,0.146654
5,danceability,1.322712,0.140955,0.006333,0.006695
6,duration_ms,0.041982,0.074141,0.008481,0.030793
7,energy,-0.325945,-0.148301,0.006214,0.050959
8,explicit,0.754893,0.219512,0.001965,0.007544
9,instrumentalness,-0.822728,-0.085407,0.004759,0.013591
