In [5]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from dataprep.eda import create_report

# Reading Data

In [3]:
# Read the training CSV into a DataFrame, then preview its first five rows.
raw_data = pd.read_csv(filepath_or_buffer='../data/train.csv')
raw_data.head(n=5)

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
0,Bruno Mars,That's What I Like (feat. Gucci Mane),60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.0171,,0.0849,0.899,134.071,234596.0,4,5
1,Boston,Hitch a Ride,54.0,0.382,0.814,3.0,-7.23,1,0.0406,0.0011,0.00401,0.101,0.569,116.454,251733.0,4,10
2,The Raincoats,No Side to Fall In,35.0,0.434,0.614,6.0,-8.334,1,0.0525,0.486,0.000196,0.394,0.787,147.681,109667.0,4,6
3,Deno,Lingo (feat. J.I & Chunkz),66.0,0.853,0.597,10.0,-6.528,0,0.0555,0.0212,,0.122,0.569,107.033,173968.0,4,5
4,Red Hot Chili Peppers,Nobody Weird Like Me - Remastered,53.0,0.167,0.975,2.0,-4.279,1,0.216,0.000169,0.0161,0.172,0.0918,199.06,229960.0,4,10


In [4]:
# Structural overview of the raw frame:
#   - 17,996 observations and 17 columns.
#   - 'Artist Name' and 'Track Name' are strings (object dtype); every other
#     column is numeric.
#   - 'Popularity', 'key' and 'instrumentalness' contain missing values.
# These notes are comments rather than a bare string literal: a string left as
# the last expression of a cell is echoed back as the cell's output.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17996 entries, 0 to 17995
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Artist Name         17996 non-null  object 
 1   Track Name          17996 non-null  object 
 2   Popularity          17568 non-null  float64
 3   danceability        17996 non-null  float64
 4   energy              17996 non-null  float64
 5   key                 15982 non-null  float64
 6   loudness            17996 non-null  float64
 7   mode                17996 non-null  int64  
 8   speechiness         17996 non-null  float64
 9   acousticness        17996 non-null  float64
 10  instrumentalness    13619 non-null  float64
 11  liveness            17996 non-null  float64
 12  valence             17996 non-null  float64
 13  tempo               17996 non-null  float64
 14  duration_in min/ms  17996 non-null  float64
 15  time_signature      17996 non-null  int64  
 16  Clas

' \n17996 observation\n17 variable\nArtist Name, and Track Name are of type str, while the rest are numerical variables.\n'

In [13]:
# Build the automated EDA report from the raw (not yet imputed) frame.
auto_report = create_report(
    raw_data,
    title="Auto EDA report",
)



  0%|          | 0/3015 [00:00<?, ?it/s]

In [14]:
# Persist the generated EDA report as an HTML file in the report directory.
auto_report.save(
    path="../report/Auto_EDA_report.html",
)

Report has been saved to ../report/Auto_EDA_report.html!


# Data Imputing

In [15]:
# Count the missing entries in each column of the raw frame.
raw_data.isna().sum()

Artist Name              0
Track Name               0
Popularity             428
danceability             0
energy                   0
key                   2014
loudness                 0
mode                     0
speechiness              0
acousticness             0
instrumentalness      4377
liveness                 0
valence                  0
tempo                    0
duration_in min/ms       0
time_signature           0
Class                    0
dtype: int64

In [16]:
# Replace every missing entry with 0. fillna returns a new frame here, so
# raw_data itself is left unchanged.
df = raw_data.fillna(value=0)

In [17]:
# Alternative imputation (not applied): fill each incomplete column with its own
# mean instead of zero. It is kept as a comment rather than a string literal so
# the cell carries no dead expression, and it is written so that raw_data is
# never modified (no aliasing, no inplace=True):
#
#   incomplete_cols = ['Popularity', 'key', 'instrumentalness']
#   df = raw_data.fillna(raw_data[incomplete_cols].mean())
#
# Confirm that no missing values remain in df after imputation.
df.isnull().sum()

Artist Name           0
Track Name            0
Popularity            0
danceability          0
energy                0
key                   0
loudness              0
mode                  0
speechiness           0
acousticness          0
instrumentalness      0
liveness              0
valence               0
tempo                 0
duration_in min/ms    0
time_signature        0
Class                 0
dtype: int64

In [18]:
# Encode categorical variables as integer codes.
encoder = LabelEncoder()

# Encoding artist name: learn the label -> code mapping, then apply it.
encoder.fit(df['Artist Name'])
df['Artist Name'] = encoder.transform(df['Artist Name'])

In [19]:
# 'Track Name' is removed from the frame, so it is dropped directly instead of
# being label-encoded first. Encoding a column that is deleted on the next line
# is wasted work, and refitting the shared `encoder` on it would overwrite the
# 'Artist Name' mapping held in encoder.classes_.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run after the column is already gone.
df = df.drop(columns='Track Name', errors='ignore')

In [20]:
# Preview the first two rows of the processed frame.
df.head(n=2)

Unnamed: 0,Artist Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
0,1182,60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.0171,0.0,0.0849,0.899,134.071,234596.0,4,5
1,1092,54.0,0.382,0.814,3.0,-7.23,1,0.0406,0.0011,0.00401,0.101,0.569,116.454,251733.0,4,10


# Data Splitting

In [22]:
# Separate the target from the predictors, then hold out a test partition.
# y is kept as a single-column DataFrame; random_state=0 pins the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified on 'Class' - confirm that is intended.
y = df[['Class']]
X = df.drop(columns='Class')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

# 