# Simple baseline model (for comparison with the final model)

A simple baseline model is built and tested.  
No feature engineering is applied apart from simple one-hot encoding of suitable categorical features with a small number of unique values; numerical features are also standardized.  
No hyperparameter tuning yet.

In [None]:
# import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import validation_curve

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve

In [None]:
# Load the Spotify dataset from the CSV file into a DataFrame and
# show its first rows as the cell output.
df = pd.read_csv('data/spotify_dataset.csv')
df.head()

### Train-Test-Split

In [None]:
# First split: hold out 30 % of the rows, the rest becomes the training set.
# random_state is fixed so the split is reproducible.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

for split_label, split_frame in (('df_train: ', df_train), ('df_test: ', df_test)):
    print(split_label, split_frame.shape)

# Second split: divide the held-out part again, 33 % of it becomes the
# validation (val/aim) set and the remainder stays the test set.
df_test, df_val = train_test_split(df_test, test_size=0.33, random_state=42)

for split_label, split_frame in (('df_test: ', df_test), ('df_val: ', df_val)):
    print(split_label, split_frame.shape)

### Data cleaning

In [None]:
from src.features.clean_data_func import clean_data

# Clean the training split first and preview the cleaned result.
df_train_cleaned = clean_data(df_train)
display(df_train_cleaned.head())

# Run the same cleaning function on the test and validation splits
# (test is processed before val, as map walks the tuple in order).
df_test_cleaned, df_val_cleaned = map(clean_data, (df_test, df_val))


In [None]:
# Columns removed from the model input; the list also contains the target
# column 'popularity_cat' itself. No further feature engineering happens here.
features_to_drop = [
    'track_id',
    'artists',
    'album_name',
    'track_name',
    'track_genre',
    'popularity',
    'popularity_cat',
]


def _split_features_target(cleaned_frame):
    """Return (features, target) for one cleaned split.

    Features are all columns except those in ``features_to_drop``;
    the target is the 'popularity_cat' column.
    """
    feature_part = cleaned_frame.drop(columns=features_to_drop)
    target_part = cleaned_frame['popularity_cat']
    return feature_part, target_part


# Apply the identical split to the train, test and validation data.
features_train, target_train = _split_features_target(df_train_cleaned)
features_test, target_test = _split_features_target(df_test_cleaned)
features_val, target_val = _split_features_target(df_val_cleaned)

In [None]:
# Sanity check of the training data: show the first rows and the shape of
# the features, then the same for the target.
for checked in (features_train, target_train):
    display(checked.head(), checked.shape)

### Data preparation and training

In [None]:
# Show the names of the training feature columns as the cell output
# (convenient for copy-pasting into other cells).
features_train.columns

In [None]:
## DecisionTreeClassifier() can handle numerically encoded features that are actually categorical well:
# - 'key' (0-11)
# - 'mode' (0-1)
# - 'time_signature' (0-4)
# - 'explicit' (0-1)

In [None]:
# Simple baseline pipeline: a StandardScaler as the 'preprocessor' step,
# followed by a DecisionTreeClassifier as the 'model' step.
# The tree uses balanced class weights and a fixed random_state.
pipeline_tree = Pipeline([
    ('preprocessor', StandardScaler()),
    ('model', DecisionTreeClassifier(class_weight='balanced', random_state=42)),
])

In [None]:
# Fit the baseline pipeline on the training data and predict the test data.
# fit() returns the fitted pipeline itself, so both calls can be chained.
target_test_pred = pipeline_tree.fit(features_train, target_train).predict(features_test)

# Test metrics. Separate accuracy / weighted precision / recall / F1 printouts
# are omitted; the classification report already lists these figures.
test_confusion = confusion_matrix(target_test, target_test_pred)
print('Confusion Matrix: \n', test_confusion, '\n')

test_report = classification_report(target_test, target_test_pred)
print('Classification Report: \n', test_report)

In [None]:
# Predict the validation data with the already fitted pipeline.
target_val_pred = pipeline_tree.predict(features_val)

# Validation metrics: confusion matrix first, then the per-class report.
val_confusion = confusion_matrix(target_val, target_val_pred)
print('Confusion Matrix: \n', val_confusion, '\n')

val_report = classification_report(target_val, target_val_pred)
print('Classification Report: \n', val_report)

In [None]:
# Save the classification report of the validation data in the results folder
# of src, so the final model notebook can load it for a direct comparison.
from pathlib import Path

report_path = Path('src/results/simple_model_classification_report.csv')

# output_dict=True yields a nested dict; after transposing, every class /
# average is one row and every metric is one column.
simple_model_classification_report = pd.DataFrame(
    classification_report(target_val, target_val_pred, output_dict=True)
).transpose()

# Rename by column name rather than assigning a positional list, so a metric
# can never end up under the wrong label if the column order differs.
# errors='raise' makes a missing metric column fail loudly instead of silently.
simple_model_classification_report = simple_model_classification_report.rename(
    columns={
        'precision': 'precision_simple',
        'recall': 'recall_simple',
        'f1-score': 'f1_score_simple',
        'support': 'support_simple',
    },
    errors='raise',
)

# to_csv does not create missing folders, so make sure the target exists.
report_path.parent.mkdir(parents=True, exist_ok=True)
simple_model_classification_report.to_csv(report_path)

In [None]:
# Cross-validate the baseline pipeline on the training data with 5 folds,
# scored by the weighted F1; n_jobs=-1 runs the folds on all processors.
cv_results = cross_val_score(
    pipeline_tree,
    features_train,
    target_train,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
)

# Mean of the per-fold scores, shown as the cell output.
cv_results.mean()