<font color='lime'>

# Model training and parameter selection with Grid Search

</font> 

<font color='yellow'>

## Decision Tree Classifier

</font> 

<font color='orange'>

### Load the data

</font> 

In [None]:
import pandas as pd

# Relative path of the input CSV file
file_path = 'data/spotify_songs_with_mood.csv'

# Read the whole file into a DataFrame
data = pd.read_csv(filepath_or_buffer=file_path)

<font color='orange'>

### Split into training and test subsets

</font> 

In [None]:
from sklearn.model_selection import train_test_split

# Target vector: the 'mood' column
y = data['mood']
# Feature matrix: every column except 'mood'
X = data.drop(columns=['mood'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Display the shapes of the four resulting subsets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

<font color='orange'>

### Tune hyperparameters with grid search

</font> 

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Split the feature columns by dtype. 'track_id' is kept out of the categorical
# list because it is a unique identifier rather than a feature. Filtering it out
# (instead of calling list.remove) avoids a ValueError when that column is
# missing or is not stored with the object dtype.
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = [
    col
    for col in X_train.select_dtypes(include=['object']).columns
    if col != 'track_id'
]

# For simplicity, only the numerical features are used to train the Decision Tree model
X_train_numerical = X_train[numerical_cols]
X_test_numerical = X_test[numerical_cols]

# Base estimator; the fixed seed makes tree construction reproducible
dt = DecisionTreeClassifier(random_state=42)

# Hyperparameter grid explored by GridSearchCV (4 * 3 * 3 = 36 combinations)
params = {
    'max_depth': [10, 20, 30, None],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10]
}

# Exhaustive search with 5-fold cross-validation, using all available cores
grid_search = GridSearchCV(estimator=dt, param_grid=params, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train_numerical, y_train)

# Best parameter combination found by the search (displayed as the cell output)
best_params = grid_search.best_params_
best_params

<font color='orange'>

### Save and test

</font> 

In [None]:
# GridSearchCV refits the best parameter combination on the full training data
# by default (refit=True), so reuse that fitted estimator instead of training an
# identical tree a second time.
best_dt = grid_search.best_estimator_

# Save the model
import joblib

model_filename = 'data/songs_mood_decision_tree_model.joblib'
joblib.dump(best_dt, model_filename)

# Predictions on the held-out test set
y_pred = best_dt.predict(X_test_numerical)

# Accuracy on the test set, expressed as a percentage
accuracy = accuracy_score(y_test, y_pred)
percent_accuracy = accuracy * 100

# Display where the model was written and how well it scored
model_filename, percent_accuracy

<font color='yellow'>

## Model evaluation

</font> 

<font color='orange'>

### Visualization of predicted values

</font> 

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Build the confusion matrix with rows/columns ordered like the model's classes
cm = confusion_matrix(y_test, y_pred, labels=best_dt.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_dt.classes_)

# Create the axes explicitly and hand them to `disp.plot`. Without `ax`, the
# display creates its own figure, so a preceding `plt.figure(figsize=...)` call
# would only produce an extra blank figure and the requested size would be lost.
fig, ax = plt.subplots(figsize=(10, 10))
disp.plot(cmap=plt.cm.Blues, ax=ax, xticks_rotation=45)
ax.set_title('Confusion Matrix for Predicted Mood Values')
plt.show()

<font color='orange'>

### Visualization of the decision tree

</font> 

In [None]:
from sklearn.tree import plot_tree

# Visualize only the top levels of the Decision Tree (max_depth=3) so the plot
# stays readable.
fig, ax = plt.subplots(figsize=(40, 20))
plot_tree(
    best_dt,
    max_depth=3,
    feature_names=list(numerical_cols),
    # plot_tree expects class names as a list of strings; `classes_` is a NumPy
    # array whose entries are not guaranteed to be str, so convert explicitly.
    class_names=[str(label) for label in best_dt.classes_],
    filled=True,
    ax=ax,
)
ax.set_title('Decision Tree Visualization (Top Levels)')
plt.show()