AI MODEL TRAINING NOTEBOOK WITH SPOTIFY AUDIO FEATURES

In [None]:
import pandas as pd
import json

# Load audio features from JSON file.
# The encoding is given explicitly: JSON text is UTF-8, whereas open() would
# otherwise decode with the platform's locale default, which can differ
# between machines.
with open('audio_features.json', encoding='utf-8') as file:  # Replace 'audio_features.json' with the actual file name
    data = json.load(file)

# Convert the parsed JSON data to a DataFrame
df_audio_features = pd.DataFrame(data)

# Display the first few rows to confirm it loaded correctly
print(df_audio_features.head())

In [None]:
# Report how many rows were loaded (first entry of the frame's shape)
print(df_audio_features.shape[0])

In [None]:
# Relevant features for mood classification
features = [
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'speechiness', 'tempo', 'valence'
]

# Build the feature matrix from the selected columns only, keeping just the
# numeric ones among them. (The selection used to be overwritten by a
# select_dtypes() over the whole frame, which pulled in every numeric column
# and left the `features` list unused.)
X = df_audio_features[features].select_dtypes(include=['number'])

# Generate sample mood labels to match the DataFrame length by cycling through
# the classes in row order.
# NOTE: these labels are placeholders assigned by row position; they are not
# derived from the audio features. Replace them with real annotations before
# drawing conclusions from the model.
moods = ['happy', 'sad', 'energetic', 'calm']
df_audio_features['mood'] = [moods[i % len(moods)] for i in range(len(df_audio_features))]

# Target variable (the 'mood' column was just assigned above, so it always exists)
y = df_audio_features['mood']

# Check the resulting feature matrix
print(X.head())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many rows carry each mood label, drawn through the Axes
# object that seaborn returns
ax = sns.countplot(x=y)
ax.set_title("Distribution of Classes")
ax.set_xlabel("Mood")
ax.set_ylabel("Count")
plt.show()


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features in two explicit steps: fit the scaler, then apply it.
# NOTE(review): the scaler is fit on every row of X, i.e. before the train/test
# split made later in the notebook -- confirm this is intended.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority classes with a seeded SMOTE instance.
# NOTE(review): resampling runs on the full scaled data, ahead of the
# train/test split made later in the notebook -- confirm this is intended.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Plot the class counts again to see the distribution after SMOTE
ax = sns.countplot(x=y_resampled)
ax.set_title("Distribution of Classes After SMOTE")
ax.set_xlabel("Mood")
ax.set_ylabel("Count")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Split the data into training (80%) and testing (20%) sets.
# stratify keeps each mood's share the same in both subsets, so the held-out
# set's class mix does not depend on the luck of the random draw.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=y_resampled,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters for the random forest (3 x 4 x 3 combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# Base estimator: class-weighted forest with a fixed seed
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# Search the grid with 5-fold cross-validation, scored by accuracy.
# fit() returns the search object itself, so construction and fitting chain.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Keep the best estimator found by the search and report its settings
best_rf_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the tuned model on the scaled data and the
# original labels (X_scaled / y, i.e. the data before SMOTE resampling)
cross_val_scores = cross_val_score(estimator=best_rf_model, X=X_scaled, y=y, cv=5)

# Print the per-fold scores followed by their mean
for caption, value in (
    ("Cross-validation scores:", cross_val_scores),
    ("Average cross-validation score:", cross_val_scores.mean()),
):
    print(caption, value)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict labels for the held-out test split
y_pred = best_rf_model.predict(X_test)

# Compute the metrics first, then print them
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print("Test Accuracy:", test_accuracy)
print("Classification Report:\n", report_text)


In [None]:
from sklearn.metrics import confusion_matrix

# Generate and plot the confusion matrix.
# The fitted classifier's class order is used for both the matrix and the tick
# labels, so rows and columns are named by mood rather than by integer
# position, and every trained class gets a row/column even if it happens to be
# missing from the test split.
class_labels = best_rf_model.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
sns.heatmap(
    conf_matrix,
    annot=True,
    cmap='Blues',
    fmt='d',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
