In [1]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
import pandas as pd


# Load the track table from the CSV file in the working directory.
df = pd.read_csv('result.csv')

# Track length in minutes, derived from the millisecond column
# (60000 ms per minute).
df["duration_mins"] = df["duration_ms"] / 60000

# Bucket the popularity score into three ordinal levels. The label is written
# to a copy, so `df` itself never gets the `popularity_level` column.
#   level 1: 0 <= popularity <= 50
#   level 2: 50 < popularity <= 70
#   level 3: 70 < popularity <= 100
data = df.copy()
popularity = df.popularity
level_masks = {
    1: (popularity >= 0) & (popularity <= 50),
    2: (popularity > 50) & (popularity <= 70),
    3: (popularity > 70) & (popularity <= 100),
}
for level, mask in level_masks.items():
    data.loc[mask, "popularity_level"] = level

# Rows matched by none of the masks stay NaN after the assignments above;
# the integer cast below raises in that case.
data["popularity_level"] = data["popularity_level"].astype("int")

# Class balance of the new label. The result is neither stored nor printed.
data['popularity_level'].value_counts()


# Define target variable 'y' and features 'X'.
#
# Besides the label itself, two more columns are excluded from the features:
#   - 'popularity': 'popularity_level' is a direct binning of this score, and
#     the preprocessor forwards unlisted columns unchanged
#     (remainder='passthrough'). Leaving it in hands the answer to the
#     classifiers and inflates every reported score.
#   - 'duration_ms': the same quantity as 'duration_mins', but it is not in the
#     list of min-max scaled columns. Passed through in raw milliseconds it
#     dwarfs the [0, 1]-scaled features in distance-based models.
y = data['popularity_level']
X = data.drop(columns=['popularity_level', 'popularity', 'duration_ms'])

# Split the data into training and test sets: 25% is held out for testing,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Preprocessing shared by both classifiers.
# Continuous audio features are rescaled with MinMaxScaler.
numeric_features = [
    'tempo', 'duration_mins', 'loudness',
    'energy', 'speechiness', 'danceability', 'liveness',
    'instrumentalness', 'valence',
    'acousticness',
]
# 'key' is treated as a categorical column and one-hot encoded.
categorical_features = ['key']

# Columns named in neither list are forwarded to the model untouched
# (remainder='passthrough').
preprocessor = ColumnTransformer(
    transformers=[
        ('minmax', MinMaxScaler(), numeric_features),
        ('categorical', OneHotEncoder(), categorical_features),
    ],
    remainder='passthrough',
)

# K-Nearest Neighbors (KNN) classifier, configured with the parameter values
# that the report below labels as the best ones.
knn_estimator = KNeighborsClassifier(n_neighbors=15, weights='distance', metric='manhattan')
pipeline_knn = make_pipeline(preprocessor, knn_estimator)

# Train on the training split, then predict the held-out split.
pipeline_knn.fit(X_train, y_train)
y_pred_knn = pipeline_knn.predict(X_test)

# Hold-out metrics: accuracy from the stored predictions, the pipeline's own
# score on both splits, and the per-class report.
accuracy_knn = accuracy_score(y_test, y_pred_knn)
train_score_knn = pipeline_knn.score(X_train, y_train)
test_score_knn = pipeline_knn.score(X_test, y_test)
cr_knn = classification_report(y_test, y_pred_knn)

# 10-fold stratified cross-validation, run on the training split only.
cv_knn = StratifiedKFold(n_splits=10)
scores_knn = cross_val_score(pipeline_knn, X_train, y_train, cv=cv_knn)

# Emit the whole report in one go; joining with newlines reproduces the
# line-by-line output, including the blank separator lines.
report_lines_knn = [
    f"Model: KNeighborsClassifier",
    f"Best parameters for KNN: {{'kneighborsclassifier__metric': 'manhattan', 'kneighborsclassifier__n_neighbors': 15, 'kneighborsclassifier__weights': 'distance'}}",
    f"Accuracy on Test Set for KNeighborsClassifier = {accuracy_knn:.2f}",
    f"Train Score for KNeighborsClassifier = {train_score_knn:.2f}",
    f"Test Score for KNeighborsClassifier = {test_score_knn:.2f}\n",
    cr_knn,
    f"KNeighborsClassifier: CrossVal Accuracy Mean: {scores_knn.mean():.2f} and Standard Deviation: {scores_knn.std():.2f} \n",
]
print("\n".join(report_lines_knn))

# Support-vector classifier, configured with the parameter values that the
# report below labels as the best ones.
# NOTE(review): per the scikit-learn SVC docs, `degree` and `gamma` only affect
# non-linear kernels, so with kernel='linear' they should have no effect here.
svc_estimator = SVC(C=0.1, kernel='linear', degree=2, gamma='scale')
pipeline_svc = make_pipeline(preprocessor, svc_estimator)

# Train on the training split, then predict the held-out split.
pipeline_svc.fit(X_train, y_train)
y_pred_svc = pipeline_svc.predict(X_test)

# Hold-out metrics: accuracy from the stored predictions, the pipeline's own
# score on both splits, and the per-class report.
accuracy_svc = accuracy_score(y_test, y_pred_svc)
train_score_svc = pipeline_svc.score(X_train, y_train)
test_score_svc = pipeline_svc.score(X_test, y_test)
cr_svc = classification_report(y_test, y_pred_svc)

# 10-fold stratified cross-validation, run on the training split only.
cv_svc = StratifiedKFold(n_splits=10)
scores_svc = cross_val_score(pipeline_svc, X_train, y_train, cv=cv_svc)

# Emit the whole report in one go; joining with newlines reproduces the
# line-by-line output, including the blank separator lines.
report_lines_svc = [
    f"Model: SVC",
    f"Best parameters for SVC: {{'svc__C': 0.1, 'svc__degree': 2, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}}",
    f"Accuracy on Test Set for SVC = {accuracy_svc:.2f}",
    f"Train Score for SVC = {train_score_svc:.2f}",
    f"Test Score for SVC = {test_score_svc:.2f}\n",
    cr_svc,
    f"SVC: CrossVal Accuracy Mean: {scores_svc.mean():.2f} and Standard Deviation: {scores_svc.std():.2f} \n",
]
print("\n".join(report_lines_svc))

NameError: name 'data' is not defined