In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import StandardScaler, LabelEncoder 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix   
import warnings
warnings.filterwarnings('ignore')  

## Read the tab-separated feature table into a Pandas DataFrame
features = pd.read_csv('features.txt', sep='\t')

# The 'Genre' column is the target; every remaining column is a predictor.
y = features['Genre']
X = features.drop(columns=['Genre'])

# LabelEncoder turns each genre name into an integer class id. It is intended
# for target labels; the integers are identifiers, not an ordering of genres.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

# Hold out 20% of the rows for testing. Stratifying on the encoded genre keeps
# the genre proportions the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=1,
    stratify=y_encoded,
)

# Learn the standardization statistics from the training rows only, then
# apply them to those same rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

## Hyperparameter Tuning
# Exhaustive search: every combination of the values below is fitted and
# scored by 5-fold cross-validated accuracy on the scaled training data.
param_grid = {
    'C': [0.1, 1, 10, 1000],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [1000, 5000],
}

# NOTE(review): X_train_scaled was standardized with statistics from the whole
# training set, so each validation fold also contributed to the scaling. For a
# strictly leak-free CV estimate, consider tuning a scaler+model Pipeline.
grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=1),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train_scaled, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

## Observations
## The results show the stability of Logistic Regression
## The features and preprocessing are already well-suited, since the best cross-validation score is extremely close to the actual score
## The model has been evaluated clearly


Best Parameters:  {'C': 0.1, 'max_iter': 1000, 'solver': 'lbfgs'}
Best Score:  0.6908411949685535
