## Part 1.3 Non-NN models using scikit-learn
This notebook preprocesses the sampled data and then trains and validates non-neural-network models on both the one-input and two-input feature sets using the scikit-learn library.

### 1.3.1 Two-input feature model training

#### Import dataset & preprocess

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
from joblib import dump
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the sampled training data.
music_df = pd.read_csv("p1_data/sample_train.csv", index_col=False, sep=",", quotechar='"')

# Two-input feature: lyrics and artist joined into a single text document per row.
# NOTE(review): a missing Lyrics or Artist value makes the joined text NaN, which
# TfidfVectorizer rejects -- confirm the sample contains no missing values.
X1 = music_df["Lyrics"] + " " + music_df["Artist"]
y1 = music_df["Genre"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)

# TF-IDF is fitted on the training text only and then reused for the test text.
# Only the features are vectorized; the genre labels are passed to the models as-is.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X1_train)
X_test_tfidf = vectorizer.transform(X1_test)

# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs("models_p11", exist_ok=True)
dump(vectorizer, "models_p11/rf_vectorizer.joblib")  # save for final testing

#### SVM

In [None]:
from sklearn.svm import SVC

# Train a linear-kernel SVM on the TF-IDF features of the two-input text.
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_train_tfidf, y1_train)

# Predict on the held-out split and report accuracy.
y_pred = svm_model.predict(X_test_tfidf)
accuracy = accuracy_score(y1_test, y_pred)
print("Test Accuracy:", accuracy)

# Confusion matrix. The label order is pinned to svm_model.classes_ so that rows and
# columns always line up with the tick labels below, even if some genre does not
# occur in the held-out split or in the predictions.
svm_conf_matrix = sklearn.metrics.confusion_matrix(y1_test, y_pred, labels=svm_model.classes_)
plt.figure(figsize=(10, 8))
sns.heatmap(
    svm_conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=svm_model.classes_,
    yticklabels=svm_model.classes_,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix SVM')
plt.show()

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a 200-tree Random Forest on the TF-IDF features of the two-input text.
# The fixed random_state makes the fitted forest reproducible.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_tfidf, y1_train)

# Predict on the held-out split and report accuracy.
y_pred1 = rf_model.predict(X_test_tfidf)
accuracy1 = accuracy_score(y1_test, y_pred1)
print("Test Accuracy:", accuracy1)

# Confusion matrix. The label order is pinned to rf_model.classes_ so that rows and
# columns always line up with the tick labels below, even if some genre does not
# occur in the held-out split or in the predictions.
rf_conf_matrix = sklearn.metrics.confusion_matrix(y1_test, y_pred1, labels=rf_model.classes_)
plt.figure(figsize=(10, 8))
sns.heatmap(
    rf_conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=rf_model.classes_,
    yticklabels=rf_model.classes_,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - Random Forest')
plt.show()

# Persist the fitted forest for final testing.
dump(rf_model, "models_p11/rf_model.joblib")

#### MLP

In [None]:
from sklearn.neural_network import MLPClassifier

# Train an MLP with three hidden layers (128, 64, 32 units), tanh activations and the
# Adam solver on the TF-IDF features of the two-input text.
mlp_model = MLPClassifier(hidden_layer_sizes=(128,64,32), activation='tanh', solver='adam', random_state=42)
mlp_model.fit(X_train_tfidf, y1_train)

# Predict on the held-out split and report accuracy.
y_pred2 = mlp_model.predict(X_test_tfidf)
accuracy2 = accuracy_score(y1_test, y_pred2)
print("Test Accuracy:", accuracy2)

# Confusion matrix. The label order is pinned to mlp_model.classes_ so that rows and
# columns always line up with the tick labels below, even if some genre does not
# occur in the held-out split or in the predictions.
mlp_conf_matrix = sklearn.metrics.confusion_matrix(y1_test, y_pred2, labels=mlp_model.classes_)

# Plot the confusion matrix as an annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(
    mlp_conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=mlp_model.classes_,
    yticklabels=mlp_model.classes_,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix MLP')
plt.show()

### 1.3.2 One-input feature model training

#### Import dataset & preprocess

In [None]:
# One-input variant: the lyrics text alone is the feature, genre is the target.
y1_b = music_df["Genre"]
X1_b = music_df["Lyrics"]

# Same 80/20 hold-out and seed as used for the two-input experiments.
(
    X1_trainb,
    X1_testb,
    y1_trainb,
    y1_testb,
) = train_test_split(X1_b, y1_b, random_state=42, test_size=0.2)

# A separate TF-IDF vectorizer is fitted on the lyrics-only training text and then
# applied unchanged to the test text. Labels are not vectorized.
vectorizer_b = TfidfVectorizer()
X_train_tfidf_b = vectorizer_b.fit_transform(X1_trainb)
X_test_tfidf_b = vectorizer_b.transform(X1_testb)

#### SVM

In [None]:
# Train a linear-kernel SVM on the lyrics-only TF-IDF features.
svm_model_b = SVC(kernel='linear', C=1.0)
svm_model_b.fit(X_train_tfidf_b, y1_trainb)

# Predict on the held-out split and report accuracy.
y_pred_b = svm_model_b.predict(X_test_tfidf_b)
svm_accuracy_b = accuracy_score(y1_testb, y_pred_b)
print("Test Accuracy:", svm_accuracy_b)

# Confusion matrix. The label order is pinned to svm_model_b.classes_ so that rows and
# columns always line up with the tick labels below, even if some genre does not
# occur in the held-out split or in the predictions.
svm_conf_matrix_b = sklearn.metrics.confusion_matrix(y1_testb, y_pred_b, labels=svm_model_b.classes_)
plt.figure(figsize=(10, 8))
sns.heatmap(
    svm_conf_matrix_b,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=svm_model_b.classes_,
    yticklabels=svm_model_b.classes_,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix SVM')
plt.show()

#### Random Forest

In [None]:
# Train a 200-tree Random Forest on the lyrics-only TF-IDF features.
rf_model_b = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model_b.fit(X_train_tfidf_b, y1_trainb)

# Predict with rf_model_b, the forest trained just above. The two-input rf_model was
# fitted on a different TF-IDF vocabulary, so it must not be used on these features.
y_pred_b1 = rf_model_b.predict(X_test_tfidf_b)
rf_accuracy_b = accuracy_score(y1_testb, y_pred_b1)
print("Test Accuracy:", rf_accuracy_b)

# Confusion matrix. The label order is pinned to rf_model_b.classes_ so that rows and
# columns always line up with the tick labels below, even if some genre does not
# occur in the held-out split or in the predictions.
rf_conf_matrix_b = sklearn.metrics.confusion_matrix(y1_testb, y_pred_b1, labels=rf_model_b.classes_)
plt.figure(figsize=(10, 8))
sns.heatmap(
    rf_conf_matrix_b,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=rf_model_b.classes_,
    yticklabels=rf_model_b.classes_,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - Random Forest')
plt.show()

#### MLP

In [None]:
# Train an MLP with three hidden layers (128, 64, 32 units), tanh activations and the
# Adam solver on the lyrics-only TF-IDF features.
mlp_model_b = MLPClassifier(hidden_layer_sizes=(128,64,32), activation='tanh', solver='adam', random_state=42)
mlp_model_b.fit(X_train_tfidf_b, y1_trainb)

# Predict on the held-out split and report accuracy.
y_pred_b2 = mlp_model_b.predict(X_test_tfidf_b)
mlp_accuracy_b = accuracy_score(y1_testb, y_pred_b2)
print("Test Accuracy:", mlp_accuracy_b)

# Confusion matrix. The label order is pinned to mlp_model_b.classes_ so that rows and
# columns always line up with the tick labels below, even if some genre does not
# occur in the held-out split or in the predictions.
mlp_conf_matrix_b = sklearn.metrics.confusion_matrix(y1_testb, y_pred_b2, labels=mlp_model_b.classes_)
plt.figure(figsize=(10, 8))
sns.heatmap(
    mlp_conf_matrix_b,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=mlp_model_b.classes_,
    yticklabels=mlp_model_b.classes_,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix MLP')
plt.show()