In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import seaborn.objects as so

In [None]:
# Load the dataset from the project-relative CSV and preview the first rows.
df = pd.read_csv('data/data.csv')
df.head()

In [None]:
# Summary statistics for the popularity column.
df["popularity"].describe()

In [None]:
# Show every row whose popularity is exactly 0.
df[df['popularity'].eq(0)]

In [None]:
# Histogram of popularity over the full dataset.
p = so.Plot(df, x="popularity")
p.add(so.Bars(), so.Hist())

In [None]:
# A popularity of 0 is likely just a placeholder for a missing value,
# so remove those songs from df (in place).
is_zero_popularity = df["popularity"] == 0
df.drop(index=df.loc[is_zero_popularity].index, inplace=True)

In [None]:
# Sanity check: no rows with popularity 0 should remain (expect an empty frame).
df[df["popularity"].eq(0)]

In [None]:
# Histogram of popularity again, now that the zero-popularity rows are gone.
p = so.Plot(df, x="popularity")
p.add(so.Bars(), so.Hist())

In [None]:
# Relative frequency of each target class.
df['target'].value_counts(normalize=True)

In [None]:
# There's clearly some class imbalance here; I think we need to apply SMOTE.

In [None]:
# train test split
from sklearn.model_selection import train_test_split, GridSearchCV

# Columns removed from df to build the feature matrix X (includes the target).
to_drop = ['popularity', 'id', 'artists', 'name', 'release_date', 'year', 'target']

y = df['target']
X = df.drop(columns=to_drop)

# Seeded split with the default test size so the result is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=27)

X_train.info()

In [None]:
# Peek at the first five training labels.
y_train.head(5)

In [None]:
# Column dtypes and non-null counts for the held-out test features.
X_test.info()

# Baseline Model - might delete this since we didn't do any preprocessing, so it feels a little irrelevant
Let's just fit a dummy classifier.

In [None]:
from sklearn.dummy import DummyClassifier
# Baseline: a classifier that always predicts the most frequent training class.
# It is fitted on the raw training split.
dummy_model = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)

# preview the first 10 test-set predictions
dummy_model.predict(X_test)[:10]

In [None]:
# Class balance of the training labels: fraction of rows in each target class
print(y_train.value_counts(normalize=True))

It makes sense that the dummy classifier predicted all zeroes: 0 is the most frequent class in the training set, and the train and test sets have basically the same ratio of 0 to 1.

# First model - no SMOTE

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, classification_report



# Columns that are standardized before modelling.
numeric_features = ['valence', 'acousticness', 'energy', 'danceability', 'duration_ms', 'explicit',
                    'instrumentalness', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo']
numeric_transformer = StandardScaler()

# 'key' is one-hot encoded. handle_unknown='ignore' keeps transform() from raising
# when a category was absent from the data the encoder was fitted on
# (for example a small cross-validation fold).
categorical_features = ['key']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')


# Preprocessing step shared by the pipelines in this notebook. Output columns are
# the scaled numeric features followed by the one-hot 'key' columns.
CT = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Logistic Regression pipeline WITHOUT SMOTE: the only imbalance handling here is
# class_weight='balanced'.
pipe_lr = ImPipeline(steps=[
    ('preprocessor', CT),
    ('classifier', LogisticRegression(random_state=0, class_weight='balanced'))
])

In [None]:
# Fit the no-SMOTE pipeline on the training split, then predict the test split.
y_predict_lr = pipe_lr.fit(X_train, y_train).predict(X_test)
y_predict_lr

In [None]:
# Confusion matrix of the no-SMOTE logistic regression on the test set

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_true=y_test, y_pred=y_predict_lr)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.show()

In [None]:
# Headline test-set metrics for the no-SMOTE model, rounded to 2 decimals
lr_accuracy = round(accuracy_score(y_test, y_predict_lr), 2)
lr_recall = round(recall_score(y_test, y_predict_lr), 2)
lr_precision = round(precision_score(y_test, y_predict_lr), 2)
lr_f1 = round(f1_score(y_test, y_predict_lr), 2)

print(f"""
Our model's accuracy on the test set is {lr_accuracy}.
Our model's recall on the test set is {lr_recall}.
Our model's precision on the test set is {lr_precision}.
Our model's f1-score on the test is {lr_f1}.
""")

# Per-class breakdown of precision / recall / f1
print(classification_report(y_test, y_predict_lr))

# Evaluation 

The evaluation metric that I think makes the most sense is precision. If we have a false positive (a song is listed as popular but it is not) there is a higher probability the listener will skip the song if they don't like it. If we have a false negative (a song is not listed as popular but it is) it will likely not be recommended to the listener and will hence not be heard. For business purposes, it makes the most sense to maximize listening time.

# Now let's try with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Logistic regression pipeline with a SMOTE step between preprocessing and the classifier
smote_lr_steps = [
    ('preprocessor', CT),
    ('smote', SMOTE(random_state=27)),
    ('classifier', LogisticRegression(random_state=0, class_weight='balanced')),
]
pipe_smote_lr = ImPipeline(steps=smote_lr_steps)

pipe_smote_lr.fit(X_train, y_train)

In [None]:
# Extract the trained logistic regression classifier from the SMOTE pipeline.
# This must read pipe_smote_lr: pipe_smote_dt is only created in a later cell
# and has no 'classifier' step.
lr_model = pipe_smote_lr.named_steps['classifier']

In [None]:
# predictions
# Predict through the whole pipeline so X_test is scaled and one-hot encoded the
# same way as the training data; the bare classifier cannot consume raw columns.
# (An imblearn pipeline applies its sampler only while fitting, so nothing is
# resampled at prediction time.)
y_predict_smote_lr = pipe_smote_lr.predict(X_test)
y_predict_smote_lr

In [None]:
# Confusion matrix of the SMOTE logistic regression predictions on the test set

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_true=y_test, y_pred=y_predict_smote_lr)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.show()

In [None]:
# Headline test-set metrics for the SMOTE model, rounded to 2 decimals
smote_lr_accuracy = round(accuracy_score(y_test, y_predict_smote_lr), 2)
smote_lr_recall = round(recall_score(y_test, y_predict_smote_lr), 2)
smote_lr_precision = round(precision_score(y_test, y_predict_smote_lr), 2)
smote_lr_f1 = round(f1_score(y_test, y_predict_smote_lr), 2)

print(f"""
Our model's accuracy on the test set is {smote_lr_accuracy}.
Our model's recall on the test set is {smote_lr_recall}.
Our model's precision on the test set is {smote_lr_precision}.
Our model's f1-score on the test is {smote_lr_f1}.
""")

# Per-class breakdown of precision / recall / f1
print(classification_report(y_test, y_predict_smote_lr))

It looks like there is minimal difference with SMOTE.

# Decision Tree for Feature Importance

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Define the pipeline: shared preprocessing, SMOTE, then a shallow decision tree
pipe_smote_dt = ImPipeline(steps=[
    ('preprocessor', CT),
    ('smote', SMOTE(random_state=27)),
    ('dt', DecisionTreeClassifier(max_depth=3, random_state=27))
])

# Fit the pipeline to the training data
pipe_smote_dt.fit(X_train, y_train)

# Make predictions on the test data
y_pred_dt = pipe_smote_dt.predict(X_test)

# Evaluate the model
print(f"""
Our model's accuracy on the test set is {round(accuracy_score(y_test, y_pred_dt), 2)}.
Our model's recall on the test set is {round(recall_score(y_test, y_pred_dt), 2)}.
Our model's precision on the test set is {round(precision_score(y_test, y_pred_dt), 2)}.
Our model's f1-score on the test is {round(f1_score(y_test, y_pred_dt), 2)}.
""")

# Print classification report
print(classification_report(y_test, y_pred_dt))

# Extract the trained decision tree classifier from the pipeline
dt_model = pipe_smote_dt.named_steps['dt']

# The tree was trained on the ColumnTransformer output (scaled numeric columns
# followed by the one-hot 'key' columns), so its feature indices do NOT match
# X_train's column positions. Label the nodes with the transformed column names
# so the splits can be read directly from the plot.
dt_feature_names = list(pipe_smote_dt.named_steps['preprocessor'].get_feature_names_out())

# Plotting the decision tree
plt.figure(figsize=(20, 15))  # Set plot size (denoted in inches)
plot_tree(dt_model, feature_names=dt_feature_names, filled=True, fontsize=12)
plt.show()

In [None]:
# Display the fifth column (position 4) of the raw training features.
# NOTE(review): if this is meant to identify the feature a tree node calls x[4],
# the tree indexes the ColumnTransformer output (numeric_features order, then the
# one-hot 'key' columns), which need not match X_train's column order - confirm.
X_train.iloc[:, 4]

# The features that the decision tree split on are key, energy, danceability, and duration. Let's try running a model with just those columns.  

In [None]:
# keep only the four columns of interest
small_columns = ['energy', 'key', 'danceability', 'duration_ms']
X_train_small = X_train[small_columns]
X_train_small.head()

In [None]:
# Rows of the reduced training frame whose key equals 'D'.
X_train_small[X_train_small["key"].eq('D')]

In [None]:
# Column dtypes and non-null counts for the reduced training frame.
X_train_small.info()

In [None]:
# The original column transformer was fitted on the full X_train set, so the
# reduced feature set gets its own scaler and encoder.

categorical_small = ['key']
numeric_small = ['energy', 'danceability', 'duration_ms']

ohe_nn = OneHotEncoder(handle_unknown='ignore')
ss_nn = StandardScaler()

# Fit each transformer on its own slice of the reduced training frame
numeric_transformed = ss_nn.fit_transform(X_train_small[numeric_small])
cat_transformed = ohe_nn.fit_transform(X_train_small[categorical_small])

print("Numeric Transformed Shape:", numeric_transformed.shape)
print("Categorical Transformed Shape:", cat_transformed.shape)

# The encoder output is sparse; densify it before stacking
cat_transformed_dense = cat_transformed.toarray()

# Numeric columns first, one-hot columns after, side by side
X_train_small_processed = np.hstack((numeric_transformed, cat_transformed_dense))

# TODO: originally flagged as "this isn't working. I'll look at it later"



In [None]:
# Logistic regression trained on the processed four-feature training matrix
small_model = LogisticRegression()

small_model.fit(X_train_small_processed, y_train)

# The model was trained on scaled + one-hot encoded columns, so the test set needs
# the same treatment. Reuse the transformers already fitted on the training data
# (transform only, no refit) and stack the pieces in the same order.
X_test_small_processed = np.concatenate(
    [
        ss_nn.transform(X_test[numeric_small]),
        ohe_nn.transform(X_test[categorical_small]).toarray(),
    ],
    axis=1,
)

# Make predictions
y_pred_small = small_model.predict(X_test_small_processed)
print(y_pred_small)

In [None]:
# confusion matrix for the reduced-feature model
# (uses y_pred_small; the earlier version re-plotted the SMOTE model's predictions)
cm = confusion_matrix(y_test, y_pred_small)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

In [None]:
# Test-set metrics for the reduced-feature model. These must be computed from
# y_pred_small; scoring y_predict_smote_lr would just repeat the SMOTE results.
print(f"""
Our model's accuracy on the test set is {round(accuracy_score(y_test, y_pred_small), 2)}.
Our model's recall on the test set is {round(recall_score(y_test, y_pred_small), 2)}.
Our model's precision on the test set is {round(precision_score(y_test, y_pred_small), 2)}.
Our model's f1-score on the test is {round(f1_score(y_test, y_pred_small), 2)}.
""")

# Print classification report
print(classification_report(y_test, y_pred_small))

## Grid Search? might get rid of this

In [None]:
# Draw a random sample of 100 training rows for quick initial tuning
X_train_sample = X_train.sample(n=100, random_state=27)
# Pick the labels by index so they are guaranteed to line up with the sampled
# rows, rather than relying on two independent samples sharing a seed.
y_train_sample = y_train.loc[X_train_sample.index]

# Split the sampled data into small training and testing sets
small_X_train, small_X_test, small_y_train, small_y_test = train_test_split(
    X_train_sample, y_train_sample, test_size=0.25, random_state=27)

In [None]:
# let's try a gridsearch with cross validation
from sklearn.ensemble import RandomForestClassifier

# Random forest pipeline tuned by the grid below. The step names must match the
# 'rfc__' and 'smote__' prefixes used in the parameter grid.
pipe_smote_rf = ImPipeline(steps=[
    ('preprocessor', CT),
    ('smote', SMOTE(random_state=27)),
    ('rfc', RandomForestClassifier(random_state=27))
])

parameters = {'rfc__criterion': ['gini', 'entropy'],
          'rfc__max_depth': [3, 5],
          'smote__k_neighbors': [3, 5, 9]}

gs = GridSearchCV(estimator=pipe_smote_rf,
                 param_grid=parameters,
                 cv=5)

# fit to samples to minimize run time
# (disabled for now; enable both lines together, since best_params_ only exists
# after the search has been fitted)
#gs.fit(small_X_train, small_y_train)

#print(gs.best_params_)

In [None]:
# Follow-up grid: same split criteria and SMOTE neighbour counts, deeper trees.
new_parameters = dict(
    rfc__criterion=['gini', 'entropy'],
    rfc__max_depth=[10, 20],
    smote__k_neighbors=[3, 5, 9],
)