# Imports

In [3]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [14]:
file_path = Path('Resources/first20_playlists.csv')
playlist_df = pd.read_csv(file_path)

list(playlist_df.columns)

['Unnamed: 0',
 'Playlist',
 'Followers',
 'Songs',
 'Genre 1',
 'Genre 2',
 'URI',
 'Song URI',
 'Popularity',
 'Artist Name',
 'Song Name']

# Loading Data

In [15]:
columns = list(df.columns)

target = ["Popularity"]

In [27]:
# Load the data
file_path = Path('Resources/first20_playlists.csv')
playlist_df = pd.read_csv(file_path)

# Drop the null columns where all values are null
playlist_df = playlist_df.dropna(axis='columns', how='all')

# Drop the null rows
playlist_df = playlist_df.dropna()

# Drop '0' column
playlist_df = playlist_df.drop(columns='Unnamed: 0')

# Order by Popularity metric
playlist_df = playlist_df.sort_values(by=['Popularity'])


# Remove the `Issued` loan status
#issued_mask = df['loan_status'] != 'Issued'
#df = df.loc[issued_mask]

# convert interest rate to numerical
#df['int_rate'] = df['int_rate'].str.replace('%', '')
#df['int_rate'] = df['int_rate'].astype('float') / 100


# Convert the target column values to low_risk and high_risk based on their values
#x = {'Current': 'low_risk'}   
#df = df.replace(x)

#x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
#df = df.replace(x)

playlist_df.reset_index(inplace=True, drop=True)

playlist_df.head(20)

Unnamed: 0,Playlist,Followers,Songs,Genre 1,Genre 2,URI,Song URI,Popularity,Artist Name,Song Name
0,Pop Rising,2642529,87,Pop,Indie,spotify:playlist:37i9dQZF1DWUa8ZRTfalHk,spotify:track:2Oo397nWzelAKMQBBIL8YI,0,blackbear,dead inside
1,Rock Hard,850600,150,Rock,Alternative,spotify:playlist:37i9dQZF1DWWJOmJ7nRx0C,spotify:track:3iSgwMXZtxE7fBMqK214JX,0,Fire From The Gods,Thousand Lifetimes
2,Pop Rising,2642529,87,Pop,Indie,spotify:playlist:37i9dQZF1DWUa8ZRTfalHk,spotify:track:4uUG5RXrOk84mYEfFvj3cK,0,David Guetta,I'm Good (Blue)
3,Rock This,4649112,50,Rock,Alternative,spotify:playlist:37i9dQZF1DXcF6B6QPhFDv,spotify:track:2Ot85xcajHDvU7cD2BdR2M,0,L.S. Dunes,Permanent Rebellion
4,Indie Pop,1707366,75,Pop,Indie,spotify:playlist:37i9dQZF1DWWEcRhUVtL8n,spotify:track:0Rop7nCpDSuqnuvzWvZIdq,0,Blake Rose,Magazine
5,Rock This,4649112,50,Rock,Alternative,spotify:playlist:37i9dQZF1DXcF6B6QPhFDv,spotify:track:4scqfHWE0J9dgdbjqstjS3,0,Muse,Kill Or Be Killed
6,Pop Rising,2642529,87,Pop,Indie,spotify:playlist:37i9dQZF1DWUa8ZRTfalHk,spotify:track:72yP0DUlWPyH8P7IoxskwN,0,Elton John,Hold Me Closer
7,Just Good Music,768845,59,Pop,Adult Contemporary,spotify:playlist:37i9dQZF1DX0b1hHYQtJjp,spotify:track:7f9K6fJcvBVcati3SivAPW,0,Jake Wesley Rogers,Modern Love
8,Pop Rising,2642529,87,Pop,Indie,spotify:playlist:37i9dQZF1DWUa8ZRTfalHk,spotify:track:3ZQLH6uKCfvgkbnMSVNCQe,0,Joji,YUKON (INTERLUDE)
9,Just Good Music,768845,59,Pop,Adult Contemporary,spotify:playlist:37i9dQZF1DX0b1hHYQtJjp,spotify:track:1nSbS97RaTSBF0ouJyoXM5,0,John K,something worth working on


# Split data into training and testing datasets

In [28]:
# Create our features
X = playlist_df.drop(columns='Popularity')
X = pd.get_dummies(X)


# Create our target
y = playlist_df.loc[:, target].copy()

In [29]:
X.describe()

Unnamed: 0,Songs,Playlist_Adrenaline Workout,Playlist_Alt Now,Playlist_Bedroom Pop,Playlist_Dirty Rock,Playlist_Feel Good Indie Rock,Playlist_Hard Rock,Playlist_Hot Hits USA,Playlist_Indie Pop,Playlist_Indie Rock Road Trip,...,Song Name_tears in the club (feat. the weeknd),"Song Name_the older you get, the less you cry",Song Name_the perfect pair,Song Name_toxic energy (with Bert McCracken of The Used),Song Name_two door tiffany,Song Name_watch,Song Name_watch you sleep.,Song Name_well…,Song Name_when we were young,Song Name_you're not special
count,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,...,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0
mean,94.984293,0.065445,0.032723,0.065445,0.065445,0.065445,0.032723,0.032723,0.049084,0.065445,...,0.000654,0.000654,0.001309,0.000654,0.000654,0.000654,0.000654,0.000654,0.000654,0.000654
std,34.291134,0.24739,0.177968,0.24739,0.24739,0.24739,0.177968,0.177968,0.216114,0.24739,...,0.025582,0.025582,0.036167,0.025582,0.025582,0.025582,0.025582,0.025582,0.025582,0.025582
min,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,59.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,150.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [30]:
# Check the balance of our target values
y['Popularity'].value_counts()

59     64
58     57
63     51
61     51
62     49
       ..
24      1
26      1
97      1
4       1
100     1
Name: Popularity, Length: 76, dtype: int64

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Resampling

## Naive Random Oversampling

In [32]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Popularity': 1})

In [33]:
X_resampled.dtypes

Songs                           int64
Playlist_Adrenaline Workout     uint8
Playlist_Alt Now                uint8
Playlist_Bedroom Pop            uint8
Playlist_Dirty Rock             uint8
                                ...  
Song Name_watch                 uint8
Song Name_watch you sleep.      uint8
Song Name_well…                 uint8
Song Name_when we were young    uint8
Song Name_you're not special    uint8
Length: 3576, dtype: object

In [34]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [35]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [36]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.13575164152087227

In [37]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.03      0.25      0.92      0.06      0.48      0.21         4
         26       0.00      0.00      1.00      0.00      0.00      0.00         1
         33       0.00      0.00      1.00      0.00      0.00      0.00         5
         34       0.00      0.00      1.00      0.00      0.00      0.00         3
         35       0.00      0.00      1.00      0.00      0.00      0.00         1
         36       0.00      0.00      1.00      0.00      0.00      0.00         4
         37       0.00      0.00      1.00      0.00      0.00      0.00         3
         38       0.00      0.00      0.95      0.00      0.00      0.00         4
         39       0.00      0.00      1.00      0.00      0.00      0.00         6
         40       0.00      0.00      1.00      0.00      0.00      0.00         2
         41       1.00      0.20      1.00      0.33      0.45      0.18         5
   