<a href="https://colab.research.google.com/github/adityamulik/INFO-6105---Data-Science-Engineering-And-Tools/blob/main/Spotify_Music_Prediction_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing necessary Libraries

In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns
import os 
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from google.colab import drive
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix

%matplotlib inline

  import pandas.util.testing as tm


### Loading dataset from Google Drive
Dataset: https://www.kaggle.com/datasets/yamaerenay/spotify-dataset-19212020-600k-tracks?select=tracks.csv

In [None]:
# Mount Google Drive so the dataset under MyDrive is reachable from the notebook.
drive.mount('/content/drive')
# Keep the dataset *path* rather than an open file handle: a bare open() was never
# closed, and a handle is exhausted after one read, so re-running the load cell failed.
# pd.read_csv accepts a path directly and opens/closes the file itself.
csvFile = '/content/drive/MyDrive/dataset_INFO6105/tracks.csv'

MessageError: ignored

In [None]:
# Read the tracks CSV into a DataFrame and print its column / dtype / non-null summary.
data = pd.read_csv(csvFile)
data.info()

In [None]:
# Preview the first five rows.
data.head(5)

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Column labels available in the dataset.
data.columns

In [None]:
# One-column table listing the dtype of every column.
data.dtypes.to_frame(name='DataType')

In [None]:
# Summary statistics for three of the audio features.
data[['danceability', 'energy', 'tempo']].describe()

In [None]:
# Distinct time_signature values present in the data.
data['time_signature'].unique()

In [None]:
# Distinct loudness values present in the data.
data['loudness'].unique()

In [None]:
# Distinct popularity values present in the data.
data['popularity'].unique()

In [None]:
# The 15 rows with the highest popularity, most popular first.
data.sort_values(by='popularity', ascending=False).head(15)

In [None]:
# Popularity plotted against tempo (relplot draws a scatter plot by default).
sns.relplot(x="tempo", y="popularity", data=data)

In [None]:
# Popularity plotted against danceability.
sns.relplot(x="danceability", y="popularity", data=data)

In [None]:
# Popularity plotted against musical key.
sns.relplot(x="key", y="popularity", data=data)

In [None]:
# Popularity plotted against speechiness.
sns.relplot(x="speechiness", y="popularity", data=data)

In [None]:
# NOTE(review): pair plot left commented out — presumably too slow on the full dataset;
# either re-enable it on a sample or delete this cell.
# sns.pairplot(data, vars=['tempo', 'popularity', 'valence', 'energy', 'speechiness', 'danceability', 'acousticness'])

In [None]:
# Tempo histogram restricted to tracks whose popularity is exactly 1.
data.loc[data['popularity'] == 1, 'tempo'].hist(grid=False, bins=10)

In [None]:
# Summary statistics (describe() covers the numeric columns by default).
data.describe()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Collect every row that has at least one missing value and print them.
has_missing_value = data.isnull().any(axis=1)
null_data = data[has_missing_value]
print(null_data)

In [None]:
# Discard every row that contains a missing value.
data = data.dropna()

In [None]:
# Re-count missing values per column to confirm none remain.
data.isnull().sum()

In [None]:
# Distribution of the popularity score.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider histplot/displot.
popularity_ax = sns.distplot(data['popularity'])
popularity_ax.set_title('Popularity Distribution')

In [None]:
# Pairwise correlation between the numeric columns.
# Select numeric/bool columns explicitly: pandas >= 2.0 no longer drops non-numeric
# columns silently in corr() and raises instead (the tracks data presumably carries
# text columns such as ids and names). select_dtypes works on every pandas version.
data.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Acousticness distribution for tracks with popularity strictly above 50.
popular_greater_50 = data.loc[data['popularity'] > 50]
sns.distplot(popular_greater_50.acousticness)

In [None]:
# Acousticness distribution for tracks with popularity strictly below 50.
# NOTE(review): tracks with popularity exactly 50 fall into neither the >50 nor the <50 group.
popular_less_50 = data.loc[data['popularity'] < 50]
sns.distplot(popular_less_50.acousticness)

In [None]:
# Express track length in minutes, then order the columns alphabetically.
MS_PER_MINUTE = 60000
data['duration_m'] = data['duration_ms'] / MS_PER_MINUTE
alphabetical_columns = sorted(data.columns)
data = data.reindex(columns=alphabetical_columns)
data.head()

In [None]:
# The millisecond column is redundant now that duration_m exists.
data = data.drop(columns='duration_ms')

In [None]:
# Summary statistics again, now including duration_m instead of duration_ms.
data.describe()

In [None]:
# Column / dtype / non-null summary after dropping null rows and replacing duration_ms.
data.info()

In [None]:
# Find outliers: tracks that run longer than 60 minutes
data[data['duration_m']>60]

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
plt.figure(figsize=(20, 10))
# Restrict to numeric/bool columns first: pandas >= 2.0 raises in corr() when the frame
# still holds non-numeric columns instead of silently skipping them.
numeric_corr = data.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_corr, annot=True)

In [None]:
# Modeling and Prediction

In [None]:
# Preview the first 15 rows before encoding.
data.head(15)

In [None]:
# One-hot encode the three categorical columns; drop_first=True omits the first level of
# each to avoid a redundant indicator column.
# NOTE: the feature list selected further down does not include any of these dummy columns.
data = pd.get_dummies(data, columns=['time_signature', 'key', 'mode'], drop_first=True)

In [None]:
# Show every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Preview the frame after one-hot encoding.
data.head()

In [None]:
# Columns used as model inputs.
features = ['acousticness', 'instrumentalness', 'loudness', 'energy']

In [None]:
# Feature matrix and target. At this point popularity is still the raw numeric score;
# y is re-assigned further down once popularity has been binned into classes.
X = data.loc[:, features]
y = data.popularity

In [None]:
# Preview: split popularity into three equal-width intervals (result is displayed, not stored).
pd.cut(data['popularity'], bins=3)

In [None]:
# Row count per interval when the three bins are labelled low / medium / high.
pd.cut(data['popularity'], bins=3, labels = ["low", "medium", "high"]).value_counts()

In [None]:
# Replace the numeric popularity score with a three-level class label
# (equal-width bins, right edge inclusive).
popularity_levels = ["low", "medium", "high"]
data['popularity'] = pd.cut(data['popularity'], bins=3, labels=popularity_levels, right=True)

In [None]:
# Inspect the first 100 rows with the new popularity labels.
data.head(100)

In [None]:
# Rows that landed in the 'high' popularity class.
data.loc[data['popularity']== 'high']

In [None]:
# Target is now the binned popularity class; show each class's share of the rows.
y = data['popularity']
y.value_counts().div(y.count())

In [None]:
# Rows per popularity class.
# NOTE(review): pop_count is not referenced again in the visible notebook.
pop_count = data.popularity.value_counts()

In [None]:
# Balance the classes by randomly duplicating rows of the under-represented classes.
# Seeded so that the resampled set (and every score derived from it) is reproducible;
# 99 matches the seed used for train_test_split in this notebook.
ros = RandomOverSampler(random_state=99)
X_ros, y_ros = ros.fit_resample(X, y)
# Oversampling adds rows rather than removing any, so report the resulting shapes.
print('Resampled shapes:')
print(X_ros.shape, y_ros.shape)

In [None]:
# Frequency of each distinct feature-row combination in the resampled features.
X_ros.value_counts()

In [None]:
# Class counts after oversampling.
y_ros.value_counts()

In [None]:
# Same four columns as `features`; X is rebuilt from them here.
# NOTE(review): X_ros above was already produced from these columns, so this re-selection
# looks redundant — confirm before removing.
feature_cols = ['acousticness', 'instrumentalness', 'loudness', 'energy']
X = data[feature_cols]

In [None]:
# Preview the resampled feature matrix.
X_ros.head()

In [None]:
# K-nearest-neighbours classifier that uses only the single closest neighbour.
knn = KNeighborsClassifier(n_neighbors=1)

In [None]:
# Fit on the entire resampled data set (no hold-out yet).
knn.fit(X_ros, y_ros)

In [None]:
# Model Evaluation

In [None]:
# Predict on the very same rows the model was fitted on.
y_pred_class = knn.predict(X_ros)

In [None]:
# Training accuracy only: with n_neighbors=1 each fitted row is its own nearest neighbour,
# so a score around 98% reflects overfitting rather than real predictive skill.
print(metrics.accuracy_score(y_ros, y_pred_class))

In [None]:
# Split BEFORE oversampling. RandomOverSampler duplicates minority-class rows, so splitting
# already-resampled data puts copies of the same row in both train and test, and a
# 1-nearest-neighbour model then "predicts" those test rows by matching their own copy.
# stratify=y keeps the class proportions equal in both halves.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, random_state=99, test_size=0.5, stratify=y
)

# Balance the training half only; the test half keeps the real class distribution.
X_train, y_train = RandomOverSampler(random_state=99).fit_resample(X_train_raw, y_train_raw)

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

# Accuracy on rows the model has never seen, in any copy.
y_pred_class = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_class))

In [None]:
# Candidate neighbour counts (1 through 21) for the hyperparameter search.
k_range2 = range(1, 22)
param_dist = {'n_neighbors': k_range2}

### Hyperparameter Tuning using RandomizedSearchCV

In [None]:
# Randomised search over n_neighbors: 5 sampled candidates, 5-fold cross-validation,
# scored by accuracy; random_state fixes which candidates are drawn.
rand = RandomizedSearchCV(knn, param_dist, cv=5, scoring='accuracy', n_iter=5, random_state=5)

In [None]:
# Run the search on the training split.
rand.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score, the parameters that achieved it,
# and the corresponding estimator.
for search_result in (rand.best_score_, rand.best_params_, rand.best_estimator_):
    print(search_result)

### Confusion Matrix

In [None]:
# Popularity has three classes, so the binary names (true/false positive/negative) do not
# apply; show the full labelled matrix instead.
# Pass the label order explicitly — left to itself confusion_matrix sorts the labels,
# which for these strings gives high, low, medium.
# NOTE: y_pred_class here comes from the most recently fitted knn, not from the search's
# best estimator.
class_labels = ['low', 'medium', 'high']
cmat = confusion_matrix(y_test, y_pred_class, labels=class_labels)
print(pd.DataFrame(
    cmat,
    index=['actual {}'.format(label) for label in class_labels],
    columns=['predicted {}'.format(label) for label in class_labels],
))

# The diagonal holds the correct predictions; everything else is a misclassification.
correct = np.trace(cmat)
total = np.sum(cmat)
print('Accuracy Score: {}'.format(np.divide(correct, total)))
print('Misclassification Rate: {}'.format(np.divide(total - correct, total)))

### Model Accuracy on Test Data

In [None]:
# Build the final model from the neighbour count the randomised search selected, rather
# than a hard-coded value that silently goes stale whenever the data or search changes.
best_k = rand.best_params_['n_neighbors']
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)

# Predictions for the held-out test split.
y_pred_class = knn.predict(X_test)

In [None]:
# Accuracy of the model above on the test split.
metrics.accuracy_score(y_test, y_pred_class)