In [None]:
#importing necessary libraries and packages
import os
import pandas as pd
import seaborn as sns


In [None]:
# Read the Spotify songs CSV; the path is resolved relative to the current
# working directory (two levels up, then data/spotify_data.csv).
data_path = os.path.join(
    os.getcwd(),
    *("..", "..", "data", "spotify_data.csv"),
)
spotify_tracks = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Preview the first five rows of the dataset.
spotify_tracks.head(n=5)

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts.
spotify_tracks.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the columns.
spotify_tracks.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# One histogram per column/feature, drawn on a large shared figure.
spotify_tracks.hist(
    figsize=(20, 15),
    bins=50,
)

In [None]:
# Add duration_mins: duration_ms (milliseconds) divided by 60000.
spotify_tracks['duration_mins'] = spotify_tracks['duration_ms'].div(60000)

In [None]:
# Display the newly derived duration_mins column.
spotify_tracks.loc[:, 'duration_mins']

In [None]:
# Remove columns that are no longer needed: the track identifier and the raw
# millisecond duration (replaced by duration_mins).
spotify_tracks = spotify_tracks.drop(columns=['track_id', 'duration_ms'])

In [None]:
# Preview the frame after the column changes.
spotify_tracks.head(n=5)

In [None]:
# Sub-frame holding the features with numeric values (popularity included).
attributes = spotify_tracks[
    [
        'popularity',
        'year',
        'danceability',
        'energy',
        'key',
        'loudness',
        'mode',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo',
        'time_signature',
        'duration_mins',
    ]
]

In [None]:
# Pairwise correlation between all selected features.
corr_matrix = attributes.corr()

# Correlation of each feature with popularity, largest value first.
corr_matrix.loc[:, 'popularity'].sort_values(ascending=False)

In [None]:
# Annotated correlation heatmap using a diverging palette centred on 0 and
# spanning the full [-1, 1] range.
plt.figure(figsize=(15, 10))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(
    attributes.corr(),
    cmap=cmap,
    center=0,
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.1g',
)
plt.title("Correlation Matrix", fontsize='large', fontweight='bold')

In [None]:
# Identify strongly correlated feature pairs (correlation >= 0.5 or <= -0.5).
corr_matrix = attributes.corr()
corr_pairs = corr_matrix.unstack()

# The correlation matrix is symmetric, so unstacking lists every pair twice
# ((a, b) and (b, a)) plus each feature against itself. Keeping only entries
# whose first label sorts before the second leaves each distinct pair exactly
# once and removes the diagonal, without discarding a genuine correlation of
# exactly 1 or -1 between two different features.
corr_pairs = corr_pairs[
    corr_pairs.index.get_level_values(0) < corr_pairs.index.get_level_values(1)
]

positive_corr = corr_pairs[corr_pairs >= 0.5]
print("Highly Correlated Pairs: \n", positive_corr)


# Values <= -0.5 are strong inverse relationships, not weak ones.
negative_corr = corr_pairs[corr_pairs <= -0.5]
print("\nStrongly Negatively Correlated Pairs: \n", negative_corr)

In [None]:
# Label every track from its popularity score:
#   popularity >= 50 -> 'popular', anything else -> 'low'.
popularity_verdict = spotify_tracks.copy()

# Vectorised labelling: one comparison over the whole column instead of a
# Python-level iterrows() loop that visits every row individually.
popularity_verdict['verdict'] = (
    popularity_verdict['popularity'].ge(50).map({True: 'popular', False: 'low'})
)
    

In [None]:
# Spot-check the new label next to the identifying columns.
popularity_verdict[
    ['artist_name', 'track_name', 'popularity', 'verdict']
].head(n=5)

In [None]:
# Frequency of each distinct popularity score.
popularity_verdict.loc[:, ['popularity']].value_counts()

In [None]:
# Count the tracks whose popularity score is exactly 0.
print(
    'Number of songs with 0 as popularity value: ',
    int((popularity_verdict['popularity'] == 0).sum()),
)


In [None]:
# Share of tracks with a popularity score of 0, as a percentage of all rows.
print(
    'Percent of data with a popularity of 0: {0:.2f}%'.format(
        len(popularity_verdict[popularity_verdict['popularity'] == 0])
        / len(popularity_verdict)
        * 100
    )
)

In [None]:
# Sort tracks from most to least popular and renumber the rows.
# drop=True discards the old index rather than inserting it as an extra
# 'index' column, which also keeps this cell safe to re-run.
popularity_verdict = popularity_verdict.sort_values('popularity', ascending=False).reset_index(drop=True)

In [None]:
# Show the final five rows of the frame.
popularity_verdict.tail(n=5)

In [None]:
# Keep only tracks with a popularity score above 0. Per the original author's
# note, zero-score rows make up almost 15% of the entries and are treated as
# carrying no signal for this analysis.
popularity_verdict = popularity_verdict[popularity_verdict['popularity'] > 0]

# Show the last five remaining rows to confirm no zero-score rows are left
# (the original note expects these to have popularity 1).
popularity_verdict.tail(n=5)

In [None]:
# Distribution of popularity scores across the remaining tracks.
popularity_verdict['popularity'].plot.hist(bins=50)
plt.ylabel('Number of Songs')
plt.xlabel('Popularity Score')
plt.title('Popularity Score Distribution')


In [None]:
# Scatter of popularity against loudness; colour and marker size both encode
# the verdict label.
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(
    data=popularity_verdict,
    x='popularity',
    y='loudness',
    hue='verdict',
    size='verdict',
    sizes=(20, 200),
    legend='full',
)

In [None]:
# Scatter of popularity against danceability; colour and marker size both
# encode the verdict label.
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(
    data=popularity_verdict,
    x='popularity',
    y='danceability',
    hue='verdict',
    size='verdict',
    sizes=(20, 200),
    legend='full',
)

In [None]:
# Candidate predictor columns for the models. 'popularity' is not selected
# here; the 'verdict' label is derived from it.
pop_ver_att = popularity_verdict[
    [
        'year',
        'danceability',
        'energy',
        'key',
        'loudness',
        'mode',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo',
        'time_signature',
        'duration_mins',
    ]
]

In [None]:
# Feature matrix (numeric columns only) and target labels for the classifiers.
X = pop_ver_att.select_dtypes(include=['number'])
y = popularity_verdict.loc[:, 'verdict']

In [None]:
# First five rows of the feature matrix.
X.head(n=5)

In [None]:
# First five target labels.
y.head(n=5)

In [None]:
# Prepare the data for scikit-learn: hold out 30% for testing, then standardise.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# mean/variance of the test rows influence the training features (data
# leakage) and make the reported test scores optimistic. The same
# random_state/test_size/shuffle settings select the same rows as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3, shuffle=True)

# Learn the scaling parameters from the training split only, then apply that
# same transformation to both splits.
scaler = StandardScaler()
model_X = scaler.fit(X_train)
X_train = model_X.transform(X_train)
X_test = model_X.transform(X_test)

# Retained so any later use of X_scaled keeps working; it now uses the
# training-split statistics as well.
X_scaled = model_X.transform(X)


**Logistic Regression**

In [None]:
# scikit-learn modules used for logistic regression and for the accuracy metric.
from sklearn import datasets, linear_model, metrics

# Build the classifier and fit it on the training split.
log_reg = linear_model.LogisticRegression(max_iter=1000, solver='lbfgs')
log_reg.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = log_reg.predict(X_test)

# Report test accuracy as a percentage.
print(
    "Logistic Regression Model Accuracy (in %):",
    metrics.accuracy_score(y_test, y_pred) * 100,
)




In [None]:
# Helpers for per-class evaluation.
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix first, then precision/recall/F1 per class, for the
# predictions currently held in y_pred.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

**K Nearest Neighbors**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour classifier using a single neighbour (n_neighbors=1).
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

# Predictions for the held-out test split.
pred = knn.predict(X_test)

# Confusion matrix followed by the per-class classification report.
print(confusion_matrix(y_true=y_test, y_pred=pred))
print(classification_report(y_true=y_test, y_pred=pred))

In [None]:
# Report KNN test accuracy as a percentage.
print(
    "KNN Model Accuracy (in %):",
    metrics.accuracy_score(y_test, pred) * 100,
)

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees. random_state is fixed (matching the value used
# for the train/test split and the decision tree) so results are reproducible
# across kernel restarts.
rforest = RandomForestClassifier(n_estimators = 100, random_state = 42)

rforest.fit(X_train, y_train)

# Predictions for the held-out test split.
y_pred = rforest.predict(X_test)

#printing confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



In [None]:
# Report random forest test accuracy as a percentage.
print(
    "Random Forest Model Accuracy (in %):",
    metrics.accuracy_score(y_test, y_pred) * 100,
)

**Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (depth capped at 2) with a fixed seed.
d_tree = DecisionTreeClassifier(max_depth=2, random_state=42)
d_tree.fit(X_train, y_train)

# Predictions for the held-out test split.
y_pred = d_tree.predict(X_test)

In [None]:
# Confusion matrix followed by the per-class classification report for the
# predictions currently held in y_pred.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Report decision tree test accuracy as a percentage.
# (Label spelling corrected: "Decsision" -> "Decision".)
print("Decision Tree Model Accuracy (in %):",
metrics.accuracy_score(y_test, y_pred) * 100)