# **GO CONCERT (SPOTIFY -> TICKETMASTER)**
# **GENRE CLASSIFICATION (MACHINE LEARNING)**

In [4]:
from google.colab import drive
drive.mount('/content/drive', force_remount= True)

Mounted at /content/drive


In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [6]:
#reading Data
spotify_df = pd.read_csv('/content/drive/MyDrive/Spotify_BDP/Main_Data.csv')

In [7]:
spotify_df.head()

Unnamed: 0,id,name,artist_id,artists,artist_genre,album_type,album_id,album_name,album_release_date,duration_ms,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,000C3ZY8325A4yktxnnwCl,when the sun is a stranger,4Uqu4U6hhDMODyzSCtNDzG,poemme,"ambient worship,drift",album,3ypgq6ExA3JN8s2biuRK5e,soft ice,2017.0,390560,...,0,-33.246,1,0.0469,0.952,0.776,0.121,0.0835,134.542,3.0
1,000GxVUl2QoBm5gh63tbpF,chrono boost - original mix,"2PhITKrYcCL00gmtYGJcIe,2IVF45YhNY3Ic1FscjCMWO","gimmah,midnight smack",,compilation,6akGDSdZANHIZ0bAUEcsit,straight up glitch hop! vol. 3,2013.0,269640,...,6,-6.226,0,0.0863,1.3e-05,0.06,0.136,0.357,90.034,4.0
2,0014PFmFeg9ArijfRppSPa,way down deep (live),6IqRTh75wzqgbhpwx7pgyV,david grissom,,album,2TVsae5VW9X2dAtyJMnR6j,how it feels to fly,2014.0,407520,...,0,-6.223,1,0.0428,0.0874,0.397,0.969,0.652,82.592,4.0
3,00174RHq41L2LkeLdE0eyj,confusion,1z7SXnaVwt0Fnncfrz94SG,colder,coldwave,album,4bTWH5P6CykNnANLOAzKCR,again,2003.0,296533,...,10,-6.718,0,0.0547,0.0676,0.778,0.339,0.96,100.023,4.0
4,001COKlREAcM9pTCJp2YeL,beautiful slumber,4H4sW5MC8ErjBcUxSBMFG4,cosmic sleep captain,,album,634BMVeTPoGYc2CzDwnrsE,deep calming sleep: celestial music for restfu...,2021.0,111059,...,7,-24.727,1,0.104,0.951,0.853,0.101,0.0419,72.559,1.0


In [8]:
spotify_df = spotify_df.dropna(how='any')

In [9]:
#NLP Code for Lemmatizing Genres as per Ticket Master Genre List
import nltk
from nltk.stem import WordNetLemmatizer
from difflib import SequenceMatcher
nltk.download('omw-1.4')
import numpy as np
import pandas as pd

# download WordNet
nltk.download('wordnet')

# create a WordNet lemmatizer object
lemmatizer = WordNetLemmatizer()

# define the MGL
tkm = [
    'alternative',
    'ballads',
    'romantics',
    'blues',
    'bollywood',
    'chanson francaise',
    'children',
    'classical',
    'country',
    'dance',
    'electronic',
    'folk',
    'hip-hop',
    'rap',
    'holiday',
    'jazz',
    'latin',
    'medieval/renaissance',
    'metal',
    'new age',
    'other',
    'pop',
    'r&b',
    'reggae',
    'religious',
    'rock',
    'world'
]

# define a function to lemmatize words
def lemmatize_word(word):
    return lemmatizer.lemmatize(word, pos='n')

def lemmatize_genre(genres_str):
    # split the string on ',' and remove any leading/trailing spaces
    genres_list = [genre.strip() for genre in genres_str.split(',')]
    # convert to lowercase and remove spaces
    genres_list = [genre.lower().replace(' ', '') for genre in genres_list]
    # find the closest match in the MGL
    closest_match = max(tkm, key=lambda x: sum([SequenceMatcher(None, genre, x).ratio() for genre in genres_list]))
    # lemmatize the closest match and return it
    return lemmatize_word(closest_match)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [10]:
def make_word_dict(s):
    """
    This function takes a string and breaks each word at special characters and extracts all words from the list
    to make a dictionary of all the existing words.
    """
    word_dict = {}
    special_chars = [', ', ',']
    for char in special_chars:
        s = s.replace(char, ' ')
    words = s.split()
    for word in words:
        if word not in word_dict:
            word_dict[word] = 1
        else:
            word_dict[word] += 1
    return word_dict


In [11]:
def find_max_word(s, word_dict):
    """
    This function takes a string and a word dictionary as input and returns the word in the string with the maximum count
    in the dictionary.
    """
    special_chars = [', ', ',']
    for char in special_chars:
        s = s.replace(char, ' ')
    words = s.split()
    max_word = None
    max_count = 0
    for word in words:
        if word not in tkm:
          max_word = 'other'
          max_count = 0
        elif word in word_dict and word_dict[word] > max_count:
            max_word = word
            max_count = word_dict[word]
    return max_word

In [12]:
s = ''
word_dict = make_word_dict(s)
word_dict['other']=1

# Apply the two functions on each column value of the 'artist_genre' column in the spotify_df dataframe
for i, row in spotify_df.iterrows():
    # Apply Function 1 to extract words from the current column value and add them to the word_dict
    word_dict = {**word_dict, **make_word_dict(row['artist_genre'])}
    # Apply Function 2 to assign a single word to the current column value based on the maximum count in the word_dict
    max_word = find_max_word(row['artist_genre'], word_dict)
    spotify_df.at[i, 'artist_genre'] = max_word


In [13]:
# Define a function to convert loudness values to a 0-100 scale
def loudness_norm(loudness, min_l, max_l):
  #Applying the min max scaler formula
  l_min = min_l
  l_max = max_l
  return float((loudness - l_min)/(l_max - l_min))

In [14]:
#Normalizing the tempo on a 0 to 1 scale
# Define a function to convert loudness values to a 0-100 scale
def tempo_norm(tempo, min_tempo, max_tempo):
  #Applying the min max scaler formula
  t_min = min_tempo
  t_max = max_tempo
  return float((tempo - t_min)/(t_max - t_min))

In [15]:
max_loudness = spotify_df['loudness'].max()
min_loudness = spotify_df['loudness'].min()
spotify_df['loudness'] = spotify_df['loudness'].apply(lambda x: loudness_norm(x, min_loudness, max_loudness))

max_temp = spotify_df['tempo'].max()
min_temp = spotify_df['tempo'].min()
spotify_df["tempo"] = spotify_df['tempo'].apply(lambda x: tempo_norm(x, min_temp, max_temp))

# #Required Preprocessing for loudness, NLP Genre lemmatization
# spotify_df['genre'] = spotify_df['artist_genre'].apply(lambda x: lemmatize_genre(x))

In [16]:
counts = spotify_df['artist_genre'].value_counts()
print(counts)

other          372827
rock            48453
pop             38890
metal           24844
jazz            17274
rap             13730
country          7621
blues            6637
folk             4926
r&b              3455
electronic       3167
bollywood        3138
reggae           2613
dance            1930
alternative      1741
classical         772
world             522
latin             277
Name: artist_genre, dtype: int64


In [17]:
spotify_df = spotify_df[spotify_df['artist_genre'] != 'other']
spotify_df['genre'] = spotify_df['artist_genre']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spotify_df['genre'] = spotify_df['artist_genre']


In [18]:
#Label Encoding the genre feature for classification purposes
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

y_le = spotify_df['genre']
spotify_df['genre'] = labelencoder.fit_transform(spotify_df['genre'])
label_map = dict(zip(y_le, spotify_df['genre']))
print(label_map)

{'rock': 15, 'country': 4, 'bollywood': 2, 'classical': 3, 'jazz': 8, 'pop': 11, 'metal': 10, 'rap': 13, 'dance': 5, 'blues': 1, 'electronic': 6, 'folk': 7, 'r&b': 12, 'alternative': 0, 'reggae': 14, 'world': 16, 'latin': 9}


In [19]:
#Selecting the Input and the Output Features before Classification
X = spotify_df[["acousticness", "danceability", "energy", "instrumentalness", "mode", "key", "liveness", "loudness", "speechiness", "tempo", "valence"]]
Y = spotify_df["genre"]

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

# **KNN Model**

In [21]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=100)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test, y_pred))

[[   0    0    0    0    0    0    3    0   16    0   46  101    0   13
     0  138    0]
 [   0    9    0    0    1    0    0    2  106    0   45  417    0    4
     0  682    0]
 [   0    1    2    0    1    0    0    3   20    0    1  434    0   10
     0  163    0]
 [   0    0    0   10    0    0    0   25   79    0    3   17    0    0
     0   14    0]
 [   0    7    0    0   12    0    0    1   23    0    9  653    0    0
     0  813    0]
 [   0    0    0    0    0    0    5    1   21    0   43  217    0   16
     0   99    0]
 [   0    0    0    2    0    0   77   16  168    0  141   83    0    9
     0  150    0]
 [   0    3    0    7    3    0    7  110  236    0   17  337    0    3
     0  255    0]
 [   0    1    0    6    3    0   19   72 1916    0   53  783    0   58
     0  509    0]
 [   0    0    0    0    0    0    1    0   11    0    0   21    0    0
     0   17    0]
 [   0    0    0    0    0    0    6   15   61    0 3416  175    0    4
     0 1379    0]
 [   0    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# **Random Forest Model**

In [28]:
from sklearn.ensemble import RandomForestClassifier
rfm = RandomForestClassifier(n_estimators = 50)
rfm.fit(X_train, y_train)

#Predcting the Model using Random Forest
y_pred = rfm.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test, y_pred))

[[  27    1    0    0    3    0    3    2   18    0   33   72    1   21
     1  135    0]
 [   0  152    2    0   12    0    0   10  133    0   26  288    1   10
     3  629    0]
 [   0    8  118    0    5    0    0    1   19    0    0  322    0   38
     1  123    0]
 [   0    1    0   78    0    0    1   11   34    0    4   11    0    0
     0    8    0]
 [   0   20    1    0  420    0    0    7   35    0    5  497    0    6
     0  527    0]
 [   0    0    0    0    2   76   14    0   14    0   20  192    0   17
     1   66    0]
 [   0    0    0    3    0    1  233   16  108    0   60   98    0   18
     0  109    0]
 [   0   14    0    9    6    0   15  238  199    0   13  197    0    3
     0  284    0]
 [   0   34    1   26    5    0   34   59 2222    0   10  422    1   85
     3  518    0]
 [   0    1    1    0    0    0    1    0   10    8    0   10    0    3
     1   15    0]
 [   0    0    0    1    4    1   12   29   52    0 3798  116    0   17
     0 1026    0]
 [   1   2

# **XGBoost Model**

In [29]:
# Import the necessary libraries
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Create the XGBoost classifier
xgb_classifier = xgb.XGBClassifier()

# Train the classifier
xgb_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = xgb_classifier.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_pred)
print('Classification Report:')
print(report)

# Evaluate the classifier's accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.01      0.01       317
           1       0.41      0.10      0.16      1266
           2       0.54      0.10      0.16       635
           3       0.56      0.44      0.49       148
           4       0.49      0.17      0.25      1518
           5       0.50      0.11      0.18       402
           6       0.50      0.33      0.40       646
           7       0.43      0.23      0.30       978
           8       0.59      0.63      0.61      3420
           9       0.83      0.10      0.18        50
          10       0.75      0.69      0.72      5056
          11       0.47      0.56      0.51      7824
          12       0.32      0.04      0.08       677
          13       0.62      0.63      0.62      2790
          14       0.43      0.21      0.28       497
          15       0.52      0.70      0.60      9652
          16       0.67      0.07      0.12       122

   

# **Naive Bayes Model**

In [30]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Create the Naive Bayes classifier
nb_classifier = GaussianNB()
# Train the classifier
nb_classifier.fit(X_train, y_train)
# Make predictions on the testing set
y_pred = nb_classifier.predict(X_test)
# Generate the classification report
report = classification_report(y_test, y_pred)
print('Classification Report:')
print(report)
# Evaluate the classifier's accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       317
           1       0.15      0.03      0.04      1266
           2       0.06      0.00      0.00       635
           3       0.16      0.74      0.26       148
           4       0.13      0.68      0.22      1518
           5       0.17      0.06      0.09       402
           6       0.38      0.15      0.22       646
           7       0.13      0.18      0.15       978
           8       0.51      0.49      0.50      3420
           9       0.00      0.00      0.00        50
          10       0.54      0.78      0.64      5056
          11       0.38      0.23      0.29      7824
          12       0.18      0.02      0.03       677
          13       0.53      0.57      0.55      2790
          14       0.13      0.07      0.09       497
          15       0.45      0.31      0.37      9652
          16       0.00      0.00      0.00       122

   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# **Saving .pkl Files**

In [25]:
with open('/content/drive/MyDrive/Spotify_BDP/rfmodel2.pkl', 'wb') as f:
    pickle.dump(rfm, f)

In [26]:
with open('/content/drive/MyDrive/Spotify_BDP/label_map2.pkl', 'wb') as f:
  pickle.dump(label_map, f)