In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model  # will be using for plotting trend line
from sklearn.preprocessing import MinMaxScaler # for normalizing data
from sklearn.cluster import KMeans 
%matplotlib inline

In [None]:
# Load the Spotify tracks dataset; the path is relative to the notebook's
# working directory.
data_path = '../input/spotify-data/data.csv'
spotify = pd.read_csv(data_path)
# Preview 5 random rows
spotify.sample(n=5)

In [None]:
# ---------------------------------------------------------------------------
# 1) Build the normalized feature table
# ---------------------------------------------------------------------------
# Consider every column except the last one. The last column is skipped
# because it is expected to be `year`.
# NOTE(review): this relies on the column order of the CSV -- confirm that
# `year` really is the last column.
feature_candidates = spotify.iloc[:, :-1]

# Only float64 / int64 columns are usable as clustering features.
numeric_features = feature_candidates.select_dtypes(include=['float64', 'int64'])

# Min-max scale each column independently. Fitting and transforming the same
# DataFrame in one call avoids the scikit-learn feature-name mismatch warning
# that appears when a scaler is fit on a DataFrame but transforms a bare array.
scaler = MinMaxScaler()
song_features = pd.DataFrame(
    scaler.fit_transform(numeric_features),
    columns=numeric_features.columns,
    index=numeric_features.index,
)

# ---------------------------------------------------------------------------
# 2) Elbow method: how many clusters ("genres") give a low SSE?
# ---------------------------------------------------------------------------
# SSE = sum of squared distances of samples to their closest cluster centre
# (KMeans.inertia_). We look for the k after which SSE stops dropping quickly.

ELBOW_SAMPLE_SIZE = 1000   # fitting on the full data is too expensive here
RANDOM_STATE = 42          # fixed seed so the curve is reproducible

# Draw the subsample ONCE so every k is evaluated on the same rows; re-sampling
# inside the loop would make the SSE values incomparable across k.
elbow_sample = song_features.sample(
    n=min(ELBOW_SAMPLE_SIZE, len(song_features)),
    random_state=RANDOM_STATE,
)

# k cannot exceed the number of samples being clustered.
k_rng = range(1, min(200, len(elbow_sample) + 1))  # k values
sse = []  # SSE for each k
for i in k_rng:
    km = KMeans(n_clusters=i, random_state=RANDOM_STATE)
    km.fit(elbow_sample)
    sse.append(km.inertia_)

plt.plot(k_rng, sse)
plt.xlabel('K value')
plt.ylabel('SSE Error')
plt.title('Best K value')
# Uncomment to zoom into the elbow region:
# plt.ylim(0,400)
# plt.xlim(0,100)
plt.show()

In [None]:
# Cluster all songs into 25 groups and attach the cluster label as a
# human-readable "GenreN" column.

# This cell adds `predicted_genres` to `song_features`. Drop that column (if it
# already exists from a previous run) before fitting, otherwise re-running the
# cell would feed the 'GenreN' strings to KMeans and raise a ValueError.
feature_matrix = song_features.drop(columns=['predicted_genres'], errors='ignore')

# Fixed random_state so the cluster ids are stable between runs.
km = KMeans(n_clusters=25, random_state=42)
predicted_genres = km.fit_predict(feature_matrix)

# Integer label k -> 'Genrek', e.g. 3 -> 'Genre3'.
song_features['predicted_genres'] = (
    'Genre' + pd.Series(predicted_genres, index=song_features.index).astype(str)
)
song_features.sample(10)

In [None]:
# Number of songs assigned to each predicted genre.
genres_grp = song_features.groupby('predicted_genres').size()

# Horizontal bar chart, smallest cluster at the bottom and largest at the top.
ranked_counts = genres_grp.sort_values(ascending=True)
fig, ax = plt.subplots(figsize=(10, 6))
ranked_counts.plot(kind='barh', color='yellow', ax=ax)
ax.set_xlabel('Total Songs')
ax.set_title('Genre Ranking')
plt.show()