In [None]:
import pandas as pd
# Load the Top-50 Spotify 2019 table, decoding the file with the cp1252 codec.
csv_path = "../input/top50spotify2019/top50.csv"
data = pd.read_csv(csv_path, encoding='cp1252')
# Preview the first rows as the cell output.
data.head()

In [None]:
#quick visualization of most popular genre group/sub group using WordCloud
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
# Build the word-cloud text from the genre values themselves. str(Series)
# would yield the printed repr instead (row indices plus the
# "Name: Genre, dtype: object" footer), so words such as "Name", "dtype" and
# "object" would be counted as if they were genres.
string = " ".join(data["Genre"].astype(str))
wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='white',
                      width=1000,
                      height=1000).generate(string)
# Render the cloud as an image with the axes hidden.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

So we can see that pop is the most popular genre, followed by dance. Surprisingly, Latin and Canadian genres are also hugely popular.

In [None]:
#detailed visualization of relationship between genre and popularity using SwarmPlot
#genre with most popular song
#genre/genres with most/least number of popular songs
#genre/genres with most/least variance in popularity of songs
import seaborn as sns
# Swarm plot of Popularity for each Genre; the x tick labels are rotated to
# vertical so the genre names do not overlap.
plt.figure(figsize=(10, 5))
swarmplot = sns.swarmplot(data=data, x="Genre", y="Popularity")
genre_tick_labels = swarmplot.get_xticklabels()
swarmplot.set_xticklabels(genre_tick_labels, rotation=90)
swarmplot.set_title("Relationship between Genre & Popularity")

In [None]:
#visualizing relationship between danceability and popularity using RegPlot
# Scatter plot with a fitted regression line: Popularity against Danceability.
regplot = sns.regplot(data=data, x="Danceability", y="Popularity")
regplot.set_title("relationship between danceability and popularity")

In [None]:
# Print the dtype of every column in the frame.
print(data.dtypes)

In [None]:
# Number of songs per genre, most frequent first.
# (The former leading `type(data['Genre'])` expression was dropped: it was not
# the last statement of the cell, so its value was never displayed.)
print(data.groupby('Genre').size().sort_values(ascending=False))
# Genre of every row as a plain Python list (one entry per song).
genre_list = data['Genre'].values.tolist()

In [None]:
# Number of songs per artist, most frequent first.
songs_per_artist = data.groupby('Artist.Name').size()
print(songs_per_artist.sort_values(ascending=False))
# Artist of every row as a plain Python list (one entry per song, so names
# can repeat).
artist_list = data['Artist.Name'].values.tolist()

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Give several columns cleaner names (modifies `data` in place).
# NOTE(review): the 'Loundness..dB..' key looks misspelled. A later cell still
# selects 'Loudness..dB..', so this entry presumably matches no column and is
# silently ignored by rename - confirm against the CSV header before changing
# either place.
column_renames = {
    'Loundness..dB..': 'Loudness(db)',
    'Valence.': 'Valence',
    'Length.': 'Length',
    'Acousticness..': 'Acousticness',
    'Speechiness.': 'Speechiness',
}
data.rename(columns=column_renames, inplace=True)

In [None]:
# Show the current column labels of the frame.
data.columns

In [None]:
# Display pandas' default summary statistics for the frame.
data.describe()

In [None]:
from scipy import stats

In [None]:
# Skewness of each numeric column. numeric_only=True is needed because the
# frame also holds text columns (e.g. Genre, Artist.Name); without it
# DataFrame.skew raises TypeError on pandas >= 2.0.
skew_feat = data.skew(numeric_only=True)

In [None]:
# Display the per-column skewness stored in skew_feat.
skew_feat

In [None]:
import numpy as np
# Liveness values as a 1-D array. The column is selected as a Series (single
# brackets) rather than a one-column frame, because scipy.stats.boxcox - which
# consumes this array in the next cell - rejects input that is not
# 1-dimensional when it has to estimate lambda.
transform = np.asarray(data['Liveness'].values)

In [None]:
# Box-Cox transform of `transform`; element 0 of the returned tuple is the
# transformed data, which is the only part kept here.
boxcox_result = stats.boxcox(transform)
data_transform = boxcox_result[0]

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Histogram of Liveness first as loaded, then after the Box-Cox transform;
# each one is shown as its own figure.
for liveness_values in (data['Liveness'], data_transform):
    plt.hist(liveness_values, bins=10)
    plt.show()

In [None]:
# Popularity values as a 1-D array. The column is selected as a Series (single
# brackets) rather than a one-column frame, because scipy.stats.boxcox - which
# consumes this array in the next cell - rejects input that is not
# 1-dimensional when it has to estimate lambda.
transform1 = np.asarray(data['Popularity'].values)

In [None]:
# Box-Cox transform of `transform1`; element 0 of the returned tuple is the
# transformed data, which is the only part kept here.
popularity_boxcox = stats.boxcox(transform1)
df_transform1 = popularity_boxcox[0]

In [None]:
import seaborn as sns
# Popularity distribution before (yellow) and after (black) the Box-Cox
# transform, each drawn with 10 bins and a black KDE curve of width 2.
kde_style = {"color": "k", "lw": 2, "label": "KDE"}
# A fresh copy of the style dict is handed to each call.
sns.distplot(data['Popularity'], bins=10, kde=True, kde_kws=dict(kde_style), color='yellow')
plt.show()
sns.distplot(df_transform1, bins=10, kde=True, kde_kws=dict(kde_style), color='black')  # corrected skew data
plt.show()

In [None]:
# Pairwise correlation of the numeric columns. numeric_only=True is needed
# because the frame also holds text columns (e.g. Genre, Artist.Name), which
# make DataFrame.corr raise on pandas >= 2.0.
corrmat = data.corr(numeric_only=True)
corrmat

In [None]:
# Heat map of the correlation matrix on a 16x8 figure, with the colour scale
# pinned to the range [-1, 1].
f, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(corrmat, vmin=-1, vmax=1, ax=ax)

In [None]:
# Song count per artist: the result is indexed by artist name.
popular_artist = data.groupby('Artist.Name').size()
# Artist of every row (one entry per song, so names can repeat).
artist_column = data['Artist.Name']
artist_list = artist_column.tolist()

In [None]:
# Bar positions 0..k-1, one for each entry of popular_artist.
artist_count = len(popular_artist)
length = np.arange(artist_count)

In [None]:
# Horizontal bar chart: one bar per artist, bar length = number of songs.
fig, ax = plt.subplots(figsize=(12, 12))
plt.barh(length, popular_artist)
# Label each bar from popular_artist's own index so the names line up with the
# counts. artist_list holds one entry per song rather than one per artist, so
# it neither matches the number of bars nor their order.
plt.yticks(length, popular_artist.index)
plt.title('Most popular artists', fontsize=18)
plt.ylabel('Artists', fontsize=16)
plt.xlabel('Number of songs', fontsize=16)
plt.show()

In [None]:
#Dependence between Energy and Popularity
# plt.subplots returns a (figure, axes) pair, so unpack both instead of binding
# the whole tuple to `fig`; both plots are then drawn on that axes explicitly.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title('Dependence between Energy and Popularity')
# Scatter + regression line of Popularity against Energy.
sns.regplot(x='Energy', y='Popularity', data=data, ax=ax)
# Bivariate density contours on top. The vectors are passed by keyword because
# current seaborn releases no longer accept them as two positional arguments.
sns.kdeplot(x=data.Energy, y=data.Popularity, ax=ax)

In [None]:
#Pie charts 
# Share of the top-50 list held by each artist.
artist_counts = data['Artist.Name'].value_counts()
labels = artist_counts.index
sizes = artist_counts.values
colors = ['red', 'yellowgreen', 'lightcoral', 'lightskyblue','cyan', 'green', 'black','yellow']
# Percentage label format for the wedges. It used to be assigned after the
# plt.pie call and never passed in, so no percentages were drawn.
autopct = '%1.1f%%'
plt.figure(figsize = (10,10))
plt.pie(sizes, labels=labels, colors=colors, autopct=autopct)
# Equal aspect ratio keeps the pie circular.
plt.axis('equal')
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix and target for the linear regression.
# Note that 'Loudness..dB..' is selected under its un-renamed label here.
feature_columns = ['Energy','Danceability','Length','Loudness..dB..','Acousticness']
X = data.loc[:, feature_columns].values
# Selecting with a list keeps Y two-dimensional (one column).
target_columns = ['Popularity']
Y = data.loc[:, target_columns].values

In [None]:
# Hold out 20% of the rows for testing. random_state is fixed so that the
# split - and every metric computed from it - is the same on each run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [None]:
# Ordinary least-squares regressor with default settings.
lr = LinearRegression()

In [None]:
# Fit the regressor on the training split.
lr.fit(X_train, Y_train)

In [None]:
# Predicted Popularity for the held-out rows.
pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [None]:
# Report both error metrics. A notebook cell only displays its last bare
# expression, so the MSE value was previously computed and then discarded.
print("MSE:", mean_squared_error(Y_test, pred))
print("MAE:", mean_absolute_error(Y_test, pred))

In [None]:
# Predicted values (x) against held-out values (y), drawn as star markers
# joined by a dashed black line.
plt.figure(figsize=(10, 10))
line_style = dict(
    color='black',
    linestyle='dashed',
    marker='*',
    markerfacecolor='red',
    markersize=10,
)
plt.plot(pred, Y_test, **line_style)
plt.title('Error analysis')
plt.xlabel('Predicted values')
plt.ylabel('Test values')
In [None]:
# Feature matrix and 1-D target used by the classifier cells that follow.
classifier_features = ['Energy','Length','Danceability','Beats.Per.Minute', 'Acousticness']
x = data.loc[:, classifier_features].values
y = data.loc[:, 'Popularity'].values


In [None]:
# Creating a test and training dataset
# Hold out 30% of the rows for testing. random_state is fixed so that the
# split - and every score computed from it - is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=1)

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Preview the first rows of the feature matrix instead of dumping all of it.
x[:5]

In [None]:
# Fit a Gaussian naive Bayes classifier on the training split and predict the
# held-out rows.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
# Side-by-side table of true and predicted Popularity.
comparison = {'Actual': y_test, 'Predicted': y_pred}
df_output = pd.DataFrame(comparison)
print(df_output)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 3-fold cross-validated accuracy of the naive Bayes model on the training split.
score = cross_val_score(gnb, X_train, y_train, cv=3, scoring='accuracy')

In [None]:
# Mean of the cross-validation scores, scaled to a percentage.
score.mean()*100

In [None]:
# Rebuild the feature matrix and 1-D target for the linear SVC cells.
svc_features = ['Energy','Length','Danceability','Beats.Per.Minute', 'Acousticness']
x = data.loc[:, svc_features].values
y = data.loc[:, 'Popularity'].values

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Seeded 80/20 train/test split for the linear SVC.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

In [None]:
# Linear support-vector classifier. The seed is fixed (matching the seeded
# train/test split in the previous cell) so that repeated runs give the same fit.
lsvc = LinearSVC(random_state=1)

In [None]:
# Fit the linear SVC on the training split.
lsvc.fit(x_train, y_train)

In [207]:
# Predicted Popularity labels for the held-out rows.
y_pred = lsvc.predict(x_test)

In [209]:
# 3-fold cross-validated accuracy of the linear SVC on the training split.
score = cross_val_score(lsvc, x_train, y_train, cv=3, scoring='accuracy')



In [None]:
# Mean of the cross-validation scores (as a fraction, not a percentage).
score.mean()