In [3]:
import pandas as pd

# Read the Spotify songs CSV into a single DataFrame
spotify_songs = pd.read_csv('data/final_spotify_data.csv')

# Split the songs into one DataFrame per playlist genre
# (w = workout, c = chill, p = party, f = focus)
w = spotify_songs[spotify_songs['playlist'].eq('workout')]
c = spotify_songs[spotify_songs['playlist'].eq('chill')]
p = spotify_songs[spotify_songs['playlist'].eq('party')]
f = spotify_songs[spotify_songs['playlist'].eq('focus')]

# Inferential Statistics

## Are the variables significant in terms of predicting playlist genre?

We will use inferential statistics to determine which variables will be good candidates for our models. To do this, I will use hypothesis testing to check whether the variables are significantly different across the playlist categories.

### Are workout and party playlists significantly different from each other?

In the data storytelling stage, we found that there were many duplicate songs shared between the Workout and Party playlists. This might indicate that the two playlists are very similar. I want to perform hypothesis testing to check whether the two playlists are significantly different from each other.

##### Two sample T test:

H0: the means of the samples are the same.

H1: the means of the samples are not the same.


In [6]:
from scipy import stats
from scipy.stats import ttest_ind


# Continuous audio features to compare between the two playlists
features = [
    'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'speechiness', 'tempo', 'valence',
]

# Independent two-sample t-test (workout vs. party) for each feature
for feature in features:
    stat, pval = ttest_ind(w[feature], p[feature])

    print(feature, "\n--------------------")
    print("T Statistic: %.3f" % stat)
    print("P-Value: %f" % pval)

    # Decide at the 5% significance level
    verdict = (
        "The means are the same (do not reject null hypothesis)\n"
        if pval > 0.05
        else "The means are not the same (reject null hypothesis)\n"
    )
    print(verdict)

acousticness 
--------------------
T Statistic: -7.784
P-Value: 0.000000
The means are not the same (reject null hypothesis)

danceability 
--------------------
T Statistic: -6.753
P-Value: 0.000000
The means are not the same (reject null hypothesis)

duration_ms 
--------------------
T Statistic: 0.903
P-Value: 0.366590
The means are the same (do not reject null hypothesis)

energy 
--------------------
T Statistic: 7.677
P-Value: 0.000000
The means are not the same (reject null hypothesis)

instrumentalness 
--------------------
T Statistic: 3.432
P-Value: 0.000623
The means are not the same (reject null hypothesis)

liveness 
--------------------
T Statistic: 0.322
P-Value: 0.747544
The means are the same (do not reject null hypothesis)

loudness 
--------------------
T Statistic: 6.251
P-Value: 0.000000
The means are not the same (reject null hypothesis)

speechiness 
--------------------
T Statistic: 1.177
P-Value: 0.239262
The means are the same (do not reject null hypothesis)

t

The results from the t-test show that duration, liveness, and speechiness are not significantly different between the workout and party playlists.

The t-test only applies to continuous variables. For our categorical variables, I will use the chi-squared test.

##### Chi-Squared Test

H0: the two samples are independent.

H1: there is a dependency between the samples.

In [35]:
# Keep only the songs from the two playlists being compared
workout_party = spotify_songs[spotify_songs['playlist'].isin(['workout', 'party'])]

# Contingency table of mode x playlist; margins=True appends an 'All' row and column
mode = pd.crosstab(workout_party['mode'], workout_party['playlist'], margins = True)

# Strip the trailing 'All' row/column so only the observed counts are tested.
# Slicing relative to the end stays correct however many categories appear,
# whereas fixed bounds could drop a category or pull in the totals.
observed = mode.iloc[:-1, :-1]
print(mode, "\n")

# The p-value is stored in `pval`, not `p`, so the party-playlist DataFrame `p`
# is not overwritten by a float.
# Note: for a 2x2 table SciPy applies Yates' continuity correction by default.
chi2, pval, dof, exp = stats.chi2_contingency(observed= observed)

print('mode', "\n--------------------")
print("Chi-Squared: %.3f" % chi2)
print("P-Value: %f" % pval)
if pval > 0.05:
    print("The two samples are independent (do not reject null hypothesis)\n")
else:
    print("The two samples are dependent (reject null hypothesis)\n")

playlist  party  workout   All
mode                          
0           224      233   457
1           340      290   630
All         564      523  1087 

mode 
--------------------
Chi-Squared: 2.408
P-Value: 0.120707
The two samples are independent (do not reject null hypothesis)



In [36]:
# Contingency table of key x playlist (workout vs. party), with 'All' margins
key = pd.crosstab(workout_party['key'], workout_party['playlist'], margins = True)

# Strip the trailing 'All' row/column so only the observed counts are tested.
# Slicing relative to the end avoids assuming that exactly 12 keys are present.
observed = key.iloc[:-1, :-1]
print(key, "\n")

# The p-value is stored in `pval`, not `p`, so the party-playlist DataFrame `p`
# is not overwritten by a float.
chi2, pval, dof, exp = stats.chi2_contingency(observed= observed)

print('key', "\n--------------------")
print("Chi-Squared: %.3f" % chi2)
print("P-Value: %f" % pval)
if pval > 0.05:
    print("The two samples are independent (do not reject null hypothesis)\n")
else:
    print("The two samples are dependent (reject null hypothesis)\n")

playlist  party  workout   All
key                           
0            75       46   121
1            82       68   150
2            35       49    84
3            16       13    29
4            35       30    65
5            45       46    91
6            53       50   103
7            47       51    98
8            32       31    63
9            51       41    92
10           44       39    83
11           49       59   108
All         564      523  1087 

key 
--------------------
Chi-Squared: 12.348
P-Value: 0.338056
The two samples are independent (do not reject null hypothesis)



In [37]:
# Contingency table of time signature x playlist (workout vs. party), with 'All' margins
time_sig = pd.crosstab(workout_party['time_signature'], workout_party['playlist'], margins = True)

# Strip the trailing 'All' row/column so only the observed counts are tested.
# Slicing relative to the end avoids assuming that exactly 4 time signatures are
# present; with fewer, a fixed 0:4 slice would include the 'All' totals row.
observed = time_sig.iloc[:-1, :-1]
print(time_sig, "\n")

# The p-value is stored in `pval`, not `p`, so the party-playlist DataFrame `p`
# is not overwritten by a float.
# NOTE(review): rare time signatures can leave very small cell counts, where the
# chi-squared approximation is unreliable -- consider pooling rare categories.
chi2, pval, dof, exp = stats.chi2_contingency(observed= observed)

print('time signature', "\n--------------------")
print("Chi-Squared: %.3f" % chi2)
print("P-Value: %f" % pval)
if pval > 0.05:
    print("The two samples are independent (do not reject null hypothesis)\n")
else:
    print("The two samples are dependent (reject null hypothesis)\n")

playlist        party  workout   All
time_signature                      
1                   1        1     2
3                   7       10    17
4                 548      508  1056
5                   8        4    12
All               564      523  1087 

time signature 
--------------------
Chi-Squared: 1.834
P-Value: 0.607554
The two samples are independent (do not reject null hypothesis)



The tests fail to reject the null hypothesis for all three categorical variables: mode, key, and time signature show no significant association with whether a song belongs to the party or the workout playlist. This tells us that these variables may not be good candidates for predicting between the two playlists.


### Are variables significantly different across playlist genres?

Now let's test whether all playlists are significantly different from each other.



##### Kruskal-Wallis H Test

The Kruskal-Wallis test assesses whether a continuous dependent variable differs significantly across the groups of a categorical independent variable (with two or more groups). It is the non-parametric counterpart to the one-way ANOVA test.

H0: the distributions of all categories are equal.

H1: the distributions of one or more categories are not equal.


In [48]:
from scipy.stats import kruskal


# Continuous audio features to compare across all playlists
features = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness',
            'liveness', 'loudness', 'speechiness', 'tempo', 'valence']

# Select each playlist's rows from `spotify_songs` here instead of reusing the
# single-letter subsets, so this cell does not depend on those names still
# holding DataFrames (a short name such as `p` is easily rebound to a p-value).
playlists = ['workout', 'party', 'chill', 'focus']

# Kruskal-Wallis H test across the four playlists for each feature
for feature in features:
    samples = [spotify_songs.loc[spotify_songs['playlist'] == name, feature]
               for name in playlists]
    stat, pval = kruskal(*samples)

    print(feature, "\n--------------------")
    print("Kruskal-Wallis H: %.3f" % stat)
    print("P-Value: %f" % pval)
    # Decide at the 5% significance level
    if pval > 0.05:
        print("Distributions are the same (do not reject null hypothesis)\n")
    else:
        print("Distributions are not the same (reject null hypothesis)\n")

acousticness 
--------------------
Kruskal-Wallis H: 973.123
P-Value: 0.000000
Distributions are not the same (reject null hypothesis)

danceability 
--------------------
Kruskal-Wallis H: 519.786
P-Value: 0.000000
Distributions are not the same (reject null hypothesis)

duration_ms 
--------------------
Kruskal-Wallis H: 98.430
P-Value: 0.000000
Distributions are not the same (reject null hypothesis)

energy 
--------------------
Kruskal-Wallis H: 1121.449
P-Value: 0.000000
Distributions are not the same (reject null hypothesis)

instrumentalness 
--------------------
Kruskal-Wallis H: 1137.366
P-Value: 0.000000
Distributions are not the same (reject null hypothesis)

liveness 
--------------------
Kruskal-Wallis H: 71.343
P-Value: 0.000000
Distributions are not the same (reject null hypothesis)

loudness 
--------------------
Kruskal-Wallis H: 1197.174
P-Value: 0.000000
Distributions are not the same (reject null hypothesis)

speechiness 
--------------------
Kruskal-Wallis H: 507.99

We reject the null hypothesis for all the variables. For each variable, the distribution in at least one playlist genre differs significantly from the others. Therefore, all of these variables are reasonable candidate features for our model.

Kruskal-Wallis is only valid for continuous variables. We will use a different test for our three categorical variables.



##### Chi-Squared Test

H0: the samples are independent.

H1: there is a dependency between the samples.



In [38]:
# Contingency table of mode x playlist over all playlists, with 'All' margins
mode_tab = pd.crosstab(spotify_songs['mode'], spotify_songs['playlist'], margins = True)

# Strip the trailing 'All' row/column so only the observed counts are kept for
# the chi-squared test. Slicing relative to the end does not assume a fixed
# number of modes or playlists.
observed = mode_tab.iloc[:-1, :-1]
print(mode_tab)

playlist  chill  focus  party  workout   All
mode                                        
0           145    183    224      233   785
1           392    342    340      290  1364
All         537    525    564      523  2149


In [39]:
# Chi-squared test of independence on the observed mode x playlist counts.
# The p-value is stored in `pval`, not `p`, so the party-playlist DataFrame `p`
# is not overwritten by a float.
chi2, pval, dof, exp = stats.chi2_contingency(observed= observed)

print('mode', "\n--------------------")
print("Chi-Squared: %.3f" % chi2)
print("P-Value: %f" % pval)
# Decide at the 5% significance level
if pval > 0.05:
    print("The samples are independent (do not reject null hypothesis)\n")
else:
    print("The samples are dependent (reject null hypothesis)\n")

mode 
--------------------
Chi-Squared: 38.642
P-Value: 0.000000
The samples are dependent (reject null hypothesis)



In [40]:
# Contingency table of key x playlist over all playlists, with 'All' margins
key_tab = pd.crosstab(spotify_songs['key'], spotify_songs['playlist'], margins = True)

# Strip the trailing 'All' row/column so only the observed counts are kept for
# the chi-squared test. Slicing relative to the end does not assume exactly
# 12 keys and 4 playlists.
observed = key_tab.iloc[:-1, :-1]
print(key_tab)

playlist  chill  focus  party  workout   All
key                                         
0            63     68     75       46   252
1            39     56     82       68   245
2            57     39     35       49   180
3            24     25     16       13    78
4            45     38     35       30   148
5            50     50     45       46   191
6            38     39     53       50   180
7            65     71     47       51   234
8            37     42     32       31   142
9            41     38     51       41   171
10           33     26     44       39   142
11           45     33     49       59   186
All         537    525    564      523  2149


In [41]:
# Chi-squared test of independence on the observed key x playlist counts.
# The p-value is stored in `pval`, not `p`, so the party-playlist DataFrame `p`
# is not overwritten by a float.
chi2, pval, dof, exp = stats.chi2_contingency(observed= observed)

print('key', "\n--------------------")
print("Chi-Squared: %.3f" % chi2)
print("P-Value: %f" % pval)
# Decide at the 5% significance level
if pval > 0.05:
    print("The samples are independent (do not reject null hypothesis)\n")
else:
    print("The samples are dependent (reject null hypothesis)\n")

key 
--------------------
Chi-Squared: 64.504
P-Value: 0.000842
The samples are dependent (reject null hypothesis)



In [42]:
# Contingency table of time signature x playlist over all playlists, with 'All' margins
timesig_tab = pd.crosstab(spotify_songs['time_signature'], spotify_songs['playlist'], margins = True)

# Strip the trailing 'All' row/column so only the observed counts are kept for
# the chi-squared test. Slicing relative to the end does not assume exactly
# 4 time signatures; a fixed 0:4 slice would silently drop a fifth one.
observed = timesig_tab.iloc[:-1, :-1]
print(timesig_tab)

playlist        chill  focus  party  workout   All
time_signature                                    
1                   5     12      1        1    19
3                  36     97      7       10   150
4                 485    393    548      508  1934
5                  11     23      8        4    46
All               537    525    564      523  2149


In [43]:
# Chi-squared test of independence on the observed time signature x playlist counts.
# The p-value is stored in `pval`, not `p`, so the party-playlist DataFrame `p`
# is not overwritten by a float.
chi2, pval, dof, exp = stats.chi2_contingency(observed= observed)

print('time signature', "\n--------------------")
print("Chi-Squared: %.3f" % chi2)
print("P-Value: %f" % pval)
# Decide at the 5% significance level
if pval > 0.05:
    print("The samples are independent (do not reject null hypothesis)\n")
else:
    print("The samples are dependent (reject null hypothesis)\n")

time signature 
--------------------
Chi-Squared: 200.912
P-Value: 0.000000
The samples are dependent (reject null hypothesis)



The chi-squared test shows a significant relationship between mode, key, time signature and playlist genre. Therefore, these categorical variables are good candidates for our model. 

### Conclusion:

Our tests show that all our variables significantly vary based on playlist genre. All of the features are good candidates for our model.

However, we did see that duration, liveness, and speechiness are not significantly different between workout and party playlists. This will be good to keep in mind when evaluating our model. 