In [2]:
# Import everything we need.
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

We'll be using EvolutionPopUSA_MainData.csv dataset, posted by Matthias Mauch.


Dataset url: https://figshare.com/articles/Main_Dataset_for_Evolution_of_Popular_Music_USA_1960_2010_/1309953

Dataset was used for an article written by Matthias Mauch titled "The evolution of popular music: USA 1960–2010".  Mauch and his co-authors defined different musical features of songs, along with the artist name and year of popularity, and used these musical features over time.

Article url: http://rsos.royalsocietypublishing.org/content/2/5/150081

In [48]:
# Import csv dataset into jupyter notebook.
# Call csv file music_df.
music_df = pd.read_csv('EvolutionPopUSA_MainData.csv')

# Test that we've successfully loaded dataframe.
music_df.head()

Unnamed: 0,recording_id,artist_name,artist_name_clean,track_name,first_entry,quarter,year,fiveyear,decade,era,...,timb_26,timb_27,timb_28,timb_29,timb_30,timb_31,timb_32,timb_33,timb_34,timb_35
0,1,Suzanne Vega,SUZANNEVEGA,Solitude Standing,1987-09-12,1987 Q3,1987,1985,1980,3,...,7,8,12,5,53,35,2,3,2,0
1,2,Janet Jackson ft Carly Simon,JANETJACKSON,Son Of A Gun (Betcha Think This Song Is About ...,2001-11-24,2001 Q4,2001,2000,2000,4,...,40,19,33,7,48,22,13,23,6,22
2,3,Neneh Cherry,NENEHCHERRY,Heart,1989-12-09,1989 Q4,1989,1985,1980,3,...,18,10,23,8,26,37,11,14,7,63
3,4,Neneh Cherry,NENEHCHERRY,Kisses On The Wind,1989-07-22,1989 Q3,1989,1985,1980,3,...,9,19,8,35,7,15,30,4,9,27
4,5,Junior M.A.F.I.A. Featuring The Notorious B.I.G.,JUNIORMAFIA,Get Money,1996-02-10,1996 Q1,1996,1995,1990,4,...,5,7,22,16,2,7,37,12,3,16


In [4]:
music_df.describe()

Unnamed: 0,recording_id,year,fiveyear,decade,era,cluster,hTopic_01,hTopic_02,hTopic_03,hTopic_04,...,timb_26,timb_27,timb_28,timb_29,timb_30,timb_31,timb_32,timb_33,timb_34,timb_35
count,17094.0,17094.0,17094.0,17094.0,17094.0,17094.0,17094.0,17094.0,17094.0,17094.0,...,17094.0,17094.0,17094.0,17094.0,17094.0,17094.0,17094.0,17094.0,17094.0,17094.0
mean,8547.5,1981.265181,1979.278402,1976.766702,2.632561,7.322511,0.09714,0.15277,0.156575,0.107098,...,16.790277,16.707851,24.827483,10.599392,40.872002,32.116766,10.199544,30.690125,16.265707,9.692641
std,4934.757086,14.861432,14.770766,14.501195,1.01769,3.790798,0.136731,0.167657,0.180013,0.121609,...,22.554304,29.559922,25.023219,18.454009,60.948417,21.217112,13.820388,59.324304,20.351111,19.586098
min,1.0,1960.0,1960.0,1960.0,1.0,1.0,0.000207,0.000207,0.000207,0.000207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4274.25,1968.0,1965.0,1960.0,2.0,4.0,0.00022,0.01514,0.012602,0.000234,...,2.0,1.0,8.0,5.0,3.0,16.0,2.0,2.0,3.0,0.0
50%,8547.5,1979.0,1975.0,1970.0,2.0,8.0,0.03983,0.095753,0.091951,0.065009,...,9.0,6.0,17.0,5.0,17.0,28.0,6.0,9.0,10.0,2.0
75%,12820.75,1994.0,1990.0,1990.0,4.0,11.0,0.14281,0.239772,0.239738,0.169363,...,22.0,19.0,34.0,8.0,53.0,44.0,12.0,31.0,21.0,10.0
max,17094.0,2009.0,2005.0,2000.0,4.0,13.0,0.980424,0.998511,0.981016,0.769407,...,380.0,445.0,299.0,477.0,711.0,185.0,222.0,805.0,308.0,339.0


In [49]:
# To start, we'll 
# Define feature and target dataframes
# To start, lets see if we can predict decade based on all harmonic and timbre classifiers.

# Define our input dataframe of features
decadeInput = music_df.drop(['recording_id', 'artist_name', 'artist_name_clean', 
                             'track_name', 'first_entry','quarter', 'year', 
                             'fiveyear', 'decade', 'era'], axis = 1)

# Define our target labels
decadeLabel = music_df[['decade']].copy()

In [54]:
# using train_test_split, create decadeInputTrain, decadeInputTest, decadeLabelTrain, and decadeLabelTest 
decadeInputTrain, decadeInputTest, decadeLabelTrain, decadeLabelTest = train_test_split(decadeInput, decadeLabel, stratify = decadeLabel)

In [55]:
# Test that the keys match for training dataframes
print (decadeInputTrain.head())
print (decadeLabelTrain.head())

       cluster  hTopic_01  hTopic_02  hTopic_03  hTopic_04  hTopic_05  \
14741       10   0.000217   0.367788   0.287477   0.090879   0.012995   
8772        11   0.102127   0.334457   0.005038   0.082393   0.014514   
2417         3   0.134070   0.138789   0.501406   0.000216   0.035710   
6511         2   0.057820   0.058130   0.223568   0.051892   0.473363   
14129        9   0.068678   0.141597   0.136720   0.422064   0.000227   

       hTopic_06  hTopic_07  hTopic_08  tTopic_01   ...     timb_26  timb_27  \
14741   0.000217   0.207962   0.032465   0.000444   ...           5        2   
8772    0.066803   0.273000   0.121668   0.169106   ...           0       32   
2417    0.000216   0.082599   0.106993   0.000417   ...           1        0   
6511    0.008517   0.098741   0.027969   0.434715   ...          25       31   
14129   0.141188   0.052465   0.037061   0.000597   ...           6        6   

       timb_28  timb_29  timb_30  timb_31  timb_32  timb_33  timb_34  timb_35  


In [56]:
# Test that shapes are correct
print(decadeInputTrain.shape)
print(decadeInputTest.shape)
print(decadeLabelTrain.shape)
print(decadeLabelTest.shape)

(12820, 259)
(4274, 259)
(12820, 1)
(4274, 1)


In [64]:
# make your model! 
gnb = GaussianNB()
gnb.fit(decadeInputTrain, decadeLabelTrain)
decade_music_pred = gnb.predict(decadeInputTest)

  y = column_or_1d(y, warn=True)


In [65]:
# How well did our model do?
# find the accuracy score.
print("Gaussian Naive Bayes accuracy:", round((accuracy_score(decadeLabelTest, decade_music_pred)*100),2))

Gaussian Naive Bayes accuracy: 33.55


In [66]:
# Let's try pruning out just necessary information
smaller_decadeInput = decadeInput[['cluster', 'hTopic_01', 'hTopic_02', 'hTopic_03', 'hTopic_04',
                                   'hTopic_05', 'hTopic_06', 'hTopic_07',
                                   'hTopic_08', 'tTopic_01', 'tTopic_02', 'tTopic_03', 'tTopic_04',
                                   'tTopic_05', 'tTopic_06', 'tTopic_07', 'tTopic_08']].copy()

In [67]:
smaller_decadeInput.head()

Unnamed: 0,cluster,hTopic_01,hTopic_02,hTopic_03,hTopic_04,hTopic_05,hTopic_06,hTopic_07,hTopic_08,tTopic_01,tTopic_02,tTopic_03,tTopic_04,tTopic_05,tTopic_06,tTopic_07,tTopic_08
0,12,0.055174,0.078318,0.553018,0.000223,0.000212,0.047303,0.044518,0.221235,0.444447,0.005474,0.00065,0.134949,0.381252,0.000678,0.031913,0.000636
1,12,0.027827,0.033408,0.587047,0.00021,0.056217,0.000212,0.16991,0.125169,0.308091,0.070154,0.255784,0.000456,0.217727,0.000423,0.000403,0.146964
2,12,0.028861,0.248975,0.549706,0.033614,0.079899,0.000219,0.058511,0.000216,0.368978,0.058471,0.248061,0.011694,0.087592,0.062672,0.000477,0.162054
3,11,0.008139,0.324315,0.195825,0.053928,0.093734,0.054064,0.269774,0.000222,0.343546,0.005986,0.421979,0.000425,0.09889,0.0004,0.000397,0.128377
4,12,0.037964,0.271282,0.42798,0.070706,0.084595,0.000232,0.049696,0.057546,0.263076,0.164987,0.495515,0.000461,0.00436,0.000448,0.000429,0.070725


In [68]:
# using train_test_split, create decadeInputTrain, decadeInputTest, decadeLabelTrain, and decadeLabelTest 
decadeInputTrain, decadeInputTest, decadeLabelTrain, decadeLabelTest = train_test_split(smaller_decadeInput, decadeLabel, stratify = decadeLabel)

In [69]:
# test model with smaller feature dataframe
gnb = GaussianNB()
gnb.fit(decadeInputTrain, decadeLabelTrain)
decade_music_pred = gnb.predict(decadeInputTest)

  y = column_or_1d(y, warn=True)


In [70]:
print("Gaussian Naive Bayes accuracy:", round((accuracy_score(decadeLabelTest, decade_music_pred)*100),2))

Gaussian Naive Bayes accuracy: 40.48


In [18]:
# Maybe we have to many features.
# Try just with hTopics...
hTopic_decadeInput = decadeInput[['hTopic_01', 'hTopic_02', 'hTopic_03', 
                                  'hTopic_04', 'hTopic_05', 'hTopic_06', 
                                  'hTopic_07', 'hTopic_08']].copy()
hTopic_decadeInput.head()

Unnamed: 0,hTopic_01,hTopic_02,hTopic_03,hTopic_04,hTopic_05,hTopic_06,hTopic_07,hTopic_08
0,0.055174,0.078318,0.553018,0.000223,0.000212,0.047303,0.044518,0.221235
1,0.027827,0.033408,0.587047,0.00021,0.056217,0.000212,0.16991,0.125169
2,0.028861,0.248975,0.549706,0.033614,0.079899,0.000219,0.058511,0.000216
3,0.008139,0.324315,0.195825,0.053928,0.093734,0.054064,0.269774,0.000222
4,0.037964,0.271282,0.42798,0.070706,0.084595,0.000232,0.049696,0.057546


In [71]:
# using train_test_split, create decadeInputTrain, decadeInputTest, decadeLabelTrain, and decadeLabelTest 
decadeInputTrain, decadeInputTest, decadeLabelTrain, decadeLabelTest = train_test_split(hTopic_decadeInput, decadeLabel, stratify = decadeLabel)

# test model with smaller feature dataframe
gnb = GaussianNB()
gnb.fit(decadeInputTrain, decadeLabelTrain)
decade_music_pred = gnb.predict(decadeInputTest)

print("Gaussian Naive Bayes accuracy:", round((accuracy_score(decadeLabelTest, decade_music_pred)*100),2))

Gaussian Naive Bayes accuracy: 35.38


  y = column_or_1d(y, warn=True)


In [72]:
# Try just with tTopics...
tTopic_decadeInput = decadeInput[['tTopic_01', 'tTopic_02', 'tTopic_03', 
                                  'tTopic_04','tTopic_05', 'tTopic_06', 
                                  'tTopic_07', 'tTopic_08']].copy()

# using train_test_split, create decadeInputTrain, decadeInputTest, decadeLabelTrain, and decadeLabelTest 
decadeInputTrain, decadeInputTest, decadeLabelTrain, decadeLabelTest = train_test_split(tTopic_decadeInput, decadeLabel, stratify = decadeLabel)

# test model with smaller feature dataframe
gnb = GaussianNB()
gnb.fit(decadeInputTrain, decadeLabelTrain)
decade_music_pred = gnb.predict(decadeInputTest)

print("Gaussian Naive Bayes accuracy:", round((accuracy_score(decadeLabelTest, decade_music_pred)*100),2))

Gaussian Naive Bayes accuracy: 37.86


  y = column_or_1d(y, warn=True)


In [73]:
# Let's try removing unnecessary columns.
# Looking at graphs of data from the paper, H2 H4 and H7 appear to be fairly consistant.
# Let's see if removing cluster, H2 H4 and H7 improves accuracy.
smaller_decadeInput = decadeInput[['hTopic_01', 'hTopic_03', 'hTopic_05', 'hTopic_06',
                                   'hTopic_08', 'tTopic_01', 'tTopic_02', 'tTopic_03',
                                   'tTopic_04', 'tTopic_05', 'tTopic_06', 'tTopic_07',
                                   'tTopic_08']].copy()

# using train_test_split, create decadeInputTrain, decadeInputTest, decadeLabelTrain, and decadeLabelTest 
decadeInputTrain, decadeInputTest, decadeLabelTrain, decadeLabelTest = train_test_split(smaller_decadeInput, decadeLabel, stratify = decadeLabel)

# test model with smaller feature dataframe
gnb = GaussianNB()
gnb.fit(decadeInputTrain, decadeLabelTrain)
decade_music_pred = gnb.predict(decadeInputTest)

# What's our accuracy?
print("Gaussian Naive Bayes accuracy:", round((accuracy_score(decadeLabelTest, decade_music_pred)*100),2))

Gaussian Naive Bayes accuracy: 40.8


  y = column_or_1d(y, warn=True)


In [74]:
# Hmmmmmm, barely better than H1-8 and T1-8 without any omissions.
# In addition to H2, H4 and H7, let's try taking some of the other noisy topics
# that don't have clear trends.  H8, T6, T7 and T8.
# That leaves us with: H1, H3, H5, H6, T1, T2, T3, T4, T5.
smaller_decadeInput = music_df[['hTopic_01', 'hTopic_03', 'hTopic_05', 'hTopic_06',
                                'tTopic_01', 'tTopic_02', 'tTopic_03',
                                'tTopic_04', 'tTopic_05']].copy()

# using train_test_split, create decadeInputTrain, decadeInputTest, decadeLabelTrain, and decadeLabelTest 
decadeInputTrain, decadeInputTest, decadeLabelTrain, decadeLabelTest = train_test_split(smaller_decadeInput, decadeLabel, stratify = decadeLabel)

# test model with smaller feature dataframe
gnb = GaussianNB()
gnb.fit(decadeInputTrain, decadeLabelTrain)
decade_music_pred = gnb.predict(decadeInputTest)

# What's our accuracy?
print("Gaussian Naive Bayes accuracy:", round((accuracy_score(decadeLabelTest, decade_music_pred)*100),2))

Gaussian Naive Bayes accuracy: 40.92


  y = column_or_1d(y, warn=True)


In [24]:
# Still not much better!  Time for a different approach...

In [90]:
# Maybe our labels are too broad, 4 classifications might be too much
# for this dataset.
#
# Let's generalize into two classifications for our labels:
# Songs before 1980 (False) and songs after 1980 (True)

decadeLabel_mask = (decadeLabel > 1980.00)
# If we change to decadeLabel > 1970.00, accuracy decreases to level of 1960.00 classifier alone, ~65%.

decadeLabel_mask.head()

Unnamed: 0,decade
0,True
1,True
2,True
3,True
4,True


In [91]:
# smaller_decadeInput already defined
# with just H1, H3, H5, H6, T1, T2, T3, T4, T5.

# using train_test_split, create decadeInputTrain, decadeInputTest, decadeLabelTrain, and decadeLabelTest 
decadeInputTrain, decadeInputTest, decadeLabelTrain, decadeLabelTest = train_test_split(smaller_decadeInput, decadeLabel_mask, stratify = decadeLabel_mask)

# test model with smaller feature dataframe
gnb = GaussianNB()
gnb.fit(decadeInputTrain, decadeLabelTrain)
decade_music_pred = gnb.predict(decadeInputTest)

# What's our accuracy?
print("Gaussian Naive Bayes accuracy:", round((accuracy_score(decadeLabelTest, decade_music_pred)*100),2))

Gaussian Naive Bayes accuracy: 66.45


  y = column_or_1d(y, warn=True)


In [77]:
# That was our best model yet!  Let's take a look at the confusion matrix.
# Import confusion matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [78]:
# Generate the confusion matrix and classification report
print ("THIS IS THE CONFUSION MATRIX:")
print(confusion_matrix(decadeLabelTest, decade_music_pred))
print ("THIS IS THE CLASSIFICATION REPORT:")
print(classification_report(decadeLabelTest, decade_music_pred))

THIS IS THE CONFUSION MATRIX:
[[2679  256]
 [ 856  483]]
THIS IS THE CLASSIFICATION REPORT:
             precision    recall  f1-score   support

      False       0.76      0.91      0.83      2935
       True       0.65      0.36      0.46      1339

avg / total       0.73      0.74      0.71      4274



In [79]:
# Limiting our targets helps our classifier perform better.
# What if we narrow it down even more, to just one decade?
decadeLabel_mask_1960 = (decadeLabel == 1960.00)
decadeLabel_mask_1970 = (decadeLabel == 1970.00)
decadeLabel_mask_1980 = (decadeLabel == 1980.00)
decadeLabel_mask_1990 = (decadeLabel == 1990.00)
decadeLabel_mask_2000 = (decadeLabel == 2000.00)

# smaller_decadeInput already defined

# using train_test_split, create decadeInputTrain, decadeInputTest, decadeLabelTrain, and decadeLabelTest 
decadeInputTrain1, decadeInputTest1, decadeLabelTrain1, decadeLabelTest1 = train_test_split(smaller_decadeInput, decadeLabel_mask_1960, stratify = decadeLabel_mask_1960)
decadeInputTrain2, decadeInputTest2, decadeLabelTrain2, decadeLabelTest2 = train_test_split(smaller_decadeInput, decadeLabel_mask_1970, stratify = decadeLabel_mask_1970)
decadeInputTrain3, decadeInputTest3, decadeLabelTrain3, decadeLabelTest3 = train_test_split(smaller_decadeInput, decadeLabel_mask_1980, stratify = decadeLabel_mask_1980)
decadeInputTrain4, decadeInputTest4, decadeLabelTrain4, decadeLabelTest4 = train_test_split(smaller_decadeInput, decadeLabel_mask_1990, stratify = decadeLabel_mask_1990)
decadeInputTrain5, decadeInputTest5, decadeLabelTrain5, decadeLabelTest5 = train_test_split(smaller_decadeInput, decadeLabel_mask_2000, stratify = decadeLabel_mask_2000)

# test each model with smaller feature dataframe
gnb1 = GaussianNB()
gnb1.fit(decadeInputTrain1, decadeLabelTrain1)
decade_music_pred1 = gnb1.predict(decadeInputTest1)

gnb2 = GaussianNB()
gnb2.fit(decadeInputTrain2, decadeLabelTrain2)
decade_music_pred2 = gnb2.predict(decadeInputTest2)

gnb3 = GaussianNB()
gnb3.fit(decadeInputTrain3, decadeLabelTrain3)
decade_music_pred3 = gnb3.predict(decadeInputTest3)

gnb4 = GaussianNB()
gnb4.fit(decadeInputTrain1, decadeLabelTrain4)
decade_music_pred4 = gnb4.predict(decadeInputTest4)

gnb5 = GaussianNB()
gnb5.fit(decadeInputTrain5, decadeLabelTrain5)
decade_music_pred5 = gnb5.predict(decadeInputTest5)

# What's our accuracy?
print("1960 accuracy:", round((accuracy_score(decadeLabelTest1, decade_music_pred1)*100),2), "%")
print("1970 accuracy:", round((accuracy_score(decadeLabelTest2, decade_music_pred2)*100),2), "%")
print("1980 accuracy:", round((accuracy_score(decadeLabelTest3, decade_music_pred3)*100),2), "%")
print("1990 accuracy:", round((accuracy_score(decadeLabelTest4, decade_music_pred4)*100),2), "%")
print("2000 accuracy:", round((accuracy_score(decadeLabelTest5, decade_music_pred5)*100),2), "%")

1960 accuracy: 63.27 %
1970 accuracy: 64.86 %
1980 accuracy: 76.72 %
1990 accuracy: 85.47 %
2000 accuracy: 81.94 %


  y = column_or_1d(y, warn=True)


In [31]:
# It appears songs released in 1990 are the easiest for our classifier to predict.
# Let's compare the deviation of our features from 1960 and 1990.

In [80]:
# reassign 'decade' column to smaller_decadeInput_copy
smaller_decadeInput_copy = smaller_decadeInput.copy()
smaller_decadeInput_copy['decade'] = music_df['decade']
smaller_decadeInput_copy.head()

Unnamed: 0,hTopic_01,hTopic_03,hTopic_05,hTopic_06,tTopic_01,tTopic_02,tTopic_03,tTopic_04,tTopic_05,decade
0,0.055174,0.553018,0.000212,0.047303,0.444447,0.005474,0.00065,0.134949,0.381252,1980
1,0.027827,0.587047,0.056217,0.000212,0.308091,0.070154,0.255784,0.000456,0.217727,2000
2,0.028861,0.549706,0.079899,0.000219,0.368978,0.058471,0.248061,0.011694,0.087592,1980
3,0.008139,0.195825,0.093734,0.054064,0.343546,0.005986,0.421979,0.000425,0.09889,1980
4,0.037964,0.42798,0.084595,0.000232,0.263076,0.164987,0.495515,0.000461,0.00436,1990


In [36]:
# Assign to dataframes for 1960s and 1990s

In [81]:

df_1960s = smaller_decadeInput_copy.loc[smaller_decadeInput_copy['decade'] == 1960.00]
df_1990s = smaller_decadeInput_copy.loc[smaller_decadeInput_copy['decade'] == 1990.00]
df_1960s.head()

Unnamed: 0,hTopic_01,hTopic_03,hTopic_05,hTopic_06,tTopic_01,tTopic_02,tTopic_03,tTopic_04,tTopic_05,decade
12,0.14544,0.55012,0.039307,0.000211,0.000381,0.422229,0.000409,0.000391,0.000373,1960
13,0.118328,0.114935,0.008476,0.407516,0.112866,0.035378,0.064973,0.134022,0.449708,1960
15,0.121243,0.000252,0.218266,0.443894,0.401684,0.276826,0.000455,0.020672,0.11492,1960
16,0.63797,0.137656,0.061474,0.027952,0.000639,0.038528,0.000594,0.266308,0.475756,1960
17,0.321529,0.119727,0.000209,0.040157,0.187267,0.182332,0.037438,0.18084,0.151354,1960


In [82]:
df_1990s.head()

Unnamed: 0,hTopic_01,hTopic_03,hTopic_05,hTopic_06,tTopic_01,tTopic_02,tTopic_03,tTopic_04,tTopic_05,decade
4,0.037964,0.42798,0.084595,0.000232,0.263076,0.164987,0.495515,0.000461,0.00436,1990
5,0.032561,0.095055,0.752497,0.01321,0.221291,0.095613,0.425602,0.016056,0.008389,1990
6,0.011025,0.301181,0.071222,0.01187,0.338328,0.189655,0.19872,0.122192,0.000482,1990
7,0.000211,0.025405,0.823912,0.043281,0.244026,0.247049,0.452571,0.000369,0.008787,1990
10,0.000217,0.614482,0.010395,0.000213,0.061883,0.737034,0.122796,0.027249,0.015724,1990


In [83]:
# Describe the data, look at standard deviation
df_1960s.describe()

Unnamed: 0,hTopic_01,hTopic_03,hTopic_05,hTopic_06,tTopic_01,tTopic_02,tTopic_03,tTopic_04,tTopic_05,decade
count,4997.0,4997.0,4997.0,4997.0,4997.0,4997.0,4997.0,4997.0,4997.0,4997.0
mean,0.150561,0.107035,0.044319,0.0752,0.082064,0.122852,0.058857,0.207306,0.155454,1960.0
std,0.165212,0.139522,0.087776,0.102996,0.121187,0.176653,0.095832,0.171609,0.203545,0.0
min,0.000207,0.000207,0.000207,0.000207,0.000215,0.000216,0.000273,0.000215,0.000215,1960.0
25%,0.012517,0.002888,0.000211,0.000222,0.000502,0.000615,0.000534,0.06476,0.000593,1960.0
50%,0.100565,0.049972,0.005559,0.0357,0.024831,0.045178,0.021794,0.179421,0.061476,1960.0
75%,0.234467,0.158513,0.052817,0.110503,0.120169,0.170585,0.081408,0.318125,0.241937,1960.0
max,0.980424,0.905078,0.932671,0.778252,0.874877,0.997886,0.998494,0.871096,0.99264,1960.0


In [84]:
# Describe the data, look at standard deviation
df_1990s.describe()

Unnamed: 0,hTopic_01,hTopic_03,hTopic_05,hTopic_06,tTopic_01,tTopic_02,tTopic_03,tTopic_04,tTopic_05,decade
count,2483.0,2483.0,2483.0,2483.0,2483.0,2483.0,2483.0,2483.0,2483.0,2483.0
mean,0.068287,0.184545,0.155307,0.068112,0.20141,0.138588,0.152623,0.118817,0.13942,1990.0
std,0.106868,0.19636,0.248292,0.093167,0.177241,0.18869,0.192173,0.142729,0.21023,0.0
min,0.000207,0.000207,0.000207,0.000207,0.000254,0.00028,0.000282,0.000249,0.000248,1990.0
25%,0.000219,0.024142,0.000211,0.000221,0.055877,0.003628,0.000576,0.005892,0.000734,1990.0
50%,0.022848,0.117528,0.01883,0.028213,0.160388,0.048536,0.065337,0.063977,0.042882,1990.0
75%,0.095836,0.289544,0.208625,0.103638,0.305776,0.200883,0.259273,0.181148,0.18456,1990.0
max,0.916958,0.981016,0.995867,0.544076,0.970539,0.997993,0.85806,0.85825,0.997597,1990.0


In [85]:
# Let's assess the performance of our best classifier, just 1990s
# with a confusion matrix and classification report.
# Generate the confusion matrix and classification report

print("1990 accuracy:", round((accuracy_score(decadeLabelTest4, decade_music_pred4)*100),2), "%")

print ("THIS IS THE CONFUSION MATRIX:")
print(confusion_matrix(decadeLabelTest4, decade_music_pred4))
print ("THIS IS THE CLASSIFICATION REPORT:")
print(classification_report(decadeLabelTest4, decade_music_pred4))
      

1990 accuracy: 85.47 %
THIS IS THE CONFUSION MATRIX:
[[3653    0]
 [ 621    0]]
THIS IS THE CLASSIFICATION REPORT:
             precision    recall  f1-score   support

      False       0.85      1.00      0.92      3653
       True       0.00      0.00      0.00       621

avg / total       0.73      0.85      0.79      4274



  'precision', 'predicted', average, warn_for)


In [86]:
decadeLabelTest4.describe()

Unnamed: 0,decade
count,4274
unique,2
top,False
freq,3653


In [87]:
# Let's assess the performance of 1960s classifier for comparison.

print("1960 accuracy:", round((accuracy_score(decadeLabelTest1, decade_music_pred1)*100),2), "%")

print ("THIS IS THE CONFUSION MATRIX:")
print(confusion_matrix(decadeLabelTest1, decade_music_pred1))
print ("THIS IS THE CLASSIFICATION REPORT:")
print(classification_report(decadeLabelTest1, decade_music_pred1))

1960 accuracy: 63.27 %
THIS IS THE CONFUSION MATRIX:
[[1807 1218]
 [ 352  897]]
THIS IS THE CLASSIFICATION REPORT:
             precision    recall  f1-score   support

      False       0.84      0.60      0.70      3025
       True       0.42      0.72      0.53      1249

avg / total       0.72      0.63      0.65      4274



In [92]:
# Now that we've explored Gaussian NB,
# Let's see how the same data works
# with different machine learning classifiers.

In [93]:
# First, let's try a decision tree classifier
from sklearn import tree

In [94]:
tree_clf = tree.DecisionTreeClassifier()
tree_clf.fit(decadeInputTrain, decadeLabelTrain)
tree_pred = tree_clf.predict(decadeInputTest)

# What is our accuracy?
print("Decision Tree Accuracy:", round((accuracy_score(decadeLabelTest, tree_pred)*100),2))

Decision Tree Accuracy: 61.82


In [95]:
# Interesting.  Similar performance to our 
# Let's also try a perceptron, for something different.
# Import Perceptron
from sklearn.linear_model import perceptron

In [96]:
p_model = perceptron.Perceptron()
p_model.fit(decadeInputTrain, decadeLabelTrain)

# Test accuracy
print('Perceptron Accuracy:', (100 * p_model.score(decadeInputTrain, decadeLabelTrain)))

  y = column_or_1d(y, warn=True)


Perceptron Accuracy: 65.6708268331


In [97]:
# Finally, let's try logistic regression
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression


In [98]:
logreg = LogisticRegression()
logreg.fit(decadeInputTrain, decadeLabelTrain)
logreg_pred = logreg.predict(decadeInputTest)

# Test accuracy
print("Decision Tree Accuracy:", round((accuracy_score(decadeLabelTest, tree_pred)*100),2))

Decision Tree Accuracy: 61.82


  y = column_or_1d(y, warn=True)
