In [1]:
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
%matplotlib inline

In [2]:
happy = pd.read_csv('happy_counts.csv')
relax = pd.read_csv('relax_counts.csv')
energetic = pd.read_csv('energetic_counts.csv')
sad = pd.read_csv('sad_counts.csv')

In [3]:
happywords = happy.iloc[:,3:]
sadwords = sad.iloc[:,3:]
energeticwords = energetic.iloc[:,3:]
relaxwords = relax.iloc[:,3:]

In [4]:
happywords['giventag'] = 'happy'
sadwords['giventag'] = 'sad'
energeticwords['giventag'] = 'energetic'
relaxwords['giventag'] = 'relax'

In [5]:
relaxwords.head(5)

Unnamed: 0,i,the,you,to,and,a,me,it,not,in,...,motivo,bake,insist,wel,santo,pe,gee,colleg,kad,giventag
0,28,15,2,12,22,2,2,4,2,1,...,0,0,0,0,0,0,0,0,0,relax
1,19,16,2,3,6,12,4,7,1,6,...,0,0,0,0,0,0,0,0,0,relax
2,2,9,8,7,2,1,0,7,3,0,...,0,0,0,0,0,0,0,0,0,relax
3,4,5,0,3,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,relax
4,39,17,21,11,10,6,2,6,12,7,...,0,0,0,0,0,0,0,0,0,relax


For matching grouping songs into multiple tags, we start off with the common classification methods including Gaussian Naive Bayes, Support Vector Machine(SVM), and Random Forest. The focus of our work is to turn the original bag of words format of 5000 words into a collection of features with appropriate dimensionality and enough predictive power. We took two main steps of feature engineering:

- Words selection

- Dimensionality Reduction

We first elaborate on word selection process. 

To ensure that we deal only with words that have strong predictive power, we consider removing the standard stop words from Python  library as well as the top common words across all targeted tags. We also attempted to convert the word counts into term frequency-inverse document frequency(tf-idf) so that we take into account the importance of a word to a song based on how many documents does the word show up in across entire corpus. We first run the models with all raw features with top 5000 word counts format with the three selected model. 

### Raw features models

In [22]:
fullcounts = pd.concat([happywords, sadwords,energeticwords,relaxwords], axis = 0, ignore_index = True)
trainfull = fullcounts.sample(frac = 0.75)
testfull=fullcounts.drop(trainfull.index)

x_full = fullcounts.drop(['giventag'], axis = 1)
y_full = fullcounts['giventag']

x_trainfull = trainfull.drop(['giventag'], axis = 1)
y_trainfull = trainfull['giventag']
x_testfull = testfull.drop(['giventag'], axis = 1)
y_testfull = testfull['giventag']

#### Naive Bayes

In [26]:
gnb_full = GaussianNB()
gnb_full.fit(x_trainfull, y_trainfull)
y_pred_nb = gnb_full.predict(x_testfull)
y_pred_nb_prob = gnb_full.predict_proba(x_testfull)
#y_score_nb = gnb_full.score(x_testfull, y_testfull)

In [28]:
np.mean(y_pred_nb==y_testfull)

0.21160558464223386

#### SVM

In [53]:
svm_full = SVC(kernel='rbf')
svm_full.fit(x_trainfull, y_trainfull)
y_pred_svm = svm_full.predict(x_testfull)

In [55]:
np.mean(y_pred_svm==y_testfull)

0.42844677137870857

#### Random Forest

In [267]:
param_grid = {
                 'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40],
                 'max_depth': [5, 10, 15, 20, 25, 30]
             }

grid_clf = grid_search.GridSearchCV(RF, param_grid, cv=5)
grid_clf.fit(x_trainfull, y_trainfull)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40], 'max_depth': [5, 10, 15, 20, 25, 30]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [269]:
grid_clf.grid_scores_

[mean: 0.36756, std: 0.00736, params: {'n_estimators': 5, 'max_depth': 5},
 mean: 0.37497, std: 0.00854, params: {'n_estimators': 10, 'max_depth': 5},
 mean: 0.37635, std: 0.00467, params: {'n_estimators': 15, 'max_depth': 5},
 mean: 0.37439, std: 0.00426, params: {'n_estimators': 20, 'max_depth': 5},
 mean: 0.37446, std: 0.00439, params: {'n_estimators': 25, 'max_depth': 5},
 mean: 0.37577, std: 0.00624, params: {'n_estimators': 30, 'max_depth': 5},
 mean: 0.37345, std: 0.00086, params: {'n_estimators': 35, 'max_depth': 5},
 mean: 0.37061, std: 0.00529, params: {'n_estimators': 40, 'max_depth': 5},
 mean: 0.32378, std: 0.01114, params: {'n_estimators': 5, 'max_depth': 10},
 mean: 0.33425, std: 0.00781, params: {'n_estimators': 10, 'max_depth': 10},
 mean: 0.33854, std: 0.00870, params: {'n_estimators': 15, 'max_depth': 10},
 mean: 0.34967, std: 0.00534, params: {'n_estimators': 20, 'max_depth': 10},
 mean: 0.35374, std: 0.00376, params: {'n_estimators': 25, 'max_depth': 10},
 mean: 0.

In [268]:
grid_clf.best_params_

{'max_depth': 5, 'n_estimators': 15}

In [29]:
RF_full = RandomForestClassifier(n_estimators = 15, max_depth =5)
RF_full.fit(x_trainfull, y_trainfull)
y_pred_rf = RF_full.predict(x_testfull)
y_pred_rf_prob = RF_full.predict_proba(x_testfull)
#y_score_rf = RF_full.score(x_testfull,y_testfull)

In [30]:
np.mean(y_pred_rf==y_testfull)

0.36801919720767889

Our attempts are made to try removing standard stop words, common words that appear in all four tags, and turning the word counts into term frequency - inverse document frequency format and apply the three selected models again to compare the performance. 

### Remove Stop Words - models

In [31]:
## Try using Python package for stop words
from stop_words import get_stop_words
stop_words = get_stop_words('en')

cols = [c for c in happywords.columns if c not in stop_words]
print (len(cols))

happywords_short = happywords[cols]
sadwords_short = sadwords[cols]
energeticwords_short = energeticwords[cols]
relaxwords_short = relaxwords[cols]
relaxwords_short.head(5)

4898


Unnamed: 0,will,love,know,just,like,now,que,time,can,come,...,motivo,bake,insist,wel,santo,pe,gee,colleg,kad,giventag
0,13,11,0,1,1,0,0,6,0,4,...,0,0,0,0,0,0,0,0,0,relax
1,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,relax
2,0,0,1,3,0,0,0,0,6,0,...,0,0,0,0,0,0,0,0,0,relax
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,relax
4,4,0,4,1,0,2,0,0,10,0,...,0,0,0,0,0,0,0,0,0,relax


In [32]:
counts_short = pd.concat([happywords_short, sadwords_short,energeticwords_short,relaxwords_short], axis = 0, ignore_index = True)
counts_short_tag = counts_short['giventag']
counts_short_ct = counts_short.drop(['giventag'], axis = 1)

train_short = counts_short.sample(frac = 0.75, random_state=100)
test_short=counts_short.drop(train_short.index)

x_short = counts_short.drop(['giventag'], axis = 1)
y_short = counts_short['giventag']

x_train_short = train_short.drop(['giventag'], axis = 1)
y_train_short = train_short['giventag']
x_test_short = test_short.drop(['giventag'], axis = 1)
y_test_short = test_short['giventag']


#### Naive Bayes

In [33]:
gnb_short = GaussianNB()
gnb_short.fit(x_train_short, y_train_short)
y_pred_nb_short = gnb_short.predict(x_test_short)
y_pred_nb_short_prob = gnb_short.predict_proba(x_test_short)
#y_score_nb_short = gnb_short.score(x_test_short, y_test_short)

In [34]:
np.mean(y_pred_nb_short==y_test_short)

0.22709424083769633

#### SVM

In [195]:
svm_short = SVC(kernel='rbf')
svm_short.fit(x_train_short, y_train_short)
y_pred_svm_short = svm_short.predict(x_test_short)

ValueError: X.shape[1] = 4897 should be equal to 5000, the number of features at training time

In [196]:
y_pred_svm_short = svm_short.predict(x_test_short)

In [197]:
np.mean(y_pred_svm_short==y_test_short)

0.41928446771378708

#### Random Forest

In [267]:
param_grid = {
                 'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40],
                 'max_depth': [5, 10, 15, 20, 25, 30]
             }

grid_clf = grid_search.GridSearchCV(RF, param_grid, cv=5)
grid_clf.fit(x_trainfull, y_trainfull)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40], 'max_depth': [5, 10, 15, 20, 25, 30]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [268]:
grid_clf.best_params_

{'max_depth': 5, 'n_estimators': 15}

In [None]:
RF = RandomForestClassifier()

param_grid = {
                 'n_estimators': [5, 10, 15, 20, 25, 30],
                 'max_depth': [5, 10, 15, 20, 25]
             }

grid_clf = grid_search.GridSearchCV(RF, param_grid, cv=5)
grid_clf.fit(x_train_short, y_train_short)

grid_clf.best_params_

In [35]:
RF_short = RandomForestClassifier(n_estimators = 15, max_depth =5)
RF_short.fit(x_train_short, y_train_short)
y_pred_rf_short = RF_short.predict(x_test_short)
y_pred_rf_short_prob = RF_short.predict_proba(x_test_short)
#y_score_rf_short = RF_short.score(x_test_short,y_test_short)

In [36]:
np.mean(y_pred_rf_short==y_test_short)

0.37172774869109948

### Remove Common Words - models

In [37]:
## Add up the total occurrence for each word in each type
total_happy = happywords_short.iloc[:,:4897].sum()
total_sad = sadwords_short.iloc[:,:4897].sum()
total_energetic = energeticwords_short.iloc[:,:4897].sum()
total_relax = relaxwords_short.iloc[:,:4897].sum()

In [38]:
## Sort by total number of occurences
sorted_happy = total_happy.sort_values(ascending=False, inplace=False)
sorted_sad = total_sad.sort_values(ascending=False, inplace=False)
sorted_energetic = total_energetic.sort_values(ascending=False, inplace=False)
sorted_relax = total_relax.sort_values(ascending=False, inplace=False)

## Find the intersection of top 30 occurences in each category
## Try using their intersection to be the stop list - words that do not add much meaning/value
inter1 = np.intersect1d(sorted_happy.index.values[:30],sorted_sad.index.values[:30])
inter2 = np.intersect1d(sorted_energetic.index.values[:30],sorted_relax.index.values[:30])
stop_list = np.intersect1d(inter1, inter2)
stop_list

array(['ca', 'can', 'come', 'feel', 'get', 'go', 'got', 'just', 'know',
       'let', 'like', 'love', 'make', 'never', 'now', 'oh', 'one', 'say',
       'see', 'take', 'time', 'want', 'way', 'will'], dtype=object)

In [39]:
cols = [c for c in happywords_short.columns if c not in stop_list]
print (len(cols))

happywords_final = happywords_short[cols]
sadwords_final = sadwords_short[cols]
energeticwords_final = energeticwords_short[cols]
relaxwords_final = relaxwords_short[cols]

happywords_final['giventag'] = 'happy'
sadwords_final['giventag'] = 'sad'
energeticwords_final['giventag'] = 'energetic'
relaxwords_final['giventag'] = 'relax'

print (happywords_final.shape)

4874
(5312, 4874)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [198]:
counts_nocommon = pd.concat([happywords_final, sadwords_final,energeticwords_final,relaxwords_final], axis = 0, ignore_index = True)
train_nocommon = counts_nocommon.sample(frac = 0.75, random_state=100)
test_nocommon=counts_nocommon.drop(train_nocommon.index)

x_nocommon = counts_nocommon.drop(['giventag'], axis = 1)
y_nocommon = counts_nocommon['giventag']

x_train_nocommon = train_nocommon.drop(['giventag'], axis = 1)
y_train_nocommon = train_nocommon['giventag']
x_test_nocommon = test_nocommon.drop(['giventag'], axis = 1)
y_test_nocommon = test_nocommon['giventag']

#### Naive Bayes

In [41]:
gnb_nocommon = GaussianNB()
gnb_nocommon.fit(x_train_nocommon, y_train_nocommon)
y_pred_nb_nocommon = gnb_nocommon.predict(x_test_nocommon)
y_pred_nb_nocommon_prob = gnb_nocommon.predict_proba(x_test_nocommon)
#y_score_nb_nocommon = gnb.score(x_test_nocommon, y_test_nocommon)

In [42]:
np.mean(y_pred_nb_nocommon == y_test_nocommon)

0.22709424083769633

#### SVM

In [199]:
svm_nocommon = SVC(kernel='rbf')
svm_nocommon.fit(x_train_nocommon, y_train_nocommon)
y_pred_svm_nocommon = svm_nocommon.predict(x_test_nocommon)
#y_score_svm_nocommon = svm_nocommon.score(x_test_nocommon, y_test_nocommon)

In [200]:
np.mean(y_pred_svm_nocommon==y_test_nocommon)

0.40990401396160558

#### Random Forest

In [267]:
param_grid = {
                 'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40],
                 'max_depth': [5, 10, 15, 20, 25, 30]
             }

grid_clf = grid_search.GridSearchCV(RF, param_grid, cv=5)
grid_clf.fit(x_trainfull, y_trainfull)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40], 'max_depth': [5, 10, 15, 20, 25, 30]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [268]:
grid_clf.best_params_

{'max_depth': 5, 'n_estimators': 15}

In [43]:
RF_nocommon = RandomForestClassifier(n_estimators = 15, max_depth =5)
RF_nocommon.fit(x_train_nocommon, y_train_nocommon)
y_pred_rf_nocommon = RF_nocommon.predict(x_test_nocommon)
y_pred_rf_nocommon_prob = RF_nocommon.predict_proba(x_test_nocommon)
#y_score_rf_nocommon = RF.score(x_test_nocommon,y_test_nocommon)

In [44]:
np.mean(y_pred_rf_nocommon == y_test_nocommon)

0.3763089005235602

### tf-idf - account for relative frequency of words - models

In [60]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False,
                 use_idf=True)
transformer2 = TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=True,
                 use_idf=True)


In [61]:
tfidf = transformer.fit_transform(counts_short_ct)
tfidf = pd.DataFrame(tfidf.toarray())
#counts_short_tag = counts_short_tag.to_frame()
counts_short_tag.index = tfidf.index.values
tfidf_label = pd.concat([tfidf,counts_short_tag], axis = 1)
tfidf_label.columns = counts_short.columns.values
tfidf_label.head(5)

Unnamed: 0,will,love,know,just,like,now,que,time,can,come,...,motivo,bake,insist,wel,santo,pe,gee,colleg,kad,giventag
0,0.011388,0.0,0.0,0.0,0.052688,0.02737,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,happy
1,0.069047,0.0,0.014595,0.029785,0.063891,0.0,0.0,0.0,0.067346,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,happy
2,0.118384,0.0,0.0,0.0,0.0,0.0,0.0,0.047852,0.0,0.149618,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,happy
3,0.038873,0.0,0.0,0.029944,0.012847,0.013347,0.0,0.006734,0.0,0.028074,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,happy
4,0.067695,0.019787,0.0,0.0,0.019575,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,happy


In [62]:
tfidf_train = tfidf_label.sample(frac = 0.75, random_state=100)
tfidf_test=tfidf_label.drop(tfidf_train.index)

tfidf_x = tfidf_label.drop(['giventag'], axis = 1)
tfidf_y = tfidf_label['giventag']

tfidf_x_train = tfidf_train.drop(['giventag'], axis = 1)
tfidf_y_train = tfidf_train['giventag']
tfidf_x_test = tfidf_test.drop(['giventag'], axis = 1)
tfidf_y_test = tfidf_test['giventag']

#### Naive Bayes

In [63]:
tfidf_gnb = GaussianNB()
tfidf_gnb.fit(tfidf_x_train, tfidf_y_train)
tfidf_y_pred_nb = tfidf_gnb.predict(tfidf_x_test)
tfidf_y_pred_nb_prob = tfidf_gnb.predict_proba(tfidf_x_test)
#y_score_nb_nocommon = gnb.score(x_test_nocommon, y_test_nocommon)

In [64]:
np.mean(tfidf_y_pred_nb == tfidf_y_test)

0.21836823734729494

#### SVM

In [149]:
tfidf_svm = SVC(kernel='rbf')
tfidf_svm.fit(tfidf_x_train, tfidf_y_train)
tfidf_y_pred_svm = tfidf_svm.predict(tfidf_x_test)
tfidf_y_score_svm = tfidf_svm.score(tfidf_x_test, tfidf_y_test)

KeyboardInterrupt: 

In [None]:
np.mean(tfidf_y_pred_svm==tfidf_y_test)

#### Random Forest

In [267]:
param_grid = {
                 'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40],
                 'max_depth': [5, 10, 15, 20, 25, 30]
             }

grid_clf = grid_search.GridSearchCV(RF, param_grid, cv=5)
grid_clf.fit(x_trainfull, y_trainfull)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40], 'max_depth': [5, 10, 15, 20, 25, 30]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [268]:
grid_clf.best_params_

{'max_depth': 5, 'n_estimators': 15}

In [65]:
tfidf_RF = RandomForestClassifier(n_estimators = 15, max_depth =5)
tfidf_RF.fit(tfidf_x_train, tfidf_y_train)
tfidf_y_pred_rf = tfidf_RF.predict(tfidf_x_test)
tfidf_y_pred_rf_prob = tfidf_RF.predict_proba(tfidf_x_test)
#y_score_rf_nocommon = RF.score(x_test_nocommon,y_test_nocommon)

In [66]:
np.mean(tfidf_y_pred_rf == tfidf_y_test)

0.3736910994764398

### Apply PCA - stop word removed

In [67]:
from sklearn.decomposition import PCA
pca = PCA(n_components=800)
pca.fit(x_short)
np.sum(pca.explained_variance_ratio_)

0.9049027665447078

In [73]:
x_reduced = pca.transform(x_short)
x_reduced = pd.DataFrame(x_reduced)
y_short = y_short.to_frame()
y_short.index = x_reduced.index.values
xy_reduced = pd.concat([x_reduced, y_short], axis = 1)

In [74]:
train_reduced = xy_reduced.sample(frac = 0.75, random_state=100)
test_reduced=xy_reduced.drop(train_reduced.index)

x_reduced = xy_reduced.drop(['giventag'], axis = 1)
y_reduced = xy_reduced['giventag']

x_train_reduced = train_reduced.drop(['giventag'], axis = 1)
y_train_reduced = train_reduced['giventag']
x_test_reduced = test_reduced.drop(['giventag'], axis = 1)
y_test_reduced = test_reduced['giventag']


#### Naive Bayes

In [76]:
nb_reduced = GaussianNB()
nb_reduced.fit(x_train_reduced, y_train_reduced)
y_pred_nb_reduced = nb_reduced.predict(x_test_reduced)
y_pred_nb_reduced_prob = nb_reduced.predict_proba(x_test_reduced)
#y_score_nb_nocommon = gnb.score(x_test_nocommon, y_test_nocommon)

In [77]:
np.mean(y_pred_nb_reduced==y_test_reduced)

0.34969458987783597

#### SVM

In [150]:
svm_reduced = SVC(kernel='rbf')
svm_reduced.fit(x_train_reduced, y_train_reduced)
y_pred_svm_reduced = svm_reduced.predict(x_test_reduced)
#y_score_svm_reduced = svm_reduced.score(x_test_reduced, y_test_reduced)

In [151]:
np.mean(y_pred_svm_reduced==y_test_reduced)

0.41383071553228623

#### Random Forest

In [267]:
param_grid = {
                 'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40],
                 'max_depth': [5, 10, 15, 20, 25, 30]
             }

grid_clf = grid_search.GridSearchCV(RF, param_grid, cv=5)
grid_clf.fit(x_trainfull, y_trainfull)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40], 'max_depth': [5, 10, 15, 20, 25, 30]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [268]:
grid_clf.best_params_

{'max_depth': 5, 'n_estimators': 15}

In [78]:
RF_reduced = RandomForestClassifier(n_estimators = 15, max_depth =5)
RF_reduced.fit(x_train_reduced, y_train_reduced)
y_pred_rf_reduced = RF_reduced.predict(x_test_reduced)
y_pred_rf_reduced_prob = RF_reduced.predict_proba(x_test_reduced)
#y_score_rf_nocommon = RF.score(x_test_nocommon,y_test_nocommon)

In [79]:
np.mean(y_pred_rf_reduced==y_test_reduced)

0.39419720767888305

### tf-idf then apply PCA - stop word removed

In [90]:
tfidf_pca1 = PCA(n_components=800)
tfidf_pca1.fit(tfidf_x)
np.sum(tfidf_pca1.explained_variance_ratio_)

0.71698660295419991

In [80]:
tfidf_pca = PCA(n_components=1800)
tfidf_pca.fit(tfidf_x)
np.sum(tfidf_pca.explained_variance_ratio_)

0.89454253341783152

In [91]:
tfidf_x_reduced = tfidf_pca1.transform(tfidf_x)
tfidf_x_reduced = pd.DataFrame(tfidf_x_reduced)
tfidf_y_reduced = tfidf_y.to_frame()
tfidf_y_reduced.index = tfidf_x_reduced.index.values
tfidf_xy_reduced = pd.concat([tfidf_x_reduced, tfidf_y_reduced], axis = 1)

In [92]:
tfidf_train_reduced = tfidf_xy_reduced.sample(frac = 0.75, random_state=100)
tfidf_test_reduced=tfidf_xy_reduced.drop(tfidf_train_reduced.index)

tfidf_x_reduced = tfidf_xy_reduced.drop(['giventag'], axis = 1)
tfidf_y_reduced = tfidf_xy_reduced['giventag']

tfidf_x_train_reduced = tfidf_train_reduced.drop(['giventag'], axis = 1)
tfidf_y_train_reduced = tfidf_train_reduced['giventag']
tfidf_x_test_reduced = tfidf_test_reduced.drop(['giventag'], axis = 1)
tfidf_y_test_reduced = tfidf_test_reduced['giventag']


In [87]:
tfidf_y_pred_nb_reduced.shape

(4584,)

In [88]:
tfidf_y_test_reduced.shape

(4584,)

#### Naive Bayes

In [93]:
tfidf_nb_reduced = GaussianNB()
tfidf_nb_reduced.fit(tfidf_x_train_reduced, tfidf_y_train_reduced)
tfidf_y_pred_nb_reduced_prob = tfidf_nb_reduced.predict_proba(tfidf_x_test_reduced)
tfidf_y_pred_nb_reduced = tfidf_nb_reduced.predict(tfidf_x_test_reduced)
#tfidf_nb_reduced.score(tfidf_x_test_reduced, tfidf_y_test_reduced)

In [94]:
np.mean(tfidf_y_pred_nb_reduced==tfidf_y_test_reduced)

0.31173647469458987

#### SVM

In [None]:
tfidf_svm_reduced = SVC(kernel='rbf')
tfidf_svm_reduced.fit(tfidf_x_train_reduced, tfidf_y_train_reduced)
tfidf_y_pred_svm_reduced = tfidf_svm_reduced.predict(tfidf_x_test_reduced)
#y_predprob1 = clffull.predict_proba(x_testfull)
#tfidf_svm_reduced.score(tfidf_x_test_reduced, y_testfull)

In [None]:
np.mean(tfidf_y_pred_svm_reduced==tfidf_y_test_reduced)

#### Random Forest

In [267]:
param_grid = {
                 'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40],
                 'max_depth': [5, 10, 15, 20, 25, 30]
             }

grid_clf = grid_search.GridSearchCV(RF, param_grid, cv=5)
grid_clf.fit(x_trainfull, y_trainfull)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40], 'max_depth': [5, 10, 15, 20, 25, 30]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [268]:
grid_clf.best_params_

{'max_depth': 5, 'n_estimators': 15}

In [95]:
tfidf_rf_reduced = RandomForestClassifier(n_estimators = 15, max_depth =5)
tfidf_rf_reduced.fit(tfidf_x_train_reduced, tfidf_y_train_reduced)
tfidf_y_pred_rf_reduced = tfidf_rf_reduced.predict(tfidf_x_test_reduced)
tfidf_ypred_rf_reduced_prob = tfidf_rf_reduced.predict_proba(tfidf_x_test_reduced)
#score_rf = tfidf_rf_reduced.score(x_testfull,y_testfull)

In [96]:
np.mean(tfidf_y_pred_rf_reduced==tfidf_y_test_reduced)

0.375

### Apply TruncatedSVD

In [110]:
from sklearn.decomposition import TruncatedSVD

In [126]:
tsvd = TruncatedSVD(n_components=800)
tsvd.fit(x_short)
np.sum(tsvd.explained_variance_ratio_)

0.90404657432890489

In [127]:
x_reduced_tsvd = tsvd.transform(x_short)
x_reduced_tsvd = pd.DataFrame(x_reduced_tsvd)
#y_short = y_short.to_frame()
y_short.index = x_reduced_tsvd.index.values
xy_reduced_tsvd = pd.concat([x_reduced_tsvd, y_short], axis = 1)

In [128]:
train_reduced_tsvd = xy_reduced_tsvd.sample(frac = 0.75, random_state=100)
test_reduced_tsvd=xy_reduced_tsvd.drop(train_reduced_tsvd.index)

x_reduced_tsvd = xy_reduced_tsvd.drop(['giventag'], axis = 1)
y_reduced_tsvd = xy_reduced_tsvd['giventag']

x_train_reduced_tsvd = train_reduced_tsvd.drop(['giventag'], axis = 1)
y_train_reduced_tsvd = train_reduced_tsvd['giventag']
x_test_reduced_tsvd = test_reduced_tsvd.drop(['giventag'], axis = 1)
y_test_reduced_tsvd = test_reduced_tsvd['giventag']



#### Naive Bayes

In [129]:
nb_reduced_tsvd = GaussianNB()
nb_reduced_tsvd.fit(x_train_reduced_tsvd, y_train_reduced_tsvd)
y_pred_nb_reduced_tsvd = nb_reduced_tsvd.predict(x_test_reduced_tsvd)
y_pred_nb_reduced_tsvd_prob = nb_reduced_tsvd.predict_proba(x_test_reduced_tsvd)
#y_score_nb_nocommon = gnb.score(x_test_nocommon, y_test_nocommon)

In [130]:
np.mean(y_pred_nb_reduced_tsvd==y_test_reduced_tsvd)

0.3505671902268761

#### SVM

In [137]:
svm_reduced_tsvd = SVC(kernel='rbf')
svm_reduced_tsvd.fit(x_train_reduced_tsvd, y_train_reduced_tsvd)
y_pred_svm_reduced_tsvd = svm_reduced_tsvd.predict(x_test_reduced_tsvd)
#y_score_svm_reduced_tsvd = svm_reduced.score(x_test_reduced, y_test_reduced)

In [138]:
np.mean(y_pred_svm_reduced_tsvd==y_test_reduced)

0.41404886561954624

#### Random Forest

In [267]:
param_grid = {
                 'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40],
                 'max_depth': [5, 10, 15, 20, 25, 30]
             }

grid_clf = grid_search.GridSearchCV(RF, param_grid, cv=5)
grid_clf.fit(x_trainfull, y_trainfull)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40], 'max_depth': [5, 10, 15, 20, 25, 30]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [268]:
grid_clf.best_params_

{'max_depth': 5, 'n_estimators': 15}

In [131]:
RF_reduced_tsvd = RandomForestClassifier(n_estimators = 15, max_depth =5)
RF_reduced_tsvd.fit(x_train_reduced_tsvd, y_train_reduced_tsvd)
y_pred_rf_reduced_tsvd = RF_reduced_tsvd.predict(x_test_reduced_tsvd)
y_pred_rf_reduced_tsvd_prob = RF_reduced_tsvd.predict_proba(x_test_reduced_tsvd)
#y_score_rf_nocommon = RF.score(x_test_nocommon,y_test_nocommon)

In [132]:
np.mean(y_pred_rf_reduced==y_test_reduced)

0.39703315881326351

### tf-idf then apply TruncatedSVD - stop word removed

In [139]:
tfidf_tsvd = TruncatedSVD(n_components=800)
tfidf_tsvd.fit(tfidf_x)
np.sum(tfidf_tsvd.explained_variance_ratio_)

0.71434522174070392

In [141]:
tfidf_x_reduced_tsvd = tfidf_tsvd.transform(tfidf_x)
tfidf_x_reduced_tsvd = pd.DataFrame(tfidf_x_reduced_tsvd)
tfidf_y_reduced_tsvd = tfidf_y.to_frame()
tfidf_y_reduced_tsvd.index = tfidf_x_reduced_tsvd.index.values
tfidf_xy_reduced_tsvd = pd.concat([tfidf_x_reduced_tsvd, tfidf_y_reduced_tsvd], axis = 1)

In [142]:
tfidf_train_reduced_tsvd = tfidf_xy_reduced_tsvd.sample(frac = 0.75, random_state=100)
tfidf_test_reduced_tsvd=tfidf_xy_reduced_tsvd.drop(tfidf_train_reduced_tsvd.index)

tfidf_x_reduced_tsvd = tfidf_xy_reduced_tsvd.drop(['giventag'], axis = 1)
tfidf_y_reduced_tsvd = tfidf_xy_reduced_tsvd['giventag']

tfidf_x_train_reduced_tsvd = tfidf_train_reduced_tsvd.drop(['giventag'], axis = 1)
tfidf_y_train_reduced_tsvd = tfidf_train_reduced_tsvd['giventag']
tfidf_x_test_reduced_tsvd = tfidf_test_reduced_tsvd.drop(['giventag'], axis = 1)
tfidf_y_test_reduced_tsvd = tfidf_test_reduced_tsvd['giventag']

#### Naive Bayes

In [143]:
tfidf_nb_reduced_tsvd = GaussianNB()
tfidf_nb_reduced_tsvd.fit(tfidf_x_train_reduced_tsvd, tfidf_y_train_reduced_tsvd)
tfidf_y_pred_nb_reduced_tsvd_prob = tfidf_nb_reduced_tsvd.predict_proba(tfidf_x_test_reduced_tsvd)
tfidf_y_pred_nb_reduced_tsvd = tfidf_nb_reduced_tsvd.predict(tfidf_x_test_reduced_tsvd)
#tfidf_nb_reduced.score(tfidf_x_test_reduced, tfidf_y_test_reduced)

In [144]:
np.mean(tfidf_y_pred_nb_reduced_tsvd==tfidf_y_test_reduced_tsvd)

0.31849912739965097

#### SVM

In [147]:
tfidf_svm_reduced_tsvd = SVC(kernel='rbf')
tfidf_svm_reduced_tsvd.fit(tfidf_x_train_reduced_tsvd, tfidf_y_train_reduced_tsvd)
tfidf_y_pred_svm_reduced_tsvd = tfidf_svm_reduced_tsvd.predict(tfidf_x_test_reduced_tsvd)
#y_predprob1 = clffull.predict_proba(x_testfull)
#tfidf_svm_reduced.score(tfidf_x_test_reduced, y_testfull)

In [148]:
np.mean(tfidf_y_pred_svm_reduced_tsvd==tfidf_y_test_reduced_tsvd)

0.34162303664921467

#### Random Forest

In [267]:
param_grid = {
                 'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40],
                 'max_depth': [5, 10, 15, 20, 25, 30]
             }

grid_clf = grid_search.GridSearchCV(RF, param_grid, cv=5)
grid_clf.fit(x_trainfull, y_trainfull)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40], 'max_depth': [5, 10, 15, 20, 25, 30]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [268]:
grid_clf.best_params_

{'max_depth': 5, 'n_estimators': 15}

In [145]:
tfidf_rf_reduced_tsvd = RandomForestClassifier(n_estimators = 15, max_depth =5)
tfidf_rf_reduced_tsvd.fit(tfidf_x_train_reduced_tsvd, tfidf_y_train_reduced_tsvd)
tfidf_y_pred_rf_reduced_tsvd = tfidf_rf_reduced_tsvd.predict(tfidf_x_test_reduced_tsvd)
tfidf_ypred_rf_reduced_tsvd_prob = tfidf_rf_reduced_tsvd.predict_proba(tfidf_x_test_reduced_tsvd)
#score_rf = tfidf_rf_reduced.score(x_testfull,y_testfull)

In [146]:
np.mean(tfidf_y_pred_rf_reduced_tsvd==tfidf_y_test_reduced_tsvd)

0.39027050610820246

Preliminary Model Analysis and implications: 
Noticing that we have less than 50% accuracy with all above methods we tried, we want to dig into the reasons behind. By performing only classification decisions on the data that we are certain about based on the selected models for approximately 51% of the training and testing data, we only increase the classification accuracy by 3%. This led us to think that the reasons behind the unsatisfactory results may be from the inappropriate choice of targets. 

By running for individual binary classification for each individual target tags, we notice a significant increase in the prediction accuracy from the same reduced lyrics dataset to of at least 65%. This shed light to our guess of the problem of the target tags not being mutually exclusive. More specifically, we reconsider our choice of target groups and continue to explore natural separation in topics with further unsupervised learning methods.  


### Only classify if max prob - 2nd large prob > 0.07

In [6]:
probs = pd.read_csv('for_draw1.csv')

In [7]:
probs1_val = probs.iloc[:,1:5]
probs1_true = probs.iloc[:,5]
probs1_pred = probs.iloc[:,6]

In [10]:
import heapq
top2 = [heapq.nlargest(2,probs1_val.iloc[i,:]) for i in range(probs1_val.shape[0])]
diff = np.array([abs(x[0]-x[1]) for x in top2])

In [16]:
pred_bool = (diff>=0.07)
#Proportion of data points that we are able to classify with the threshold
np.mean(pred_bool)

0.54183266932270913

In [18]:
pred_bool = (diff>=0.07)
#prediction accuracy
print ("originally prediction accuracy:", np.mean((probs1_pred==probs1_true)))
print ("new prediciton accuracy:", np.mean((probs1_pred==probs1_true)[pred_bool]))

originally prediction accuracy: 0.488047808765
new prediciton accuracy: 0.511029411765


We also want to look at how much accuracy we achieve within each category specifically.

In [19]:
def conf_matrix(pred, true):
    overall = np.mean(pred==true)
    happy = np.mean((pred==true)[(true=='happy') | (true==1)])
    sad = np.mean((pred==true)[(true=='sad') | (true==2)])
    energetic = np.mean((pred==true)[(true=='energetic') | (true==3)])
    relaxing = np.mean((pred==true)[(true=='relaxing') | (true==4)])
    print ("overall: ",overall)
    print ("happy: ",happy)
    print ("sad: ",sad)
    print ("energetic: ",energetic)
    print ("relaxing: ",relaxing)

In [20]:
probs1_pred = np.array(probs1_pred)
probs1_true = np.array(probs1_true)

In [21]:
conf_matrix(probs1_pred,probs1_true)

overall:  0.488047808765
happy:  0.541176470588
sad:  0.790322580645
energetic:  0.027027027027
relaxing:  0.0555555555556


  app.launch_new_instance()


In [267]:
param_grid = {
                 'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40],
                 'max_depth': [5, 10, 15, 20, 25, 30]
             }

grid_clf = grid_search.GridSearchCV(RF, param_grid, cv=5)
grid_clf.fit(x_trainfull, y_trainfull)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40], 'max_depth': [5, 10, 15, 20, 25, 30]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [268]:
grid_clf.best_params_

{'max_depth': 5, 'n_estimators': 15}

In [269]:
grid_clf.grid_scores_

[mean: 0.36756, std: 0.00736, params: {'n_estimators': 5, 'max_depth': 5},
 mean: 0.37497, std: 0.00854, params: {'n_estimators': 10, 'max_depth': 5},
 mean: 0.37635, std: 0.00467, params: {'n_estimators': 15, 'max_depth': 5},
 mean: 0.37439, std: 0.00426, params: {'n_estimators': 20, 'max_depth': 5},
 mean: 0.37446, std: 0.00439, params: {'n_estimators': 25, 'max_depth': 5},
 mean: 0.37577, std: 0.00624, params: {'n_estimators': 30, 'max_depth': 5},
 mean: 0.37345, std: 0.00086, params: {'n_estimators': 35, 'max_depth': 5},
 mean: 0.37061, std: 0.00529, params: {'n_estimators': 40, 'max_depth': 5},
 mean: 0.32378, std: 0.01114, params: {'n_estimators': 5, 'max_depth': 10},
 mean: 0.33425, std: 0.00781, params: {'n_estimators': 10, 'max_depth': 10},
 mean: 0.33854, std: 0.00870, params: {'n_estimators': 15, 'max_depth': 10},
 mean: 0.34967, std: 0.00534, params: {'n_estimators': 20, 'max_depth': 10},
 mean: 0.35374, std: 0.00376, params: {'n_estimators': 25, 'max_depth': 10},
 mean: 0.

In [270]:
RF = RandomForestClassifier(n_estimators = 15, max_depth =5)
RF.fit(x_trainfull, y_trainfull)
ypred_rf = RF.predict(x_testfull)
ypred_rf_prob = RF.predict_proba(x_testfull)
score_rf = RF.score(x_testfull,y_testfull)

In [303]:
ypred_rf_prob_df = pd.DataFrame(ypred_rf_prob)
ypred_rf_prob_df.index = y_testfull.index.values
ypred_rf_prob_df_tag = pd.concat([ypred_rf_prob_df, y_testfull], axis=1)
ypred_rf_prob_df_tag.to_csv('rf_probs.csv')