### Preprocessing

In [1]:
import pandas as pd
df_train = pd.read_csv('../Data/training.csv')
df_test= pd.read_csv('../Data/validation.csv')
df_train.head()

Unnamed: 0,file,artist,title,year,lyrics,mood
0,TRBFHQG128F93092E5.h5,Donnie McClurkin,Psalm 27,2000,One thing have I desired up the Lord\r\r\r\r\n...,happy
1,TRAKWXS128F930F798.h5,Lollipop Lust Kill,No Answer (Outro),2002,Though I heard you say you love me\r\r\r\r\nIt...,sad
2,TRASWIV128E0788A84.h5,The Smashing Pumpkins,Real Love,2000,Fall in to the century of supersonic cross\r\r...,happy
3,TRAYTDZ128F93146E3.h5,Stevie Ray Vaughan And Double Trouble,Mary Had A Little Lamb,1988,Mary had a little lamb\r\r\r\r\nIt's fleece wa...,happy
4,TRAROPS128F92F09A5.h5,Carl Belew,Am I That Easy To Forget,1959,Am I that easy to forget?\r\r\r\r\n\r\r\r\r\nY...,sad


#### Label Encoder


In [2]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

X_train = df_train['lyrics'].values 
y_train = df_train['mood'].values

X_test = df_test['lyrics'].values 
y_test = df_test['mood'].values


print('Original: %s ...' %y_train[:5])

le_train = LabelEncoder()
le_train.fit(y_train)
y_train = le_train.transform(y_train)

le_test = LabelEncoder()
le_test.fit(y_test)
y_test=le_test.transform(y_test)

print('Encoded: %s ...' %y_train[:5])

Original: ['happy' 'sad' 'happy' 'happy' 'sad'] ...
Encoded: [0 1 0 0 1] ...


#### Reading the Stop words from English Language

In [3]:
with open('../stopwords.txt', 'r') as infile:
    stop_words = infile.read().splitlines()
print('stop words %s ...' %stop_words[:5])

stop words ['i', 'me', 'my', 'myself', 'we'] ...


### Transform texts into bag of words models - Trying different tokenizers

In [4]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer


"""
The Porter stemming algorithm (or ‘Porter stemmer’) is a process for removing the commoner morphological and inflexional 
endings from words in English. Its main use is as part of a term normalisation process that is usually done when 
setting up Information Retrieval systems.

Snowball Stemmer: https://snowballstem.org/algorithms/

"""

porter = PorterStemmer()
snowball = EnglishStemmer()

# raw words
tokenizer = lambda text: text.split()

# words after Porter stemming 
tokenizer_porter = lambda text: [porter.stem(word) for word in text.split()]

# Words after Snowball stemming
tokenizer_snowball = lambda text: [snowball.stem(word) for word in text.split()]

### Checking the vocabulary size

In [5]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from mlxtend.preprocessing import DenseTransformer

"""

DenseTransformer is a simple transformer that converts a sparse into a dense numpy array, 
It is required for scikit-learn's Pipeline as CountVectorizers are used in combination with estimators 
that are not compatible with sparse matrices.

"""


vector_1 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer)

vector_2 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer_porter)
    
vector_3 = CountVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer_snowball)  

vector_4 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer)

vector_5 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer_porter)
    
vector_6 = TfidfVectorizer(binary=False,
                         stop_words=stop_words,
                         ngram_range=(1,1),
                         preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                         tokenizer=tokenizer_snowball)

  from numpy.core.umath_tests import inner1d


In [6]:
"""

Pipelines are used to Sequentially apply a list of transforms and a final estimator. 
Intermediate steps of pipeline must implement fit and transform methods. 
Final estimator implements the fit.
"""

pipelines = []
vectorizers = [vector_1, vector_2, vector_3, vector_4, vector_5, vector_6]
for v in vectorizers:
    pipelines.append(Pipeline([('vect', v),
                               ('dense', DenseTransformer()),
                               ('clf', RandomForestClassifier(n_estimators=100))]))

In [7]:
print('Vocabulary sizes\n')
labels = ['CountVec', 'CountVec porter', 'CountVec snowball','TfidfVec', 'TfidfVec porter', 'TfidfVec snowball']

for label, v in zip(labels, vectorizers):
    v.fit(X_train)
    print('%s: %s' % (label, len(v.vocabulary_)))

Vocabulary sizes

CountVec: 8188
CountVec porter: 6282
CountVec snowball: 6253
TfidfVec: 8188
TfidfVec porter: 6282
TfidfVec snowball: 6253


### Selecting Models

In [8]:
from sklearn import metrics
from sklearn import cross_validation

labels = ['CountVec', 'CountVec porter', 'CountVec snowball','TfidfVec', 'TfidfVec porter', 'TfidfVec snowball']



dic = {'Data':labels,
     'ROC AUC (%)':[],}

for i,clf in enumerate(pipelines):
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring='roc_auc', cv=10)
    print('clf %s, %s: %s' % (i+1, labels[i], scores.mean()*100))
    dic['ROC AUC (%)'].append('%0.2f (+/- %0.2f)' % (scores.mean()*100, scores.std()*100))



clf 1, CountVec: 67.74744623655916
clf 2, CountVec porter: 67.68660394265234
clf 3, CountVec snowball: 69.46052867383513
clf 4, TfidfVec: 66.58351254480286
clf 5, TfidfVec porter: 68.47804659498209
clf 6, TfidfVec snowball: 69.73920250896057


### Plotting ROC Curve

In [9]:
from sklearn.metrics import roc_curve, auc
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import KFold
from scipy import interp

sns.set()
sns.set_style("whitegrid")

classifier = Pipeline([('vect', TfidfVectorizer(binary=False,
                                             stop_words=stop_words,
                                             ngram_range=(1,1),
                                             preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                                             max_features = 5000,
                                             tokenizer=lambda text: [porter.stem(word) for word in text.split()])),
                        ('dense', DenseTransformer()),
                        ('clf', RandomForestClassifier(n_estimators=100))])


crossValidation = KFold(X_train.shape[0], n_folds=5)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []


for i,(train, test) in enumerate(crossValidation):
    probabilities = classifier.fit(X_train[train], y_train[train]).predict_proba(X_train[test])

    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y_train[test], probabilities[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i+1, roc_auc))


plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Guessing')

mean_tpr /= len(crossValidation)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.tight_layout()
plt.savefig('../Images/roc_RandomForest_TfPorter.png', dpi=300)
plt.legend(loc="lower right")

plt.show()



<Figure size 640x480 with 1 Axes>

### HyperParameter Tuning:  Finding best parameters through GridSearch

In [10]:
vect = TfidfVectorizer(binary=False,
                       stop_words=stop_words,
                       ngram_range=(1,1),
                       preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
                       tokenizer=lambda text: [porter.stem(word) for word in text.split()])

### Chosing the number of estimators

In [11]:
from sklearn.metrics import roc_curve, auc
from sklearn import cross_validation

pipe_1 = Pipeline([
                ('vect',   vect),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=50))])

pipe_2 = Pipeline([
                ('vect',   vect),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=100))])

pipe_3 = Pipeline([
                ('vect',   vect),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=200))])

pipe_4 = Pipeline([
                ('vect',   vect),
                ('dense', DenseTransformer()),
                ('clf', RandomForestClassifier(n_estimators=400))])

labels = [50, 100, 200, 400]

for i,clf in enumerate([pipe_1, pipe_2, pipe_3, pipe_4]):
    scores = cross_validation.cross_val_score(estimator=clf, X=X_train, y=y_train, scoring='roc_auc', cv=10)
    print('clf %s, %s: %0.2f (+/- %0.2f)' % (i+1, labels[i], scores.mean()*100, scores.std()*100))

clf 1, 50: 64.50 (+/- 5.70)
clf 2, 100: 68.25 (+/- 5.21)
clf 3, 200: 71.28 (+/- 6.86)
clf 4, 400: 69.35 (+/- 6.14)


### Gridsearch

In [12]:
X_train_feature = vect.fit_transform(X_train)
#X_train_feature = X_train_feature.toarray()

In [13]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report


classifier_1 = RandomForestClassifier(n_estimators=200)


parameters = [
  {'criterion': ['gini', 'entropy'], 
   'max_features': ['auto', 'log2', 'sqrt'],
   'min_samples_split':[2,3], 
   'min_samples_leaf':[1,2]},
 ]


gridSearch_1 = GridSearchCV(classifier_1, 
                           parameters, 
                           n_jobs=1, 
                           scoring='roc_auc',
                           cv=10)

gridSearch_1.fit(X_train_feature, y_train)

print("Best parameters:")
print()
print(gridSearch_1.best_estimator_)
print()
print("Grid scores:")
print()
for params, mean_score, scores in gridSearch_1.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r"
            % (mean_score, scores.std() / 2, params))



Best parameters:

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Grid scores:

0.697 (+/-0.034) for {'criterion': 'gini', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2}
0.675 (+/-0.028) for {'criterion': 'gini', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 3}
0.709 (+/-0.039) for {'criterion': 'gini', 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2}
0.690 (+/-0.033) for {'criterion': 'gini', 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 3}
0.690 (+/-0.026) for {'criterion': 'gini', 'max_features': 'log2', 'min_samples_leaf':

### Validating the model

In [14]:
from sklearn import metrics

# `pos_label` for positive class, since we have sad=1, happy=0

acc_scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
pre_scorer = metrics.make_scorer(metrics.precision_score, greater_is_better=True, pos_label=0)
rec_scorer = metrics.make_scorer(metrics.recall_score, greater_is_better=True, pos_label=0)
f1_scorer = metrics.make_scorer(metrics.f1_score, greater_is_better=True, pos_label=0)
auc_scorer = metrics.make_scorer(metrics.roc_auc_score, greater_is_better=True)

In [15]:
labels = ['Train CountVec', 'Train CountVec porter', 'Train CountVec snowball',
          'Train TfidfVec', 'Train TfidfVec porter', 'Train TfidfVec snowball', 
          'Test CountVec', 'Test CountVec porter', 'Test CountVec snowball', 
          'Test TfidfVec', 'Test TfidfVec porter', 'Test TfidfVec snowball']

dic = {'Data':labels,
     'ACCURACY (%)':[],
     'PRECISION (%)':[],
     'RECALL (%)':[],
     'F1 (%)':[],
     'ROC AUC (%)':[],
}


for clf in pipelines:
    clf.fit(X_train, y_train)

for clf in pipelines:

    dic['ACCURACY (%)'].append(acc_scorer(estimator=clf, X=X_train, y_true=y_train))
    dic['PRECISION (%)'].append(pre_scorer(estimator=clf, X=X_train, y_true=y_train))
    dic['RECALL (%)'].append(rec_scorer(estimator=clf, X=X_train, y_true=y_train))
    dic['F1 (%)'].append(f1_scorer(estimator=clf, X=X_train, y_true=y_train))
    dic['ROC AUC (%)'].append(auc_scorer(estimator=clf, X=X_train, y_true=y_train))

for clf in pipelines:

    dic['ACCURACY (%)'].append(acc_scorer(estimator=clf, X=X_test, y_true=y_test))
    dic['PRECISION (%)'].append(pre_scorer(estimator=clf, X=X_test, y_true=y_test))
    dic['RECALL (%)'].append(rec_scorer(estimator=clf, X=X_test, y_true=y_test))
    dic['F1 (%)'].append(f1_scorer(estimator=clf, X=X_test, y_true=y_test))
    dic['ROC AUC (%)'].append(auc_scorer(estimator=clf, X=X_test, y_true=y_test))

In [16]:
performance = pd.DataFrame(dic)
performance = performance[['ACCURACY (%)', 'PRECISION (%)', 'RECALL (%)', 'F1 (%)', 'ROC AUC (%)']]
performance.index=(labels)
performance = performance*100
performance = np.round(performance, decimals=2)
performance

Unnamed: 0,ACCURACY (%),PRECISION (%),RECALL (%),F1 (%),ROC AUC (%)
Train CountVec,100.0,100.0,100.0,100.0,100.0
Train CountVec porter,100.0,100.0,100.0,100.0,100.0
Train CountVec snowball,100.0,100.0,100.0,100.0,100.0
Train TfidfVec,100.0,100.0,100.0,100.0,100.0
Train TfidfVec porter,100.0,100.0,100.0,100.0,100.0
Train TfidfVec snowball,100.0,100.0,100.0,100.0,100.0
Test CountVec,64.0,57.14,40.0,47.06,60.0
Test CountVec porter,69.0,64.52,50.0,56.34,65.83
Test CountVec snowball,67.0,61.29,47.5,53.52,63.75
Test TfidfVec,70.0,67.86,47.5,55.88,66.25


In [17]:
performance.to_csv('../Data/randomForest_performance.csv', index_label=False, float_format='%2.2f')