In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from tensorflow.keras import callbacks 
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from keras.utils import np_utils
from mlxtend.plotting import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from sklearn.model_selection import cross_val_score

In [344]:
h_train = pd.read_json("../input/nlp-s6-proj-data/Headline_Trainingdata.json")
h_val = pd.read_json("../input/nlp-s6-proj-data/Headline_Trialdata.json")
h_test = pd.read_json("../input/nlp-s6-proj-data/Headlines_Testdata.json")

## Headline

In [345]:
h_train.head()

Unnamed: 0,id,company,title,sentiment
0,2,Morrisons,Morrisons book second consecutive quarter of s...,0.43
1,3,IMI,IMI posts drop in first-quarter organic revenu...,-0.344
2,4,Glencore,Glencore to refinance its short-term debt earl...,0.34
3,5,Ryanair,EasyJet attracts more passengers in June but s...,0.259
4,6,Barclays,Barclays 'bad bank' chief to step down,-0.231


In [346]:
h_train.values[0]

array([2, 'Morrisons',
       'Morrisons book second consecutive quarter of sales growth', 0.43],
      dtype=object)

In [347]:
pos_neg_neu_train = {-1:[], 0:[], 1:[]}

for i in h_train.values:
    if(i[3]>-1 and i[3]<=-0.25):
        pos_neg_neu_train[-1].append([i[3], i[2]])
    if(i[3]>-0.25 and i[3]<=0.25):
        pos_neg_neu_train[0].append([i[3], i[2]])
    if(i[3]>0.25 and i[3]<=1):
        pos_neg_neu_train[1].append([i[3], i[2]])

In [348]:
print('Negative headlines => ', str(len(pos_neg_neu_train[-1])),'\nNeutral headlines => ', str(len(pos_neg_neu_train[0])),'\nPositive headlines => ', str(len(pos_neg_neu_train[1])))

Negative headlines =>  317 
Neutral headlines =>  428 
Positive headlines =>  397


In [349]:
pos_neg_neu_val = {-1:[], 0:[], 1:[]}

for i in h_val.values:
    if(i[3]>-1 and i[3]<=-0.25):
        pos_neg_neu_val[-1].append([i[1], i[3]])
    if(i[3]>-0.25 and i[3]<=0.25):
        pos_neg_neu_val[0].append([i[1], i[3]])
    if(i[3]>0.25 and i[3]<=1):
        pos_neg_neu_val[1].append([i[1], i[3]])

In [350]:
pos_neg_neu_val

{-1: [['Tesco says UK store closures put 2000 jobs at risk', -0.9],
  ['Reed Elsevier share price slides on underwhelming full-year results',
   -0.9],
  ['Kingfisher bid for Mr Bricolage runs into trouble', -0.30000000000000004],
  ['Kingfisher bid for Mr Bricolage runs into trouble', -0.30000000000000004]],
 0: [['Diageo stays neutral on India boardroom turmoil', -0.2],
  ['BP ends 27-year sponsorship of Tate as falling oil price takes toll',
   -0.2]],
 1: [["CRH's concrete bid for Holcim Lafarge assets", 0.30000000000000004],
  ["CRH's concrete bid for Holcim Lafarge assets", 0.30000000000000004],
  ["Glencore's annual results beat forecasts", 0.9],
  ['Markets Shire up 2.5% and Baxalta up 6% on $32bn deal', 0.8],
  ['Markets Shire up 2.5% and Baxalta up 6% on $32bn deal', 0.8]]}

In [351]:
print('Negative headlines => ', str(len(pos_neg_neu_val[-1])),'\nNeutral headlines => ', str(len(pos_neg_neu_val[0])),'\nPositive headlines => ', str(len(pos_neg_neu_val[1])))

Negative headlines =>  4 
Neutral headlines =>  2 
Positive headlines =>  5


In [352]:
X_vals_tr = []
for i in pos_neg_neu_train:
    for j in pos_neg_neu_train[i]:
        X_vals_tr.append([j[1]])
        if(i == -1):
            X_vals_tr[-1].extend(['Negative'])
        elif(i == 0):
            X_vals_tr[-1].extend(['Neutral'])
        else:
            X_vals_tr[-1].extend(['Positive'])
X_vals_tr = np.array(X_vals_tr)

In [353]:
X_vals_tr[0]

array(['IMI posts drop in first-quarter organic revenue; warns on full year',
       'Negative'], dtype='<U116')

In [354]:
headLine_train = pd.DataFrame({'Title':X_vals_tr[:,0],'Sentiment':X_vals_tr[:,1]})

In [355]:
X_vals_vl = []
for i in pos_neg_neu_val:
    for j in pos_neg_neu_val[i]:
        X_vals_vl.append([j[0]])
        if(i == -1):
            X_vals_vl[-1].extend(['Negative'])
        elif(i == 0):
            X_vals_vl[-1].extend(['Neutral'])
        else:
            X_vals_vl[-1].extend(['Positive'])
X_vals_vl = np.array(X_vals_vl)

In [356]:
X_vals_vl[0]

array(['Tesco says UK store closures put 2000 jobs at risk', 'Negative'],
      dtype='<U67')

In [357]:
headLine_val = pd.DataFrame({'Title':X_vals_vl[:,0],'Sentiment':X_vals_vl[:,1]})

In [358]:
headLine_val.head()

Unnamed: 0,Title,Sentiment
0,Tesco says UK store closures put 2000 jobs at ...,Negative
1,Reed Elsevier share price slides on underwhelm...,Negative
2,Kingfisher bid for Mr Bricolage runs into trouble,Negative
3,Kingfisher bid for Mr Bricolage runs into trouble,Negative
4,Diageo stays neutral on India boardroom turmoil,Neutral


In [359]:
headLine_train.head()

Unnamed: 0,Title,Sentiment
0,IMI posts drop in first-quarter organic revenu...,Negative
1,Barclays share price subdued as bank faces fre...,Negative
2,Diageo sales disappoint as currency and compar...,Negative
3,Smith & Nephew recalls hip-replacement components,Negative
4,Tesco is torn apart as watchdog finds supermar...,Negative


In [363]:
headLine = pd.concat([headLine_train,headLine_val])

In [364]:
headLine.head()

Unnamed: 0,Title,Sentiment
0,IMI posts drop in first-quarter organic revenu...,Negative
1,Barclays share price subdued as bank faces fre...,Negative
2,Diageo sales disappoint as currency and compar...,Negative
3,Smith & Nephew recalls hip-replacement components,Negative
4,Tesco is torn apart as watchdog finds supermar...,Negative


# Word2Vec

In [365]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [366]:
from gensim.models import Word2Vec

In [367]:
from nltk.tokenize import word_tokenize

In [368]:
import gensim

In [369]:
pretext_h=headLine["Title"].apply(gensim.utils.simple_preprocess)

In [370]:
pretex=[" ".join(i) for i in pretext_h]

In [371]:
pretext_h = [i for i in pretext_h.values]

In [372]:
wvmodel = Word2Vec(pretext_h, vector_size=300, window=3, min_count=1, workers=4)

In [373]:
train_features = []
for i in tqdm(pretext_h):
    s = np.zeros(wvmodel.vector_size,dtype='float32')
    for j in i:
        s+=wvmodel.wv.get_vector(wvmodel.wv.key_to_index[j])
    train_features.append(s)
train_features = np.array(train_features)

100%|██████████| 1153/1153 [00:00<00:00, 18564.50it/s]


In [374]:
train_labels = headLine['Sentiment'].values

In [375]:
train_features, test_features, train_labels, test_labels = train_test_split(train_features, train_labels, test_size=0.2, random_state=42)

In [376]:
train_features.shape

(922, 300)

In [377]:
test_features.shape

(231, 300)

# Model Training

In [378]:
from sklearn.linear_model import LogisticRegression

In [379]:
from sklearn import svm

In [380]:
clf = LogisticRegression(random_state=0, solver='lbfgs', penalty='l2')

In [381]:
clf.fit(train_features, train_labels)

LogisticRegression(random_state=0)

In [382]:
# print('Validation Accuracy= '+str(clf.score(val_features, val_labels))+'%')
# for train data:        
print('From train set Accuracy= '+str(clf.score(test_features, test_labels))+'%')

From train set Accuracy= 0.41125541125541126%


In [383]:
clf = svm.SVC(degree=5)
clf.fit(train_features, train_labels)

SVC(degree=5)

In [384]:
scores = cross_val_score(clf, test_features, test_labels, cv=10)

In [385]:
# print('Validation Accuracy= '+str(clf.score(val_features, val_labels))+'%')
# for train data:        
print('From train set Accuracy= '+str(np.mean(scores))+'%')

From train set Accuracy= 0.37228260869565216%


In [386]:
clf = make_pipeline(StandardScaler(),svm.SVC(random_state=0, degree=10, tol=1e-5, C=2.))
clf.fit(train_features, train_labels)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(C=2.0, degree=10, random_state=0, tol=1e-05))])

In [387]:
scores = cross_val_score(clf, test_features, test_labels, cv=10)

In [388]:
# print('Validation Accuracy= '+str(clf.score(val_features, val_labels))+'%')
# for train data:        
print('From train set Accuracy= '+str(np.mean(scores))+'%')

From train set Accuracy= 0.47228260869565214%


In [389]:
clf = RandomForestClassifier(max_depth=50, random_state=0, criterion='entropy')
clf.fit(train_features, train_labels)

RandomForestClassifier(criterion='entropy', max_depth=50, random_state=0)

In [390]:
scores = cross_val_score(clf, test_features, test_labels, cv=10)

In [391]:
# print('Validation Accuracy= '+str(clf.score(val_features, val_labels))+'%')
# for train data:        
print('From train set Accuracy= '+str(np.mean(scores))+'%')

From train set Accuracy= 0.4545289855072464%


In [392]:
from sklearn.neighbors import KNeighborsClassifier

In [393]:
clf = KNeighborsClassifier(n_neighbors=20)
clf.fit(train_features, train_labels)

KNeighborsClassifier(n_neighbors=20)

In [394]:
scores = cross_val_score(clf, test_features, test_labels, cv=10)

In [395]:
# print('Validation Accuracy= '+str(clf.score(val_features, val_labels))+'%')
# for train data:        
print('From train set Accuracy= '+str(np.mean(scores))+'%')

From train set Accuracy= 0.4197463768115942%


In [396]:
from sklearn.ensemble import GradientBoostingClassifier

In [397]:
clf = GradientBoostingClassifier(n_estimators=75, learning_rate=1.0, max_depth=1, random_state=0)
clf.fit(train_features, train_labels)

GradientBoostingClassifier(learning_rate=1.0, max_depth=1, n_estimators=75,
                           random_state=0)

In [398]:
scores = cross_val_score(clf, test_features, test_labels, cv=10)

In [399]:
# print('Validation Accuracy= '+str(clf.score(val_features, val_labels))+'%')
# for train data:        
print('From train set Accuracy= '+str(np.mean(scores))+'%')

From train set Accuracy= 0.4115942028985507%


## Microblogs

In [207]:
mb_train = pd.read_json("../input/nlp-s6-proj-data/Microblog_Trainingdata.json")
mb_val = pd.read_json("../input/nlp-s6-proj-data/Microblog_Trialdata.json")
mb_test = pd.read_json("../input/nlp-s6-proj-data/Microblogs_Testdata.json")

## Headline

In [208]:
mb_train.head()

Unnamed: 0,source,cashtag,sentiment score,id,spans
0,twitter,$FB,0.366,719659409228451840,[watching for bounce tomorrow]
1,twitter,$LUV,0.638,719904304207962112,[record number of passengers served in 2015]
2,stocktwits,$NFLX,-0.494,5329774,[out $NFLX -.35]
3,twitter,$DIA,0.46,719891468173844480,"[Looking for a strong bounce, Lunchtime rally ..."
4,stocktwits,$PLUG,0.403,20091246,[Very intrigued with the technology and growth...


In [209]:
mb_train.values[0]

array(['twitter', '$FB', 0.366, 719659409228451840,
       list(['watching for bounce tomorrow'])], dtype=object)

In [231]:
mb_train.head()

Unnamed: 0,source,cashtag,sentiment score,id,spans
0,twitter,$FB,0.366,719659409228451840,[watching for bounce tomorrow]
1,twitter,$LUV,0.638,719904304207962112,[record number of passengers served in 2015]
2,stocktwits,$NFLX,-0.494,5329774,[out $NFLX -.35]
3,twitter,$DIA,0.46,719891468173844480,"[Looking for a strong bounce, Lunchtime rally ..."
4,stocktwits,$PLUG,0.403,20091246,[Very intrigued with the technology and growth...


In [237]:
pos_neg_neu_train = {-1:[], 0:[], 1:[]}

for i in mb_train.values:
    if(i[2]>-1 and i[2]<=-0.25):
        if(len(i[4])==1):
            pos_neg_neu_train[-1].append([i[4][0], i[2]])
        elif(len(i[4])>1):
            for k in i[4]:
                pos_neg_neu_train[-1].append([i[4][0], i[2]])
    if(i[2]>-0.25 and i[2]<=0.25):
        if(len(i[4])==1):
            pos_neg_neu_train[0].append([i[4][0], i[2]])
        elif(len(i[4])>1):
            for k in i[4]:
                pos_neg_neu_train[0].append([i[4][0], i[2]])
    if(i[2]>0.25 and i[2]<=1):
        if(len(i[4])==1):
            pos_neg_neu_train[1].append([i[4][0], i[2]])
        elif(len(i[4])>1):
            for k in i[4]:
                pos_neg_neu_train[1].append([i[4][0], i[2]])

In [238]:
pos_neg_neu_train

{-1: [['out $NFLX -.35', -0.494],
  ['overbought', -0.296],
  ['absolute garbage still up', -0.546],
  ['absolute garbage still up', -0.546],
  ['absolute garbage still up', -0.546],
  ['Biggest Market Losers', -0.438],
  ['$GOOG $GOOGL would suck', -0.398],
  ["who won't pay anymore", -0.349],
  ["who won't pay anymore", -0.349],
  ['now seems like its helping the downtrend', -0.372],
  ['big dumping', -0.699],
  ['big dumping', -0.699],
  ['Stochastic Overbought', -0.385],
  ['Insiders Are Selling', -0.351],
  ['What goes up...', -0.514],
  ['if $249.84 breaks we see $245 then $240', -0.519],
  ['Put the chum out there at key support then next level down', -0.32],
  ['Put the chum out there at key support then next level down', -0.32],
  ['Alibaba IPO hoopla canâ€™t hide risk of Chinese stocks', -0.39],
  ['$MDXG', -0.402],
  ['$MDXG', -0.402],
  ['Biggest Market Losers', -0.438],
  ['hip sinking?', -0.411],
  ['placed an order to sell 5 shares', -0.379],
  ['Short Setups Looking Nic

In [233]:
print('Negative headlines => ', str(len(pos_neg_neu_train[-1])),'\nNeutral headlines => ', str(len(pos_neg_neu_train[0])),'\nPositive headlines => ', str(len(pos_neg_neu_train[1])))

Negative headlines =>  562 
Neutral headlines =>  426 
Positive headlines =>  1090


In [239]:
pos_neg_neu_val = {-1:[], 0:[], 1:[]}

for i in mb_val.values:
    if(i[3]>-1 and i[3]<=-0.25):
#         pos_neg_neu_val[-1].append([i[1], i[3]])
        if(len(i[1])==1):
            pos_neg_neu_val[-1].append([i[1][0], i[3]])
        elif(len(i[1])>1):
            for k in i[1]:
                pos_neg_neu_val[-1].append([i[1][0], i[3]])
    if(i[3]>-0.25 and i[3]<=0.25):
#         pos_neg_neu_val[0].append([i[1], i[3]])
        if(len(i[1])==1):
            pos_neg_neu_val[0].append([i[1][0], i[3]])
        elif(len(i[1])>1):
            for k in i[1]:
                pos_neg_neu_val[0].append([i[1][0], i[3]])
    if(i[3]>0.25 and i[3]<=1):
#         pos_neg_neu_val[1].append([i[1], i[3]])
        if(len(i[1])==1):
            pos_neg_neu_val[1].append([i[1][0], i[3]])
        elif(len(i[1])>1):
            for k in i[1]:
                pos_neg_neu_val[1].append([i[1][0], i[3]])

In [240]:
pos_neg_neu_val

{-1: [['Putting on a little $F short', -0.454],
  ['short some', -0.464],
  ['its time to sell banks', -0.763],
  ['is a short below 740', -0.48],
  ['is a short below 740', -0.48]],
 0: [],
 1: [['buying opportunity', 0.445],
  ['Scaling Up on Long Position', 0.661],
  ['Entering long', 0.627],
  ['picked some up', 0.653],
  ['time to accumulate for a long position', 0.668],
  ['time to accumulate for a long position', 0.668],
  ['Buying $SBUX on dip', 0.483]]}

In [241]:
print('Negative headlines => ', str(len(pos_neg_neu_val[-1])),'\nNeutral headlines => ', str(len(pos_neg_neu_val[0])),'\nPositive headlines => ', str(len(pos_neg_neu_val[1])))

Negative headlines =>  5 
Neutral headlines =>  0 
Positive headlines =>  7


In [244]:
X_vals_tr = []
for i in pos_neg_neu_train:
    for j in pos_neg_neu_train[i]:
        X_vals_tr.append([j[0]])
        if(i == -1):
            X_vals_tr[-1].extend(['Negative'])
        elif(i == 0):
            X_vals_tr[-1].extend(['Neutral'])
        else:
            X_vals_tr[-1].extend(['Positive'])
X_vals_tr = np.array(X_vals_tr)

In [245]:
X_vals_tr

array([['out $NFLX -.35', 'Negative'],
       ['overbought', 'Negative'],
       ['absolute garbage still up', 'Negative'],
       ...,
       ['Buy stop above 80', 'Positive'],
       ['Airplane And Hospitality Industries Set Their Sights On #Cuba',
        'Positive'],
       ['nice bounce', 'Positive']], dtype='<U129')

In [249]:
microBlog_train = pd.DataFrame({'Title':X_vals_tr[:,0],'Sentiment':X_vals_tr[:,1]})

In [250]:
X_vals_vl = []
for i in pos_neg_neu_val:
    for j in pos_neg_neu_val[i]:
        X_vals_vl.append([j[0]])
        if(i == -1):
            X_vals_vl[-1].extend(['Negative'])
        elif(i == 0):
            X_vals_vl[-1].extend(['Neutral'])
        else:
            X_vals_vl[-1].extend(['Positive'])
X_vals_vl = np.array(X_vals_vl)

In [251]:
X_vals_vl[0]

array(['Putting on a little $F short', 'Negative'], dtype='<U38')

In [253]:
microBlog_val = pd.DataFrame({'Title':X_vals_vl[:,0],'Sentiment':X_vals_vl[:,1]})

In [16]:
microBlog_val.head()

Unnamed: 0,Title,Sentiment
0,Tesco says UK store closures put 2000 jobs at ...,Negative
1,Reed Elsevier share price slides on underwhelm...,Negative
2,Kingfisher bid for Mr Bricolage runs into trouble,Negative
3,Kingfisher bid for Mr Bricolage runs into trouble,Negative
4,Diageo stays neutral on India boardroom turmoil,Neutral


In [254]:
microBlog_train.head()

Unnamed: 0,Title,Sentiment
0,out $NFLX -.35,Negative
1,overbought,Negative
2,absolute garbage still up,Negative
3,absolute garbage still up,Negative
4,absolute garbage still up,Negative


In [255]:
microBlog = pd.concat([microBlog_train,microBlog_val])

In [256]:
microBlog.head()

Unnamed: 0,Title,Sentiment
0,out $NFLX -.35,Negative
1,overbought,Negative
2,absolute garbage still up,Negative
3,absolute garbage still up,Negative
4,absolute garbage still up,Negative


# Word2Vec

In [257]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [258]:
from gensim.models import Word2Vec

In [259]:
from nltk.tokenize import word_tokenize

In [260]:
import gensim

In [312]:
pretext_h=microBlog["Title"].apply(gensim.utils.simple_preprocess)

In [313]:
pretex=[" ".join(i) for i in pretext_h]

In [314]:
pretext_h = [i for i in pretext_h.values]

In [315]:
wvmodel = Word2Vec(pretext_h, vector_size=300, window=5, min_count=1, workers=4)

In [316]:
train_features = []
for i in tqdm(pretext_h):
    s = np.zeros(wvmodel.vector_size,dtype='float32')
    for j in i:
        s+=wvmodel.wv.get_vector(wvmodel.wv.key_to_index[j])
    train_features.append(s)
train_features = np.array(train_features)

100%|██████████| 2090/2090 [00:00<00:00, 41076.12it/s]


In [317]:
train_labels = microBlog['Sentiment'].values

In [318]:
train_features, test_features, train_labels, test_labels = train_test_split(train_features, train_labels, test_size=0.2, random_state=42)

In [319]:
train_features.shape

(1672, 300)

In [320]:
test_features.shape

(418, 300)

# Model Training

In [321]:
from sklearn.linear_model import LogisticRegression

In [322]:
from sklearn import svm

In [323]:
clf = LogisticRegression(random_state=0, solver='lbfgs', penalty='l2')

In [324]:
clf.fit(train_features, train_labels)

LogisticRegression(random_state=0)

In [325]:
# print('Validation Accuracy= '+str(clf.score(val_features, val_labels))+'%')
# for train data:        
print('From train set Accuracy= '+str(clf.score(test_features, test_labels))+'%')

From train set Accuracy= 0.5215311004784688%


In [326]:
clf = svm.SVC(degree=10)
clf.fit(train_features, train_labels)

SVC(degree=10)

In [327]:
scores = cross_val_score(clf, test_features, test_labels, cv=10)

In [328]:
# print('Validation Accuracy= '+str(clf.score(val_features, val_labels))+'%')
# for train data:        
print('From train set Accuracy= '+str(np.mean(scores))+'%')

From train set Accuracy= 0.5216027874564461%


In [330]:
clf = make_pipeline(StandardScaler(),svm.SVC(random_state=0, degree=10, tol=1e-5, C=2))
clf.fit(train_features, train_labels)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(C=2, degree=10, random_state=0, tol=1e-05))])

In [331]:
scores = cross_val_score(clf, test_features, test_labels, cv=10)

In [332]:
# print('Validation Accuracy= '+str(clf.score(val_features, val_labels))+'%')
# for train data:        
print('From train set Accuracy= '+str(np.mean(scores))+'%')

From train set Accuracy= 0.643321718931475%


In [333]:
clf = RandomForestClassifier(max_depth=100, random_state=0, criterion='entropy')
clf.fit(train_features, train_labels)

RandomForestClassifier(criterion='entropy', max_depth=100, random_state=0)

In [334]:
scores = cross_val_score(clf, test_features, test_labels, cv=10)

In [335]:
# print('Validation Accuracy= '+str(clf.score(val_features, val_labels))+'%')
# for train data:        
print('From train set Accuracy= '+str(np.mean(scores))+'%')

From train set Accuracy= 0.6458768873403018%


In [336]:
from sklearn.neighbors import KNeighborsClassifier

In [337]:
clf = KNeighborsClassifier(n_neighbors=20)
clf.fit(train_features, train_labels)

KNeighborsClassifier(n_neighbors=20)

In [338]:
scores = cross_val_score(clf, test_features, test_labels, cv=10)

In [339]:
# print('Validation Accuracy= '+str(clf.score(val_features, val_labels))+'%')
# for train data:        
print('From train set Accuracy= '+str(np.mean(scores))+'%')

From train set Accuracy= 0.5265969802555167%


In [340]:
from sklearn.ensemble import GradientBoostingClassifier

In [341]:
clf = GradientBoostingClassifier(n_estimators=75, learning_rate=1.0, max_depth=1, random_state=0)
clf.fit(train_features, train_labels)

GradientBoostingClassifier(learning_rate=1.0, max_depth=1, n_estimators=75,
                           random_state=0)

In [342]:
scores = cross_val_score(clf, test_features, test_labels, cv=10)

In [343]:
# print('Validation Accuracy= '+str(clf.score(val_features, val_labels))+'%')
# for train data:        
print('From train set Accuracy= '+str(np.mean(scores))+'%')

From train set Accuracy= 0.6364111498257838%
