In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np

In [3]:
import pandas as pd
#Load datas
df_DJIA = pd.read_csv('data/DJIA_table.csv').iloc[::-1]#Dow Jones indust Avg daily infos
df_DJIA.index = range(df_DJIA.shape[0])

df_News = pd.read_csv('data/Combined_News_DJIA.csv') # Top 25 news

df_News = pd.concat((df_News.iloc[:, 0:2],df_News.iloc[:, 2:].astype(str)), axis = 1)
# Visualization
#df_News.head()
#df_DJIA.head()
#df_News.head()

#Drop the first line in order to have the same size than all future labels
df_News = df_News.drop([0])
df_News.index = range(df_News.shape[0])
df_DJIA.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2008-08-08,11432.089844,11759.959961,11388.040039,11734.320312,212830000,11734.320312
1,2008-08-11,11729.669922,11867.110352,11675.530273,11782.349609,183190000,11782.349609
2,2008-08-12,11781.700195,11782.349609,11601.519531,11642.469727,173590000,11642.469727
3,2008-08-13,11632.80957,11633.780273,11453.339844,11532.959961,182550000,11532.959961
4,2008-08-14,11532.070312,11718.280273,11450.889648,11615.929688,159790000,11615.929688


# Definition of labels

## Price variation analysis 


<b>Label 1: </b> Given label of the dataset, corresponding to the evolution of the price for the day (1: increased, 0: decreased) <br>

<b>Label 2: </b> Also a price variation label, but this time we only look for a variation with a given threshold (0: Price varied less than the threshold, 1: more) <br>
    
## Volume variation analysis    

    
<b> Label 3: </b> 3 classes: (0: decrease, 1: Stagnates, 2: Increased), with threshold  

In [4]:
# Label1 = Fluctuation of price (with threshold)
# Label : 2 = Stagnates

P = df_DJIA['Adj Close']
V = df_DJIA['Volume']

delta_P = [] # List filled with price variation
delta_V = [] # List filled with volume variation 


for i in range(1, df_DJIA.shape[0]): #Fill the lists
    delta_P.append(abs((P[i]-P[i-1])/P[i-1]))
    delta_V.append((V[i]-V[i-1])/V[i-1])
    
    
#Initiate a np.array for the new labels values
label2_val = np.empty([len(delta_P), 3]) # Price variation (abs)
label3_val = np.empty([len(delta_V), 3]) # Volume variation

#Threshold initialization
threshold_P = [0.005, 0.01, 0.02]
threshold_V = [0.005, 0.01, 0.02]
for j in range(3):
    
    for i in range(len(delta_P)):
        if delta_P[i] > threshold_P[j]:
            label2_val[i][j] = 1
        else:
            label2_val[i][j] = 0
        
        if delta_V[i] > threshold_V[j]:
            label3_val[i][j] = 1
        elif delta_V[i] < -threshold_V[j]:
            label3_val[i][j] = -1
        else:
            label3_val[i][j] = 0
        
        

df_Label2 = pd.DataFrame(label2_val) 
df_Label2.columns = ['2 T = ' +str(x) for x in threshold_P]

df_Label3 = pd.DataFrame(label3_val)
df_Label3.columns = ['3 T = ' + str(y) for y in threshold_V]

df_News = pd.concat((df_News.iloc[:, 0:2], df_Label2, df_Label3 ,df_News.iloc[:, 2:].astype(str)), axis = 1)

#df_News.head()


In [5]:
delta_P[0]

0.004093061696201033

# Sentiment Analysis

Creation of our features with a sentiment analysis, with the library vaderSentiment


In [6]:
#####################################################################################

# Creates a numpy array with the values of the SA (negativity, neutrality, compound)#
# for each of the Top25 headlines for each day                                      #

#####################################################################################

analyzer = SentimentIntensityAnalyzer() #Sentiment analyzer
threshold_c = 0.5 # Compound threshold

#DataFrame initialization
df_Sentiments = df_News
df_Sent_Full = np.empty([df_News.shape[0],(df_News.shape[1]-8)*3+8])
for i in range(df_News.shape[0]): #Fills the array
    for j in range(df_News.shape[1]):
        if j >= 8:
            score =  analyzer.polarity_scores(df_News.iloc[i,j]) #result of sentiment analysis (dictionary format) 
            df_Sent_Full[i][8 +(j-8)*3] = score['pos']
            df_Sent_Full[i][9 +(j-8)*3] = score['neu']
            df_Sent_Full[i][10 +(j-8)*3]= score['compound']
            
            if  score['pos'] > score['neg'] and score['compound'] >= threshold_c :
                df_Sentiments.iloc[i,j] = 1
            elif score['neg'] > score['pos'] and score['compound'] <= threshold_c:
                df_Sentiments.iloc[i,j] = -1
            else:
                df_Sentiments.iloc[i,j] = 0    

In [7]:
colnames = []
colnames = list(df_News.columns.values[0:8])


for i in range(25):
        colnames.append('H ' + str(i+1) + ' POS')
        colnames.append('H ' + str(i+1) + ' NEU')
        colnames.append('H ' + str(i+1) + ' COM')


df_Sentiment_Full = pd.DataFrame(data = df_Sent_Full, columns = colnames)

df_Sentiment_Full = pd.concat((df_News.iloc[:,:8],df_Sentiment_Full.iloc[:,8:]), axis = 1)
df_Sentiment_Full.iloc[:,8:].head()

Unnamed: 0,H 1 POS,H 1 NEU,H 1 COM,H 2 POS,H 2 NEU,H 2 COM,H 3 POS,H 3 NEU,H 3 COM,H 4 POS,...,H 22 COM,H 23 POS,H 23 NEU,H 23 COM,H 24 POS,H 24 NEU,H 24 COM,H 25 POS,H 25 NEU,H 25 COM
0,0.332,0.668,0.8156,0.0,0.723,-0.3182,0.225,0.775,0.4404,0.149,...,0.0,0.0,0.753,-0.3182,0.263,0.414,-0.1832,0.0,1.0,0.0
1,0.175,0.656,0.0258,0.0,1.0,0.0,0.0,0.503,-0.7845,0.102,...,0.5267,0.14,0.785,0.3818,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.538,-0.7184,0.0,0.751,-0.8074,0.11,0.508,-0.6369,0.0,...,0.4939,0.0,0.598,-0.5719,0.0,0.823,-0.4215,0.0,0.806,-0.34
3,0.184,0.816,0.2023,0.0,1.0,0.0,0.384,0.616,0.6808,0.0,...,-0.5994,0.248,0.571,0.1779,0.0,0.427,-0.6908,0.349,0.651,0.7096
4,0.0,0.671,-0.7481,0.0,1.0,0.0,0.178,0.667,0.4215,0.132,...,-0.7096,0.0,0.737,-0.3612,0.0,1.0,0.0,0.0,1.0,0.0


# PCA

In [8]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

y = df_Sentiment_Full.iloc[:, 1]
X = df_Sentiment_Full.iloc[:, 8:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

clf = PCA(n_components = 50)

clf.fit(X_train, y_train)
X_pca=clf.transform(X)

data = [i for i in range(1,51)]
values = clf.explained_variance_ratio_.cumsum()

plt.plot(data, values)
plt.title('PCA explained variance')
plt.xlabel('Number of components')
plt.ylabel('Explained variance')
plt.show()

<Figure size 640x480 with 1 Axes>

# Random Forest



In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics


results_RF = []
results_RF_new_feat = []

########################################################
# Initial Sentiments
########################################################

print(20*'===')
print(10*' ', 'Initial Sentiments', 10*' ')
print(20*'===')

##############
#Label 1
##############
y = df_Sentiment_Full.iloc[:, 1]
X = df_Sentiment_Full.iloc[:, 8:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


clf = RandomForestClassifier(n_estimators = 100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(5*'===',' Label 1 ', 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
results_RF.append(metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test,y_pred))

##############
#Label 2
##############


#Deal wih imbalance
for i in range(3):
    
    df_label2_1= df_Sentiment_Full.sort_values(['2 T = ' + str(threshold_P[i])], ascending = False)
    n_pos = df_label2_1.loc[df_label2_1['2 T = ' + str(threshold_P[i])] == 1].shape[0]
    df_label2_1 = pd.concat((df_label2_1.iloc[:n_pos, :],df_label2_1.iloc[df_label2_1.shape[0]-n_pos:, :] ))


    y = df_label2_1.iloc[:, i+1]
    X = df_label2_1.iloc[:, 8:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    

    clf = RandomForestClassifier(n_estimators = 100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(5*'===',' Label 2, threshold', str(threshold_P[i]) , 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
    results_RF.append(metrics.accuracy_score(y_test, y_pred))
    print(metrics.classification_report(y_pred,y_test))
    
##############
#Label 3
##############


#Deal wih imbalance
for i in range(3):
    df_label3_1 = df_Sentiment_Full.sort_values(['3 T = ' + str(threshold_V[i])])
    n = df_label3_1.loc[df_label3_1['3 T = ' + str(threshold_V[i])] == -1].shape[0]
    df_label3_1 = pd.concat((df_label3_1.iloc[:n, :],df_label3_1.iloc[n +1:2*n, :] ,df_label3_1.iloc[df_label3_1.shape[0]-n:, :] ))


    y = df_label3_1.iloc[:, 5+i]
    X = df_label3_1.iloc[:, 8:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


    clf = RandomForestClassifier(n_estimators = 100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(5*'===',' Label 3, threshold ', str(threshold_V[i]), 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
    results_RF.append(metrics.accuracy_score(y_test, y_pred))



########################################################
# New features
########################################################

print('\n', 20*'===')
print(10*' ', 'New Features', 10*' ')
print(20*'===')

##############
#Label 1
##############
y = df_Sentiments.iloc[:, 1]
X = df_Sentiments.iloc[:, 8:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


clf = RandomForestClassifier(n_estimators = 100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(5*'===',' Label 1 ', 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
results_RF_new_feat.append(metrics.accuracy_score(y_test, y_pred))

##############
#Label 2
##############


#Deal wih imbalance
for i in range(3):
    
    df_label2_1= df_Sentiments.sort_values(['2 T = ' + str(threshold_P[i])], ascending = False)
    n_pos = df_label2_1.loc[df_label2_1['2 T = ' + str(threshold_P[i])] == 1].shape[0]
    df_label2_1 = pd.concat((df_label2_1.iloc[:n_pos, :],df_label2_1.iloc[df_label2_1.shape[0]-n_pos:, :] ))


    y = df_label2_1.iloc[:, 1+i]
    X = df_label2_1.iloc[:, 8:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


    clf = RandomForestClassifier(n_estimators = 100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(5*'===',' Label 2, threshold', str(threshold_P[i]) , 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
    results_RF_new_feat.append(metrics.accuracy_score(y_test, y_pred))

##############
#Label 3
##############


#Deal wih imbalance
for i in range(3):
    df_label3_1 = df_Sentiments.sort_values(['3 T = ' + str(threshold_V[i])])
    n = df_label3_1.loc[df_label3_1['3 T = ' + str(threshold_V[i])] == -1].shape[0]
    df_label3_1 = pd.concat((df_label3_1.iloc[:n, :],df_label3_1.iloc[n +1:2*n, :] ,df_label3_1.iloc[df_label3_1.shape[0]-n:, :] ))


    y = df_label3_1.iloc[:, 5+i]
    X = df_label3_1.iloc[:, 8:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


    clf = RandomForestClassifier(n_estimators = 100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(5*'===',' Label 3, threshold ', str(threshold_V[i]), 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
    results_RF_new_feat.append(metrics.accuracy_score(y_test, y_pred))



           Initial Sentiments           
 0.5025125628140703
              precision    recall  f1-score   support

           0       0.42      0.34      0.37       264
           1       0.55      0.63      0.59       333

   micro avg       0.50      0.50      0.50       597
   macro avg       0.48      0.49      0.48       597
weighted avg       0.49      0.50      0.49       597

 0.5269607843137255
              precision    recall  f1-score   support

           0       0.38      0.51      0.43       146
           1       0.66      0.54      0.59       262

   micro avg       0.53      0.53      0.53       408
   macro avg       0.52      0.52      0.51       408
weighted avg       0.56      0.53      0.54       408

 0.6590909090909091
              precision    recall  f1-score   support

         0.0       0.01      0.25      0.03         4
         1.0       0.98      0.67      0.79       216

   micro avg       0.66      0.66      0.66       220
   macro avg       0.50    

# SVM

In [10]:
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV

########################################################
# Initial Sentiments
########################################################
results_SVM = []
results_SVM_new_feat = []
test_size = 0.2
gamma = 0.5

print(20*'===')
print(10*' ', 'Initial Sentiments', 10*' ')
print(20*'===')

##############
#Label 1
##############
y = df_Sentiment_Full.iloc[:, 1]
X = df_Sentiment_Full.iloc[:, 8:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)


clf = svm.SVC(gamma = gamma)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(5*'===',' Label 1 ', 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
results_SVM.append(metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_pred, y_test))

##############
#Label 2
##############


#Deal wih imbalance
for i in range(3):
    
    df_label2_1= df_Sentiment_Full.sort_values(['2 T = ' + str(threshold_P[i])], ascending = False)
    n_pos = df_label2_1.loc[df_label2_1['2 T = ' + str(threshold_P[i])] == 1].shape[0]
    df_label2_1 = pd.concat((df_label2_1.iloc[:n_pos, :],df_label2_1.iloc[df_label2_1.shape[0]-n_pos:, :] ))
    

    y = df_label2_1.iloc[:, i+1]
    X = df_label2_1.iloc[:, 8:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size)


    clf = svm.SVC(gamma = gamma)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(5*'===',' Label 2, threshold', str(threshold_P[i]) , 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
    results_SVM.append(metrics.accuracy_score(y_test, y_pred))
    if i >= 0:
        print(metrics.classification_report(y_pred, y_test))
        print(X.loc[df_Sentiment_Full['2 T = 0.01']== 1].shape,X.loc[df_Sentiment_Full['2 T = 0.01']== 1].shape )

###################
#Label 3          #
###################



#Deal wih imbalance
for i in range(3):
    df_label3_1 = df_Sentiment_Full.sort_values(['3 T = ' + str(threshold_V[i])])
    n = df_label3_1.loc[df_label3_1['3 T = ' + str(threshold_V[i])] == -1].shape[0]
    df_label3_1 = pd.concat((df_label3_1.iloc[:n, :],df_label3_1.iloc[n +1:2*n, :] ,df_label3_1.iloc[df_label3_1.shape[0]-n:, :] ))


    y = df_label3_1.iloc[:, i+5]
    X = df_label3_1.iloc[:, 8:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    
    
    
    clf = svm.SVC(gamma = gamma)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(5*'===',' Label 3, threshold ', str(threshold_V[i]), 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
    results_SVM.append(metrics.accuracy_score(y_test, y_pred))


########################################################
# New features
########################################################

print('\n', 20*'===')
print(10*' ', 'New Features', 10*' ')
print(20*'===')

##############
#Label 1
##############
y = df_Sentiments.iloc[:, 1]
X = df_Sentiments.iloc[:, 8:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)


clf = svm.SVC(gamma = gamma)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(5*'===',' Label 1 ', 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
results_SVM_new_feat.append(metrics.accuracy_score(y_test, y_pred))

##############
#Label 2
##############


#Deal wih imbalance
for i in range(3):
    
    df_label2_1= df_Sentiments.sort_values(['2 T = ' + str(threshold_P[i])], ascending = False)
    n_pos = df_label2_1.loc[df_label2_1['2 T = ' + str(threshold_P[i])] == 1].shape[0]
    df_label2_1 = pd.concat((df_label2_1.iloc[:n_pos, :],df_label2_1.iloc[df_label2_1.shape[0]-n_pos:, :] ))


    y = df_label2_1.iloc[:, i+1]
    X = df_label2_1.iloc[:, 8:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)


    clf = svm.SVC(gamma = gamma)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(5*'===',' Label 2, threshold', str(threshold_P[i]) , 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
    results_SVM_new_feat.append(metrics.accuracy_score(y_test, y_pred))

##############
#Label 3
##############


#Deal wih imbalance
for i in range(3):
    df_label3_1 = df_Sentiments.sort_values(['3 T = ' + str(threshold_V[i])])
    n = df_label3_1.loc[df_label3_1['3 T = ' + str(threshold_V[i])] == -1].shape[0]
    df_label3_1 = pd.concat((df_label3_1.iloc[:n, :],df_label3_1.iloc[n +1:2*n, :] ,df_label3_1.iloc[df_label3_1.shape[0]-n:, :] ))


    y = df_label3_1.iloc[:, i+5]
    X = df_label3_1.iloc[:, 8:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    
    
    clf = svm.SVC(gamma = gamma)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(5*'===',' Label 3, threshold ', str(threshold_V[i]), 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
    results_SVM_new_feat.append(metrics.accuracy_score(y_test, y_pred))

           Initial Sentiments           
 0.5050251256281407
              precision    recall  f1-score   support

           0       0.17      0.42      0.25        76
           1       0.79      0.52      0.63       322

   micro avg       0.51      0.51      0.51       398
   macro avg       0.48      0.47      0.44       398
weighted avg       0.67      0.51      0.56       398

 0.5367647058823529
              precision    recall  f1-score   support

           0       0.26      0.45      0.33       103
           1       0.75      0.57      0.65       305

   micro avg       0.54      0.54      0.54       408
   macro avg       0.51      0.51      0.49       408
weighted avg       0.63      0.54      0.57       408

(585, 75) (585, 75)
 0.6909090909090909
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       1.00      0.69      0.82       220

   micro avg       0.69      0.69      0.69       220
   macr

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


 0.661641541038526
 0.6791808873720137
 0.625886524822695

           New Features           
 0.5251256281407035
 0.5465686274509803
 0.6681818181818182
 0.5507246376811594
 0.6365159128978225
 0.6552901023890785
 0.624113475177305


In [13]:
df_Sentiments.iloc[:,8:].head()

Unnamed: 0,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,1,-1,0,-1,0,-1,-1,-1,-1,-1,...,0,0,-1,0,0,-1,0,-1,-1,0
1,0,0,-1,-1,0,-1,-1,-1,0,-1,...,0,0,-1,-1,-1,0,1,0,0,0
2,-1,-1,-1,-1,-1,0,0,0,0,0,...,0,0,0,-1,-1,-1,0,-1,-1,-1
3,0,0,1,-1,-1,-1,1,-1,-1,-1,...,-1,0,0,0,0,-1,-1,0,-1,1
4,-1,0,0,0,-1,-1,0,-1,0,-1,...,-1,-1,0,-1,-1,0,-1,-1,0,0


# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import train_test_split, GridSearchCV

########################################################
# Initial Sentiments
########################################################
results_KNN = []
results_KNN_new_feat = []
test_size = 0.2

n_n = 5

print(20*'===')
print(10*' ', 'Initial Sentiments', 10*' ')
print(20*'===')

##############
#Label 1
##############
y = df_Sentiment_Full.iloc[:, 1]
X = df_Sentiment_Full.iloc[:, 8:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)


clf = KNN(n_neighbors = n_n)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(5*'===',' Label 1 ', 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
results_KNN.append(metrics.accuracy_score(y_test, y_pred))

##############
#Label 2
##############


#Deal wih imbalance
for i in range(3):
    
    df_label2_1= df_Sentiment_Full.sort_values(['2 T = ' + str(threshold_P[i])], ascending = False)
    n_pos = df_label2_1.loc[df_label2_1['2 T = ' + str(threshold_P[i])] == 1].shape[0]
    df_label2_1 = pd.concat((df_label2_1.iloc[:n_pos, :],df_label2_1.iloc[df_label2_1.shape[0]-n_pos:, :] ))
    

    y = df_label2_1.iloc[:, i+1]
    X = df_label2_1.iloc[:, 8:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size)


    clf = KNN(n_neighbors = n_n)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(5*'===',' Label 2, threshold', str(threshold_P[i]) , 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
    results_KNN.append(metrics.accuracy_score(y_test, y_pred))
    print(metrics.classification_report(y_pred, y_test))

###################
#Label 3          #
###################



#Deal wih imbalance
for i in range(3):
    df_label3_1 = df_Sentiment_Full.sort_values(['3 T = ' + str(threshold_V[i])])
    n = df_label3_1.loc[df_label3_1['3 T = ' + str(threshold_V[i])] == -1].shape[0]
    df_label3_1 = pd.concat((df_label3_1.iloc[:n, :],df_label3_1.iloc[n +1:2*n, :] ,df_label3_1.iloc[df_label3_1.shape[0]-n:, :] ))


    y = df_label3_1.iloc[:, i+5]
    X = df_label3_1.iloc[:, 8:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    
    
    
    clf = KNN(n_neighbors = n_n)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(5*'===',' Label 3, threshold ', str(threshold_V[i]), 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
    results_KNN.append(metrics.accuracy_score(y_test, y_pred))

########################################################
# New features
########################################################

print('\n', 20*'===')
print(10*' ', 'New Features', 10*' ')
print(20*'===')

##############
#Label 1
##############
y = df_Sentiments.iloc[:, 1]
X = df_Sentiments.iloc[:, 8:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)


clf = KNN(n_neighbors = n_n)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(5*'===',' Label 1 ', 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
results_KNN_new_feat.append(metrics.accuracy_score(y_test, y_pred))
##############
#Label 2
##############


#Deal wih imbalance
for i in range(3):
    
    df_label2_1= df_Sentiments.sort_values(['2 T = ' + str(threshold_P[i])], ascending = False)
    n_pos = df_label2_1.loc[df_label2_1['2 T = ' + str(threshold_P[i])] == 1].shape[0]
    df_label2_1 = pd.concat((df_label2_1.iloc[:n_pos, :],df_label2_1.iloc[df_label2_1.shape[0]-n_pos:, :] ))


    y = df_label2_1.iloc[:, i+1]
    X = df_label2_1.iloc[:, 8:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)


    clf = KNN(n_neighbors = n_n)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(5*'===',' Label 2, threshold', str(threshold_P[i]) , 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
    results_KNN_new_feat.append(metrics.accuracy_score(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))
    
##############
#Label 3
##############


#Deal wih imbalance
for i in range(3):
    df_label3_1 = df_Sentiments.sort_values(['3 T = ' + str(threshold_V[i])])
    n = df_label3_1.loc[df_label3_1['3 T = ' + str(threshold_V[i])] == -1].shape[0]
    df_label3_1 = pd.concat((df_label3_1.iloc[:n, :],df_label3_1.iloc[n +1:2*n, :] ,df_label3_1.iloc[df_label3_1.shape[0]-n:, :] ))


    y = df_label3_1.iloc[:, i+5]
    X = df_label3_1.iloc[:, 8:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    
    
    clf = KNN(n_neighbors = n_n)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(5*'===',' Label 3, threshold ', str(threshold_V[i]), 5*'===', '\n' ,metrics.accuracy_score(y_test, y_pred))
    results_KNN_new_feat.append(metrics.accuracy_score(y_test, y_pred))

# Result Plots

In [None]:
import matplotlib.pyplot as plt


titles = ['Initial label', 'Price threshold = 0.5%', 'Price threshold = 1%', 'Price threshold = 2%', 'Volume threshold = 0.5%', 'Volume threshold = 1%', 'Volume threshold = 2%']

for i in range(7):
    plt.bar(['KNN', 'RF', 'SVM'],[results_KNN[i],results_RF[i], results_SVM[i]], color = ['r','g','b'])
    plt.xlabel('accuracy score')
    plt.ylabel('Models')
    plt.title(titles[i])
    plt.show()

len(results_SVM_new_feat)

In [None]:
import matplotlib.pyplot as plt


titles = ['Initial label', 'Price threshold = 0.5%', 'Price threshold = 1%', 'Price threshold = 2%', 'Volume threshold = 0.5%', 'Volume threshold = 1%', 'Volume threshold = 2%']

for i in range(7):
    plt.bar(['KNN', 'RF', 'SVM'],[results_KNN_new_feat[i],results_RF_new_feat[i], results_SVM_new_feat[i]], color = ['r','g','b'])
    plt.xlabel('accuracy score')
    plt.ylabel('Models')
    plt.title(titles[i])
    plt.show()

len(results_SVM_new_feat)

# Weekly prediction



In [15]:
week_ret = []
new_feats = np.empty([1985//5,125])

for i in range(5, 1985, 5):
    if abs((df_DJIA.iloc[i,6]-df_DJIA.iloc[i+5,6])/df_DJIA.iloc[i,6]) > 0.0125:        
        week_ret.append(1)
    else:
        week_ret.append(0)
    
for i in range(1985//5):
    for j in range(125):
        if j < 25:
            new_feats[i][j] = df_Sentiment_Full.iloc[:,8:].iloc[i*5,j]
        elif 25 <= j < 50:
            new_feats[i][j] = df_Sentiment_Full.iloc[:,8:].iloc[i*5+1,j-25]
        elif 50 <= j < 75:
            new_feats[i][j] = df_Sentiment_Full.iloc[:,8:].iloc[i*5+2,j-50]
        elif 75 <= j < 100:
            new_feats[i][j] = df_Sentiment_Full.iloc[:,8:].iloc[i*5+3,j-75]
        else:
            new_feats[i][j] = df_Sentiment_Full.iloc[:,8:].iloc[i*5+4,j-100]
    
    

In [16]:
col_names = ['label']
for i in range(125):
    col_names.append(i)

df_weekly = pd.DataFrame(data = new_feats)
df_wlabel = pd.DataFrame(data = week_ret)

df_weekly = pd.concat((df_wlabel, df_weekly), axis = 1)
df_weekly = df_weekly.fillna(0)
df_weekly.columns = col_names

df_weekly.head()

df_weekly.loc[df_weekly['label']== 0].shape

(205, 126)

In [21]:
from sklearn.neighbors import KNeighborsClassifier as KNN
y = df_weekly.iloc[:, 0]
X = df_weekly.iloc[:, 1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3)


clf = KNN(n_neighbors = 3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [22]:
print(metrics.accuracy_score(y_pred, y_test))

0.5083333333333333
