In [1]:
import pandas as pd
import numpy as np
import os
import json
import tensorflow as tf
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from textblob import TextBlob

In [2]:
train_eur = pd.read_json('../data/train/EURUSDV1M_1w.json')
train_vix = pd.read_json('../data/train/VIX_1w.json')

In [3]:
def split_df(df):
  stock = pd.DataFrame(df['stock'].to_list(), index=df.index)
  
  df1 = df.drop(labels='stock', axis=1)
  df1 = pd.concat([stock, df1], axis=1)

  speeches = pd.DataFrame(df.iloc[:, 0].tolist(), index=df.index)
  appended_data = []
  for i in range(0, 20):
    x = pd.DataFrame(speeches.iloc[:, i].tolist(), index=speeches.index)
    appended_data.append(x)

  appended_data = pd.concat(appended_data, axis=1)
  df1 = df1.drop(labels='speech', axis=1)
  final = pd.concat([appended_data, df1], axis=1)
  return final


In [4]:
#attempt 1: textblob
split_train_eur = split_df(train_eur)
split_train_vix = split_df(train_vix)

for i in tqdm(range(split_train_eur.shape[0])):
    for j in range(40):
        split_train_eur.iloc[i,j] = TextBlob(str(split_train_eur.iloc[i,j])).sentiment
        split_train_eur.iloc[i,j] = split_train_eur.iloc[i,j][0]

100%|██████████| 1254/1254 [12:47<00:00,  1.63it/s]


In [5]:
#classification, textblob, equity
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(split_train_eur.iloc[:, 0:60], 
                                                    split_train_eur.iloc[:, 60],
                                                    test_size=0.3)

svm = SVC()
knn = KNeighborsClassifier(n_neighbors=2)
lgbm = lgb.LGBMClassifier()
xgbm = xgb.XGBClassifier()
rtf = RandomForestClassifier()
etc = ExtraTreesClassifier()

svm_model = svm.fit(X_train, y_train)
knn_model = knn.fit(X_train, y_train)
etc_model = etc.fit(X_train, y_train)
rtf_model = rtf.fit(X_train, y_train)


In [6]:
from sklearn.metrics import accuracy_score
print('Accuracy score (SVM): ', accuracy_score(svm_model.predict(X_test), y_test))
print('Accuracy score (KNN): ', accuracy_score(knn_model.predict(X_test), y_test))
print('Accuracy score (ETC): ', accuracy_score(etc_model.predict(X_test), y_test))
print('Accuracy score (RTF): ', accuracy_score(rtf_model.predict(X_test), y_test))

Accuracy score (SVM):  0.6392572944297082
Accuracy score (KNN):  0.649867374005305
Accuracy score (ETC):  0.6312997347480106
Accuracy score (RTF):  0.649867374005305


In [7]:
#regression, textblob, equity
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(split_train_eur.iloc[:, 0:60], 
                                                    split_train_eur.iloc[:, 61],
                                                    test_size=0.3)

rtf = RandomForestRegressor(random_state=42)
etc = ExtraTreesRegressor(random_state=42)
lin = LinearRegression()

etc_model = etc.fit(X_train, y_train)
rtf_model = rtf.fit(X_train, y_train)
lin_model = lin.fit(X_train, y_train)

print('RMSE (ETC): ', (mean_squared_error(etc_model.predict(X_test), y_test))**0.5)
print('RMSE (RTF): ', (mean_squared_error(rtf_model.predict(X_test), y_test))**0.5)
print('RMSE (LIN): ', (mean_squared_error(lin_model.predict(X_test), y_test))**0.5)


RMSE (ETC):  0.3521485992812843
RMSE (RTF):  0.3576859696971005
RMSE (LIN):  0.38045697497387876


In [99]:
split_train_eur = split_df(train_eur)
split_train_vix = split_df(train_vix)

max_chunk = 250
cell = []
for i in tqdm(range(split_train_eur.shape[0])):
    for j in range(split_train_eur.shape[1]):
        if split_train_eur.iloc[i,j] != []:
            cell = split_train_eur.iloc[i,j][0]
            cell = cell.replace('.', '.<eos>')
            cell = cell.replace('?', '?<eos>')
            cell = cell.replace('!', '!<eos>')
            sentences = cell.split('<eos>')
            current_chunk = 0 
            chunks = []
            for sentence in sentences:
                if len(chunks) == current_chunk + 1: 
                    if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                        chunks[current_chunk].extend(sentence.split(' '))
                    else:
                        current_chunk += 1
                        chunks.append(sentence.split(' '))
                else:
                    chunks.append(sentence.split(' '))

            for chunk_id in range(len(chunks)):
                chunks[chunk_id] = ' '.join(chunks[chunk_id])
            res = nlp(chunks)
            split_train_eur.iloc[i,j] = ' '.join([summ['label'] for summ in res])
        else:
            None


In [45]:
# attempt 2.1: sentiment analysis - most common star value
sent_train_eur = pd.read_csv('../sentiment__equity_train.csv')
sent_train_eur = sent_train_eur.drop(columns='Unnamed: 0', axis=1)
sent_train_eur = pd.concat([sent_train_eur, split_train_eur.iloc[:, 40:62]], axis=1)

In [47]:
import ast

def most_frequent(List):
    return max(set(List), key = List.count)

def most_common_star(df):
    for i in tqdm(range(0, df.shape[0])):
        for j in range (0, df.shape[1] - 22):
            if  df.iloc[i, j] == '2.5':
                None
            else:
                temp = ast.literal_eval(str(df.iloc[i, j]))
                for l in range(0, len(temp[0])):
                    lst = []
                    lst.append(temp[0][l]['label'][0])
                df.iloc[i, j] = int(most_frequent(lst))


In [48]:
#classification, sent-analysis, equity
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

most_common_star(sent_train_eur)

X_train, X_test, y_train, y_test = train_test_split(sent_train_eur.iloc[:, 0:60], 
                                                    sent_train_eur.iloc[:, 60],
                                                    test_size=0.3)

svm = SVC()
knn = KNeighborsClassifier(n_neighbors=2)
rtf = RandomForestClassifier()
etc = ExtraTreesClassifier()

svm_model = svm.fit(X_train, y_train)
knn_model = knn.fit(X_train, y_train)
etc_model = etc.fit(X_train, y_train)
rtf_model = rtf.fit(X_train, y_train)


100%|██████████| 1254/1254 [00:22<00:00, 55.86it/s]


In [50]:
from sklearn.metrics import accuracy_score
print('Accuracy score (SVM): ', accuracy_score(svm_model.predict(X_test), y_test))
print('Accuracy score (KNN): ', accuracy_score(knn_model.predict(X_test), y_test))
print('Accuracy score (ETC): ', accuracy_score(etc_model.predict(X_test), y_test))
print('Accuracy score (RTF): ', accuracy_score(rtf_model.predict(X_test), y_test))

Accuracy score (SVM):  0.6206896551724138
Accuracy score (KNN):  0.5782493368700266
Accuracy score (ETC):  0.636604774535809
Accuracy score (RTF):  0.6604774535809018


In [51]:
#regression, sent-analysis, equity
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(sent_train_eur.iloc[:, 0:60], 
                                                    sent_train_eur.iloc[:, 61],
                                                    test_size=0.3)

rtf = RandomForestRegressor(random_state=42)
etc = ExtraTreesRegressor(random_state=42)
lin = LinearRegression()

etc_model = etc.fit(X_train, y_train)
rtf_model = rtf.fit(X_train, y_train)
lin_model = lin.fit(X_train, y_train)

print('RMSE (ETC): ', (mean_squared_error(etc_model.predict(X_test), y_test))**0.5)
print('RMSE (RTF): ', (mean_squared_error(rtf_model.predict(X_test), y_test))**0.5)
print('RMSE (LIN): ', (mean_squared_error(lin_model.predict(X_test), y_test))**0.5)


RMSE (ETC):  0.32726498555302885
RMSE (RTF):  0.33740049920988024
RMSE (LIN):  0.3779932503749537


In [278]:
# attempt 2.2: sentiment analysis - average scores of stars + argmax
sent_train_eur = pd.read_csv('../sentiment__equity_train.csv')
sent_train_eur = sent_train_eur.drop(columns='Unnamed: 0', axis=1)
sent_train_eur = pd.concat([sent_train_eur, split_train_eur.iloc[:, 40:62]], axis=1)

In [279]:
import operator

def star_average_val(df):
    for i in tqdm(range(0, df.shape[0])):
        for j in range(0, df.shape[1] - 22):
            if  df.iloc[i, j] == '2.5':
                None
            else:
                temp = ast.literal_eval(str(df.iloc[i, j]))   
                first = []
                second = []
                third = []
                fourth = []
                fifth = []
                avg_stars = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}

                for l in range(0, len(temp[0])):
                    if temp[0][l]['label'][0] == str(1):
                        first.append(temp[0][l]['score'])
                        avg_stars[1] = (sum(first)/len(first))
                    if temp[0][l]['label'][0] == str(2):
                        second.append(temp[0][l]['score'])
                        avg_stars[2] = (sum(second)/len(second))
                    if temp[0][l]['label'][0] == str(3):
                        third.append(temp[0][l]['score'])
                        avg_stars[3] = (sum(third)/len(third))
                    if temp[0][l]['label'][0] == str(4):
                        fourth.append(temp[0][l]['score'])
                        avg_stars[4] = (sum(fourth)/len(fourth))
                    if temp[0][l]['label'][0] == str(5):
                        fifth.append(temp[0][l]['score'])
                        avg_stars[5] = (sum(fifth)/len(fifth)) 
                df.iloc[i, j] = max(avg_stars.items(), key=operator.itemgetter(1))[0]

In [280]:
#average scores of stars + argmax
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

star_average_val(sent_train_eur)

X_train, X_test, y_train, y_test = train_test_split(sent_train_eur.iloc[:, 0:60], 
                                                    sent_train_eur.iloc[:, 60],
                                                    test_size=0.3)

svm = SVC()
knn = KNeighborsClassifier(n_neighbors=2)
rtf = RandomForestClassifier()
etc = ExtraTreesClassifier()

svm_model = svm.fit(X_train, y_train)
knn_model = knn.fit(X_train, y_train)
etc_model = etc.fit(X_train, y_train)
rtf_model = rtf.fit(X_train, y_train)


100%|██████████| 1254/1254 [00:35<00:00, 35.17it/s]


In [281]:
from sklearn.metrics import accuracy_score
print('Accuracy score (SVM): ', accuracy_score(svm_model.predict(X_test), y_test))
print('Accuracy score (KNN): ', accuracy_score(knn_model.predict(X_test), y_test))
print('Accuracy score (ETC): ', accuracy_score(etc_model.predict(X_test), y_test))
print('Accuracy score (RTF): ', accuracy_score(rtf_model.predict(X_test), y_test))

Accuracy score (SVM):  0.5941644562334217
Accuracy score (KNN):  0.5517241379310345
Accuracy score (ETC):  0.6472148541114059
Accuracy score (RTF):  0.6578249336870027


In [282]:
#regression, sent-analysis, equity - best average star score
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(sent_train_eur.iloc[:, 0:60], 
                                                    sent_train_eur.iloc[:, 61],
                                                    test_size=0.3)

rtf = RandomForestRegressor(random_state=42)
etc = ExtraTreesRegressor(random_state=42)
lin = LinearRegression()

etc_model = etc.fit(X_train, y_train)
rtf_model = rtf.fit(X_train, y_train)
lin_model = lin.fit(X_train, y_train)

print('RMSE (ETC): ', (mean_squared_error(etc_model.predict(X_test), y_test))**0.5)
print('RMSE (RTF): ', (mean_squared_error(rtf_model.predict(X_test), y_test))**0.5)
print('RMSE (LIN): ', (mean_squared_error(lin_model.predict(X_test), y_test))**0.5)


RMSE (ETC):  0.260634214086759
RMSE (RTF):  0.26873053000494623
RMSE (LIN):  0.289233268254266
