In [5]:
import pandas as pd
import numpy as np
import os
import json
import tensorflow as tf
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from textblob import TextBlob

In [11]:
train_eur = pd.read_json('../data/train/EURUSDV1M_1w.json')
train_vix = pd.read_json('../data/train/VIX_1w.json')

In [12]:
def split_df(df):
  stock = pd.DataFrame(df['stock'].to_list(), index=df.index)
  
  df1 = df.drop(labels='stock', axis=1)
  df1 = pd.concat([stock, df1], axis=1)

  speeches = pd.DataFrame(df.iloc[:, 0].tolist(), index=df.index)
  appended_data = []
  for i in range(0, 20):
    x = pd.DataFrame(speeches.iloc[:, i].tolist(), index=speeches.index)
    appended_data.append(x)

  appended_data = pd.concat(appended_data, axis=1)
  df1 = df1.drop(labels='speech', axis=1)
  final = pd.concat([appended_data, df1], axis=1)
  return final


In [13]:
from transformers import pipeline
summarizer = pipeline('summarization')

No model was supplied, defaulted to t5-small (https://huggingface.co/t5-small)
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [14]:
#attempt 1: textblob
split_train_eur = split_df(train_eur)
split_train_vix = split_df(train_vix)

for i in tqdm(range(split_train_eur.shape[0])):
    for j in range(40):
        split_train_eur.iloc[i,j] = TextBlob(str(split_train_eur.iloc[i,j])).sentiment
        split_train_eur.iloc[i,j] = split_train_eur.iloc[i,j][0]

100%|██████████| 1254/1254 [05:46<00:00,  3.61it/s]


In [18]:
#classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(split_train_eur.iloc[:, 0:60], 
                                                    split_train_eur.iloc[:, 60],
                                                    test_size=0.3)

svm = SVC()
knn = KNeighborsClassifier(n_neighbors=7)
lgbm = lgb.LGBMClassifier()
xgbm = xgb.XGBClassifier()
rtf = RandomForestClassifier()
etc = ExtraTreesClassifier()

svm_model = svm.fit(X_train, y_train)
knn_model = knn.fit(X_train, y_train)
etc_model = etc.fit(X_train, y_train)
rtf_model = rtf.fit(X_train, y_train)




In [19]:
from sklearn.metrics import accuracy_score
print('Accuracy score (SVM): ', accuracy_score(svm_model.predict(X_test), y_test))
print('Accuracy score (KNN): ', accuracy_score(knn_model.predict(X_test), y_test))
print('Accuracy score (ETC): ', accuracy_score(etc_model.predict(X_test), y_test))
print('Accuracy score (RTF): ', accuracy_score(rtf_model.predict(X_test), y_test))

Accuracy score (SVM):  0.6472148541114059
Accuracy score (KNN):  0.6684350132625995
Accuracy score (ETC):  0.6472148541114059
Accuracy score (RTF):  0.6657824933687002


In [17]:
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(split_train_eur.iloc[:, 0:60], 
                                                    split_train_eur.iloc[:, 61],
                                                    test_size=0.3)

rtf = RandomForestRegressor(random_state=42)
etc = ExtraTreesRegressor(random_state=42)
lin = LinearRegression()

etc_model = etc.fit(X_train, y_train)
rtf_model = rtf.fit(X_train, y_train)
lin_model = lin.fit(X_train, y_train)

print('RMSE (ETC): ', (mean_squared_error(etc_model.predict(X_test), y_test))**0.5)
print('RMSE (RTF): ', (mean_squared_error(rtf_model.predict(X_test), y_test))**0.5)
print('RMSE (LIN): ', (mean_squared_error(lin_model.predict(X_test), y_test))**0.5)


Accuracy score (ETC):  0.3135358522953706
Accuracy score (RTF):  0.31767498477782424
Accuracy score (LIN):  0.3797842171784644


In [23]:
#attempt 2: FinBERT
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

sentences = ["there is a shortage of capital, and we need extra financing",  
             "growth is strong and we have plenty of liquidity", 
             "there are doubts about our finances", 
             "profits are flat"]
results = nlp(sentences)
print(results)

In [99]:
split_train_eur = split_df(train_eur)
split_train_vix = split_df(train_vix)

max_chunk = 250
cell = []
for i in tqdm(range(split_train_eur.shape[0])):
    for j in range(split_train_eur.shape[1]):
        if split_train_eur.iloc[i,j] != []:
            cell = split_train_eur.iloc[i,j][0]
            cell = cell.replace('.', '.<eos>')
            cell = cell.replace('?', '?<eos>')
            cell = cell.replace('!', '!<eos>')
            sentences = cell.split('<eos>')
            current_chunk = 0 
            chunks = []
            for sentence in sentences:
                if len(chunks) == current_chunk + 1: 
                    if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                        chunks[current_chunk].extend(sentence.split(' '))
                    else:
                        current_chunk += 1
                        chunks.append(sentence.split(' '))
                else:
                    chunks.append(sentence.split(' '))

            for chunk_id in range(len(chunks)):
                chunks[chunk_id] = ' '.join(chunks[chunk_id])
            res = nlp(chunks)
            split_train_eur.iloc[i,j] = ' '.join([summ['label'] for summ in res])
        else:
            None


In [6]:
#attempt 3: bert base multilingual
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [7]:
split_train_eur = split_df(train_eur)
split_train_vix = split_df(train_vix)


max_chunk = 250
cell = []
for i in tqdm(range(split_train_eur.shape[0])):
    for j in range(split_train_eur.shape[1]):
        if split_train_eur.iloc[i,j] != []:
            cell = split_train_eur.iloc[i,j][0]
            cell = cell.replace('.', '.<eos>')
            cell = cell.replace('?', '?<eos>')
            cell = cell.replace('!', '!<eos>')
            sentences = cell.split('<eos>')
            current_chunk = 0 
            chunks = []
            for sentence in sentences:
                if len(chunks) == current_chunk + 1: 
                    if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                        chunks[current_chunk].extend(sentence.split(' '))
                    else:
                        current_chunk += 1
                        chunks.append(sentence.split(' '))
                else:
                    chunks.append(sentence.split(' '))

            for chunk_id in range(len(chunks)):
                chunks[chunk_id] = ' '.join(chunks[chunk_id])
            res = nlp(chunks)
            split_train_eur.iloc[i,j] = ' '.join([summ['label'] for summ in res])
        else:
            None


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,target_classif,target_reg
0,0.401754,0.317894,0.331283,0.269974,0.329169,0.472930,0.511689,0.394002,0.408096,0.560314,...,0.740719,0.502527,0.392593,0.460950,0.463064,0.396116,0.396821,0.299571,0,0.402459
1,0.215006,0.186818,0.296752,0.275611,0.268564,0.269974,0.304504,0.197389,0.152287,0.147354,...,0.327760,0.248832,0.246718,0.425009,0.501118,0.544810,0.718168,1.009918,1,1.661069
2,-0.355103,-0.422050,-0.485474,-0.548193,-0.501683,-0.528461,-0.511548,-0.541146,-0.615845,-0.562287,...,-0.400204,-0.111979,-0.092247,0.160039,-0.159194,-0.015433,-0.045031,-0.077448,1,0.635013
3,0.466587,0.573703,0.774545,0.843606,0.845016,0.876023,0.805552,0.715349,0.670248,0.781592,...,0.912668,0.940856,0.755518,0.723101,0.768203,0.842197,0.739310,0.620214,0,0.437694
4,1.299553,1.377776,1.305895,1.083207,0.709007,0.967635,0.814713,0.829512,0.668839,0.616690,...,1.238948,1.260794,1.046563,0.815418,1.028945,1.026831,1.174115,1.225559,0,0.471520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1249,0.458835,0.572998,0.552562,0.479977,0.417962,0.396116,0.489843,0.564542,0.570179,0.730853,...,1.059247,0.899983,0.919715,0.961998,1.039515,1.002871,1.000052,0.867567,1,1.039515
1250,1.172001,1.281935,1.740701,1.790030,1.662478,1.606806,1.511670,1.653317,1.783688,1.872481,...,1.816809,1.639223,1.674458,1.644860,1.541268,1.325627,1.241062,1.287573,0,1.393984
1251,0.388646,0.351720,0.344673,0.260108,0.372861,0.357076,0.386955,0.365814,0.302390,0.443332,...,0.621482,0.605415,0.773277,0.994696,0.973273,1.019502,1.098711,0.931357,1,1.500254
1252,0.987367,1.025421,1.135356,1.035287,1.021193,0.984548,0.784411,0.819646,0.784411,0.715349,...,0.739310,0.791458,0.701255,0.705484,0.761860,0.685752,0.680114,0.516622,0,0.459540


In [229]:
# attempt 2: sentiment analysis
sent_train_eur = pd.read_csv('../sentiment__equity_train.csv')
sent_train_eur = sent_train_eur.drop(columns='Unnamed: 0', axis=1)
sent_train_eur = pd.concat([sent_train_eur, split_train_eur.iloc[:, 41:63]], axis=1)

In [85]:
import ast
test = ast.literal_eval(sent_train_eur.iloc[1, 4]))

In [141]:
def most_frequent(List):
    return max(set(List), key = List.count)

def most_common_star(df):
    for i in tqdm(range(0, df.shape[0])):
        for j in range (0, df.shape[1] - 21):
            if  df.iloc[i, j] == '2.5':
                None
            else:
                temp = ast.literal_eval(str(df.iloc[i, j]))
                for l in range(0, len(temp[0])):
                    lst = []
                    lst.append(temp[0][l]['label'][0])
                df.iloc[i, j] = int(most_frequent(lst))


3