In [1]:
import pandas as pd
import numpy as np
import os
import json
import tensorflow as tf
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from textblob import TextBlob

In [2]:
# Read the two weekly training files into DataFrames, one per instrument.
train_eur, train_vix = (
    pd.read_json(json_path)
    for json_path in (
        '../input/train-natixis/EURUSDV1M_1w.json',
        '../input/train-natixis/VIX_1w.json',
    )
)

In [3]:
def split_df(df):
  """Flatten the list-valued 'speech' and 'stock' columns into a wide frame.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a 'stock' column and a 'speech' column whose cells are
      lists. Each entry of a 'speech' list is expanded a second time, so it
      is assumed to be list-like as well (TODO confirm against the raw data).

  Returns
  -------
  pandas.DataFrame
      Columns, left to right: the expanded speech entries, the expanded
      stock values, then every remaining column of `df` in original order.
      The expanded columns carry default integer labels that repeat, so
      downstream code should select them by position (`.iloc`).

  Raises
  ------
  KeyError
      If `df` has no 'stock' or 'speech' column.
  """
  # One column per element of each row's 'stock' list.
  stock = pd.DataFrame(df['stock'].to_list(), index=df.index)

  # First expansion: one column per speech entry. Selecting by label keeps
  # this consistent with the label-based drop below, instead of assuming
  # that 'speech' happens to be the first column.
  speeches = pd.DataFrame(df['speech'].to_list(), index=df.index)

  # Second expansion: each entry is itself a list. Iterate over however many
  # entries the data actually has rather than a fixed count of 20.
  speech_parts = [
    pd.DataFrame(entries.to_list(), index=speeches.index)
    for _, entries in speeches.items()
  ]

  remaining = df.drop(labels=['stock', 'speech'], axis=1)
  return pd.concat(speech_parts + [stock, remaining], axis=1)


In [4]:
from transformers import pipeline
# Default summarization pipeline (the checkpoint is chosen by the library).
summarizer = pipeline(task='summarization')

In [5]:
#attempt 1: textblob
# Rebuild the wide frames from the raw data, then overwrite each of the first
# 40 columns of the EUR frame with a TextBlob polarity score.
split_train_eur = split_df(train_eur)
split_train_vix = split_df(train_vix)

# NOTE(review): 40 is presumably the number of speech-derived columns that
# split_df produces for this data set -- confirm if the input layout changes.
for i in tqdm(range(split_train_eur.shape[0])):
    for j in range(40):
        # `.sentiment` is a (polarity, subjectivity) pair; store the polarity
        # float directly instead of writing the whole tuple into the cell and
        # reading it back. str() mirrors the original input to TextBlob.
        split_train_eur.iloc[i,j] = TextBlob(str(split_train_eur.iloc[i,j])).sentiment.polarity

In [8]:
#classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

# Features are the first 60 columns of the wide frame; the class label is the
# column at position 60. The split is seeded so the scores printed by the
# evaluation cell are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(split_train_eur.iloc[:, 0:60], 
                                                    split_train_eur.iloc[:, 60],
                                                    test_size=0.3,
                                                    random_state=42)

svm = SVC()
knn = KNeighborsClassifier(n_neighbors=7)
# The two boosted models are instantiated but not fitted in this cell.
lgbm = lgb.LGBMClassifier()
xgbm = xgb.XGBClassifier()
# Tree ensembles are randomised; fix their seed as the regressors further
# down already do.
rtf = RandomForestClassifier(random_state=42)
etc = ExtraTreesClassifier(random_state=42)

# scikit-learn's fit() returns the estimator itself, so each *_model name
# refers to the same (now fitted) object as the variable above.
svm_model = svm.fit(X_train, y_train)
knn_model = knn.fit(X_train, y_train)
etc_model = etc.fit(X_train, y_train)
rtf_model = rtf.fit(X_train, y_train)


In [9]:
from sklearn.metrics import accuracy_score
# Held-out accuracy of each fitted classifier, one line per model.
for caption, fitted in (
    ('Accuracy score (SVM): ', svm_model),
    ('Accuracy score (KNN): ', knn_model),
    ('Accuracy score (ETC): ', etc_model),
    ('Accuracy score (RTF): ', rtf_model),
):
    print(caption, accuracy_score(fitted.predict(X_test), y_test))

In [22]:
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_squared_error

# Same 60 feature columns as the classification cell; the regression target
# is the column at position 61. The split is seeded to match the seeded
# regressors, so the reported errors are reproducible.
X_train, X_test, y_train, y_test = train_test_split(split_train_eur.iloc[:, 0:60], 
                                                    split_train_eur.iloc[:, 61],
                                                    test_size=0.3,
                                                    random_state=42)

rtf = RandomForestRegressor(random_state=42)
etc = ExtraTreesRegressor(random_state=42)
lin = LinearRegression()

# Note: etc_model / rtf_model now refer to regressors, replacing the
# classifiers bound to the same names by the classification cell.
etc_model = etc.fit(X_train, y_train)
rtf_model = rtf.fit(X_train, y_train)
lin_model = lin.fit(X_train, y_train)

# The printed value is the square root of the mean squared error (RMSE), not
# an accuracy, so label it accordingly. Arguments follow the documented
# (y_true, y_pred) order.
print('RMSE (ETC): ', (mean_squared_error(y_test, etc_model.predict(X_test)))**0.5)
print('RMSE (RTF): ', (mean_squared_error(y_test, rtf_model.predict(X_test)))**0.5)
print('RMSE (LIN): ', (mean_squared_error(y_test, lin_model.predict(X_test)))**0.5)


In [23]:
#attempt 2: FinBERT
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the FinBERT tone checkpoint (tokenizer plus a 3-label classification
# model) and wrap both in a sentiment-analysis pipeline.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
finbert = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-tone',
    num_labels=3,
)

nlp = pipeline(task="sentiment-analysis", model=finbert, tokenizer=tokenizer)

# Quick smoke test on four short example sentences.
sentences = [
    "there is a shortage of capital, and we need extra financing",
    "growth is strong and we have plenty of liquidity",
    "there are doubts about our finances",
    "profits are flat",
]
results = nlp(sentences)
print(results)

In [99]:
# Rebuild the wide frames from the raw data so this cell starts from the
# original text rather than from the output of an earlier attempt.
split_train_eur = split_df(train_eur)
split_train_vix = split_df(train_vix)

# Upper bound on the number of space-separated words per chunk sent to `nlp`.
# NOTE(review): this counts words, not model tokens, and a single sentence
# longer than the bound still becomes one chunk -- confirm the model's input
# limit is respected.
max_chunk = 250
for i in tqdm(range(split_train_eur.shape[0])):
    for j in range(split_train_eur.shape[1]):
        cell = split_train_eur.iloc[i, j]
        # Only non-empty lists carry text. The loop also visits the stock and
        # remaining columns appended by split_df; comparing such values with
        # `!= []` is unreliable (for NumPy scalars it yields an empty array
        # with no usable truth value) and indexing them with [0] would fail,
        # so every non-list cell is skipped explicitly.
        if not (isinstance(cell, list) and cell):
            continue

        # Only the first element of the list is analysed.
        text = cell[0]
        # Mark sentence boundaries after '.', '?' and '!' and split on them.
        for mark in ('.', '?', '!'):
            text = text.replace(mark, mark + '<eos>')
        sentences = text.split('<eos>')

        # Greedily pack whole sentences into chunks of at most `max_chunk`
        # words; a sentence that does not fit starts a new chunk.
        chunks = []
        for sentence in sentences:
            words = sentence.split(' ')
            if chunks and len(chunks[-1]) + len(words) <= max_chunk:
                chunks[-1].extend(words)
            else:
                chunks.append(words)

        # Classify each chunk and store the labels as one space-joined string.
        res = nlp([' '.join(chunk_words) for chunk_words in chunks])
        split_train_eur.iloc[i, j] = ' '.join([summ['label'] for summ in res])


In [6]:
#attempt 3: bert base multilingual
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the multilingual review-sentiment checkpoint and rebind `nlp` to a
# pipeline built on it (this replaces any pipeline previously bound to `nlp`).
model = AutoModelForSequenceClassification.from_pretrained(
    "nlptown/bert-base-multilingual-uncased-sentiment"
)
tokenizer = AutoTokenizer.from_pretrained(
    "nlptown/bert-base-multilingual-uncased-sentiment"
)

nlp = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

In [7]:
# Rebuild the wide frames from the raw data so this cell starts from the
# original text rather than from the output of an earlier attempt.
split_train_eur = split_df(train_eur)
split_train_vix = split_df(train_vix)


# Upper bound on the number of space-separated words per chunk sent to `nlp`.
# NOTE(review): this counts words, not model tokens, and a single sentence
# longer than the bound still becomes one chunk -- confirm the model's input
# limit is respected.
max_chunk = 250
for i in tqdm(range(split_train_eur.shape[0])):
    for j in range(split_train_eur.shape[1]):
        cell = split_train_eur.iloc[i, j]
        # Only non-empty lists carry text. The loop also visits the stock and
        # remaining columns appended by split_df; comparing such values with
        # `!= []` is unreliable (for NumPy scalars it yields an empty array
        # with no usable truth value) and indexing them with [0] would fail,
        # so every non-list cell is skipped explicitly.
        if not (isinstance(cell, list) and cell):
            continue

        # Only the first element of the list is analysed.
        text = cell[0]
        # Mark sentence boundaries after '.', '?' and '!' and split on them.
        for mark in ('.', '?', '!'):
            text = text.replace(mark, mark + '<eos>')
        sentences = text.split('<eos>')

        # Greedily pack whole sentences into chunks of at most `max_chunk`
        # words; a sentence that does not fit starts a new chunk.
        chunks = []
        for sentence in sentences:
            words = sentence.split(' ')
            if chunks and len(chunks[-1]) + len(words) <= max_chunk:
                chunks[-1].extend(words)
            else:
                chunks.append(words)

        # Classify each chunk and store the labels as one space-joined string.
        res = nlp([' '.join(chunk_words) for chunk_words in chunks])
        split_train_eur.iloc[i, j] = ' '.join([summ['label'] for summ in res])
