In [501]:
import ast
import json
import os
import time
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
# Regressors
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from tpot import TPOTRegressor
# Classifiers
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from tpot import TPOTClassifier
# Sentiment analysis
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
# Tokenizer and classification weights are loaded from the same pretrained checkpoint
SENTIMENT_CHECKPOINT = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer1 = AutoTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)
model1 = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT)

# <font color ='red'> Creating data variables </font>

In [305]:
##### Reading in data #####
# Labelled training sets (they carry the regression and classification targets)
equity, volatility = (
    pd.read_json(train_file)
    for train_file in ('data/train/EURUSDV1M_1w.json', 'data/train/VIX_1w.json')
)
# Evaluation sets, delivered without targets
e, v = (
    pd.read_json(test_file)
    for test_file in ('data/test/EURUSDV1M_1w.json', 'data/test/VIX_1w.json')
)

#### Regression

In [104]:
# Equity: expand each row's price list into columns and keep the regression target as a one-column frame
stock_e = pd.DataFrame(equity['stock'].to_list())
target_e_r = equity['target_reg'].to_frame()
# X_train, X_test, y_train, y_test = train_test_split(stock_e, target_e_r)

In [11]:
# Volatility: expand each row's price list into columns and keep the regression target as a one-column frame
stock_v = pd.DataFrame(volatility['stock'].to_list())
target_v_r = volatility['target_reg'].to_frame()
# X_train, X_test, y_train, y_test = train_test_split(stock_v, target_v_r)

#### Classification

In [105]:
# Equity: classification target as a one-column frame, then a random train/test split
target_e_c = equity['target_classif'].to_frame()
X_train, X_test, y_train, y_test = train_test_split(stock_e, target_e_c)

In [13]:
# Volatility: classification target as a one-column frame
target_v_c = volatility['target_classif'].to_frame()
# X_train, X_test, y_train, y_test = train_test_split(stock_v, target_v_c)

# <font color = 'red'>Training models without central bank statements</font>
Since we have tabular data, we expect tree-based models to perform better than neural nets.

## <font color = 'blue'>Regression</font>

### Functions

In [160]:
def reg_predictor(data, model, area, path, name='pred_reg', response='n', random_state=None):
    '''
    This function fits a model to data, prints the RMSE on a hold-out split and offers the possibility
    to save the predictions for the evaluation data in a .txt file if one is content with that RMSE
    ----------
    Parameters:
    data takes a DataFrame with the features to predict from (column stock, one list of prices per row)
    and the target to predict (column target_reg)
    model is the model class used to make the predictions. It is instantiated without arguments and
    must have a fit() and predict() method
    area takes the data (with a stock column) that should be predicted from after the model has been trained
    path is the folder to save the predictions in; it is created if it does not exist yet
    name is the name of the txt file to which the predictions can be saved
    response decides whether the results should be saved (y) or not (n), the default is not
    random_state is forwarded to train_test_split; the default None keeps the split random,
    an int makes the split repeatable
    '''

    # Expand the per-row price lists into one column per list position
    stock = pd.DataFrame(data.stock.to_list())
    # Regression target as a one-column frame
    target = pd.DataFrame(data.target_reg)
    # Hold out part of the labelled data to estimate the error
    X_train, X_test, y_train, y_test = train_test_split(stock, target, random_state=random_state)
    # Instantiate the model and train it on the training split
    r = model()
    r_model = r.fit(X_train, y_train.values.ravel())
    # Print the RMSE of the predictions on the test split (scikit-learn metrics take y_true first)
    print('RMSE: ', (mean_squared_error(y_test, r_model.predict(X_test)))**0.5)
    # Only write the evaluation predictions when explicitly requested
    if response.lower() == 'y':
        # Predict from a frame laid out like the training features rather than from a raw nested list
        reg = list(r_model.predict(pd.DataFrame(area.stock.to_list())))
        # Make sure the target folder exists before opening the file
        os.makedirs(path, exist_ok=True)
        # One prediction per line
        with open(os.path.join(path, name + '.txt'), 'w') as f:
            f.write('\n'.join(map(str, reg)))

### Models

#### Our baseline: Linear regression

In [33]:
# Equity
reg_predictor(data=equity, model=LinearRegression, area=e, path='answer/EURUSDV1M_1w')
# Volatility
reg_predictor(data=volatility, model=LinearRegression, area=v, path='answer/VIX_1w')

RMSE:  0.3878019384368221
Do you want to save the result? Y/N 
n
RMSE:  0.315535854182696
Do you want to save the result? Y/N 
n


#### Extra Trees regression

In [51]:
# Equity
reg_predictor(data=equity, model=ExtraTreesRegressor, area=e, path='answer/EURUSDV1M_1w')
# Volatility
reg_predictor(data=volatility, model=ExtraTreesRegressor, area=v, path='answer/VIX_1w')

RMSE:  0.27804486839393255
Do you want to save the result? Y/N 
n
RMSE:  0.3768250477294624
Do you want to save the result? Y/N 
n


#### Random Forest regression

In [53]:
# Equity
reg_predictor(data=equity, model=RandomForestRegressor, area=e, path='answer/EURUSDV1M_1w')
# Volatility
reg_predictor(data=volatility, model=RandomForestRegressor, area=v, path='answer/VIX_1w')

RMSE:  0.3050366047300447
Do you want to save the result? Y/N 
n
RMSE:  0.35178457151690273
Do you want to save the result? Y/N 
n


#### HistGradBoost regression

In [58]:
# Equity
reg_predictor(data=equity, model=HistGradientBoostingRegressor, area=e, path='answer/EURUSDV1M_1w')
# Volatility
reg_predictor(data=volatility, model=HistGradientBoostingRegressor, area=v, path='answer/VIX_1w')

RMSE:  0.33969377039808407
Do you want to save the result? Y/N 
n
RMSE:  0.3749545613301794
Do you want to save the result? Y/N 
n


#### XGBoost regression

In [57]:
# Equity
reg_predictor(data=equity, model=xgb.XGBRegressor, area=e, path='answer/EURUSDV1M_1w')
# Volatility
reg_predictor(data=volatility, model=xgb.XGBRegressor, area=v, path='answer/VIX_1w')

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


RMSE:  0.3138804695299698
Do you want to save the result? Y/N 
n


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


RMSE:  0.3674240116676135
Do you want to save the result? Y/N 
n


#### LightGBModel regression

In [59]:
# Equity
reg_predictor(data=equity, model=lgb.LGBMRegressor, area=e, path='answer/EURUSDV1M_1w')
# Volatility
reg_predictor(data=volatility, model=lgb.LGBMRegressor, area=v, path='answer/VIX_1w')

RMSE:  0.3305745083812685
Do you want to save the result? Y/N 
n
RMSE:  0.33043460622666937
Do you want to save the result? Y/N 
n


## <font color = 'blue'>Classification</font>

### Functions

In [151]:
def classif_predictor(data, model, area, path, name='pred_classif', response='No', random_state=None):
    '''
    This function fits a model to data, prints the accuracy on a hold-out split and offers the possibility
    to save the predictions for the evaluation data in a .txt file if one is content with that accuracy
    ----------
    Parameters:
    data takes a DataFrame with the features to predict from (column stock, one list of prices per row)
    and the target to predict (column target_classif)
    model is the model class used to make the predictions. It is instantiated without arguments and
    must have a fit() and predict() method
    area takes the data (with a stock column) that should be predicted from after the model has been trained
    path is the folder to save the predictions in; it is created if it does not exist yet
    name is the name of the txt file to which the predictions can be saved
    response decides whether the results should be saved (y) or not (anything else), the default is not
    random_state is forwarded to train_test_split; the default None keeps the split random,
    an int makes the split repeatable
    '''

    # Expand the per-row price lists into one column per list position
    stock = pd.DataFrame(data.stock.to_list())
    # Classification target as a one-column frame
    target = pd.DataFrame(data.target_classif)
    # Hold out part of the labelled data to estimate the accuracy
    X_train, X_test, y_train, y_test = train_test_split(stock, target, random_state=random_state)
    # Instantiate the model and train it on the training split
    c = model()
    c_model = c.fit(X_train, y_train.values.ravel())
    # Print the accuracy of the predictions on the test split (scikit-learn metrics take y_true first)
    print('Accuracy: ', accuracy_score(y_test, c_model.predict(X_test)))
    # Only write the evaluation predictions when explicitly requested
    if response.lower() == 'y':
        # Predict from a frame laid out like the training features rather than from a raw nested list
        classif = list(c_model.predict(pd.DataFrame(area.stock.to_list())))
        # Make sure the target folder exists before opening the file
        os.makedirs(path, exist_ok=True)
        # One prediction per line
        with open(os.path.join(path, name + '.txt'), 'w') as f:
            f.write('\n'.join(map(str, classif)))

### Models

#### Our baseline: Logistic Regression

In [356]:
# Equity
classif_predictor(data=equity, model=LogisticRegression, area=e,
                  path='answer/EURUSDV1M_1w', name='pred_classif')
# Volatility
classif_predictor(data=volatility, model=LogisticRegression, area=v,
                  path='answer/VIX_1w', name='pred_classif')

Accuracy:  0.6910828025477707
Accuracy:  0.6369426751592356


#### Extra Trees classification

In [152]:
# Equity
classif_predictor(data=equity, model=ExtraTreesClassifier, area=e, path='answer/EURUSDV1M_1w')
# Volatility
classif_predictor(data=volatility, model=ExtraTreesClassifier, area=v, path='answer/VIX_1w')

Accuracy:  0.6815286624203821
Accuracy:  0.6847133757961783


#### Random Forest classification

In [154]:
# Equity
classif_predictor(data=equity, model=RandomForestClassifier, area=e, path='answer/EURUSDV1M_1w')
# Volatility
classif_predictor(data=volatility, model=RandomForestClassifier, area=v, path='answer/VIX_1w')

Accuracy:  0.6878980891719745
Accuracy:  0.7006369426751592


#### HistGradBoost classification

In [156]:
# Equity
classif_predictor(data=equity, model=HistGradientBoostingClassifier, area=e, path='answer/EURUSDV1M_1w')
# Volatility
classif_predictor(data=volatility, model=HistGradientBoostingClassifier, area=v, path='answer/VIX_1w')

Accuracy:  0.6592356687898089
Accuracy:  0.6815286624203821


#### XGBoost classification

In [157]:
# Equity
classif_predictor(data=equity, model=xgb.XGBClassifier, area=e, path='answer/EURUSDV1M_1w')
# Volatility
classif_predictor(data=volatility, model=xgb.XGBClassifier, area=v, path='answer/VIX_1w')

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Accuracy:  0.6464968152866242


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Accuracy:  0.6592356687898089


#### LightGBModel classification

In [158]:
# Equity
classif_predictor(data=equity, model=lgb.LGBMClassifier, area=e, path='answer/EURUSDV1M_1w')
# Volatility
classif_predictor(data=volatility, model=lgb.LGBMClassifier, area=v, path='answer/VIX_1w')

Accuracy:  0.6337579617834395
Accuracy:  0.6242038216560509


# <font color = 'red'>Training models with central bank statements</font>

## <font color = 'blue'>Sentiment analysis</font>

## Functions

In [221]:
def extract_speeches(data):
    '''
    Build a DataFrame that holds only the speeches, one row per row of `data`.
    ----------
    Parameters:
    data takes a DataFrame with a speech column. Every cell is a list of dicts, and every dict
    has an 'ECB' and a 'FED' entry holding a list of speeches

    Returns a DataFrame in which the frames built from the dicts of one row are placed side by side
    (so the column names ECB/FED repeat once per dict) and the rows are numbered 0..n-1.
    Empty speech lists show up as an empty string. `data` itself is left untouched.
    '''
    row_frames = []
    # Iterate over the cell values directly, so the result does not depend on how `data` is indexed
    for days in data['speech']:
        day_frames = []
        for day in days:
            # Work on a shallow copy so the caller's nested dicts are not modified
            filled = dict(day)
            # Replace empty lists by a single empty string, to allow easier processing later on
            for bank in ('ECB', 'FED'):
                if filled[bank] == []:
                    filled[bank] = ['']
            day_frames.append(pd.DataFrame(filled))
        row_frames.append(pd.concat(day_frames, axis=1))
    # Every per-row frame starts at index 0, so replace the index with a plain row counter
    speeches = pd.concat(row_frames).reset_index(drop=True)
    return speeches

In [502]:
# Sentiment pipeline built from the tokenizer and model loaded at the top of the notebook
nlp = pipeline(task="sentiment-analysis", tokenizer=tokenizer1, model=model1)

In [253]:
def sentiment_analysis(text):
    '''
    Split a speech into chunks and run the sentiment pipeline on them.

    The text is cut into sentences, consecutive sentences are packed into chunks of at most
    210 space-separated words, and every chunk from the first one that tokenizes to more than
    512 tokens onwards is dropped. The remaining chunks are passed to `nlp` in one call.

    Returns a one-element list containing the pipeline output for the kept chunks.
    '''
    # 210 words per chunk is a value found through experimenting: it keeps the number of tokens per
    # chunk of normal text below 512, the maximum the model can handle at any one time.
    max_chunk = 210

    # Mark every end of sentence with <eos> and split the text there
    for terminator in ('.', '?', '!'):
        text = text.replace(terminator + ' ', terminator + '<eos>')
    sentences = text.split('<eos>')

    # Pack the sentences into word lists: extend the last chunk while it stays within max_chunk words,
    # otherwise open a new chunk
    chunks = []
    for sentence in sentences:
        words = sentence.split(' ')
        if chunks and len(chunks[-1]) + len(words) <= max_chunk:
            chunks[-1].extend(words)
        else:
            chunks.append(words)

    # At the end of some speeches there are lists of references that the tokenizer splits into more
    # than 512 tokens. Chunks within the limit are turned back into text, the others are recorded.
    oversized = []
    for chunk_id, words in enumerate(chunks):
        chunk_text = ' '.join(words)
        if len(tokenizer1(chunk_text)['input_ids']) <= 512:
            chunks[chunk_id] = chunk_text
        else:
            print('Deleting from chunk: ' + str(chunk_id))  # Printing which chunk to delete
            oversized.append(chunk_id)

    # Drop the first oversized chunk and everything after it
    if oversized:
        del chunks[oversized[0]:]

    return [nlp(chunks)]

In [254]:
def create_sentiment_file(speeches, path, name):
    '''
    Analyze the sentiment of every speech and write the raw results to <path>/<name>.csv.
    ----------
    Parameters:
    speeches takes a DataFrame of speech texts; an empty string marks "no speech"
    path is the folder to save the csv file in; it is created up front if it does not exist yet
    name is the name of the csv file (without extension)

    Returns the results as a list of rows. Each entry is either the output of sentiment_analysis()
    or the placeholder 2.5 for an empty cell.
    '''
    # Create the folder before the long-running analysis, so a missing folder cannot cost the results
    if path:
        os.makedirs(path, exist_ok=True)

    sentiment = []
    for j in range(len(speeches)):
        start = time.time()
        # 2.5 is a placeholder for "no speech": it cannot be confused with a label produced by the analysis
        row_sentiment = [
            sentiment_analysis(speech) if speech != '' else 2.5
            for speech in speeches.iloc[j]
        ]
        # Report how long this row took
        print(f'Row {j}:', time.time() - start)
        sentiment.append(row_sentiment)
    # Write to disk
    pd.DataFrame(sentiment).to_csv(os.path.join(path, name + '.csv'))
    return sentiment

In [587]:
def read_sentiment_file(path):
    '''
    Load a sentiment csv file from `path`, drop its 'Unnamed: 0' column and reduce the cells
    with most_common_sentiment(), which modifies the frame in place. Returns the reduced DataFrame.
    '''
    raw = pd.read_csv(path)
    sentiment = raw.drop(columns='Unnamed: 0')
    # Acts directly on the sentiment DataFrame; nothing is returned
    most_common_sentiment(sentiment)
    return sentiment

In [666]:
# We use the most common sentiment level as the sentiment for the entire speech
def most_frequent(List):
    '''
    Return the element that occurs most often in List.
    Ties go to the element that appears first, so the result does not depend on set ordering.
    Raises ValueError for an empty List.
    '''
    if len(List) == 0:
        raise ValueError('most_frequent() arg is an empty sequence')
    return Counter(List).most_common(1)[0][0]

def most_common_sentiment(df):
    '''
    Replace every cell of df, in place, by a single sentiment level.
    ----------
    Parameters:
    df takes the DataFrame read from a sentiment csv file. A cell is either the placeholder 2.5
    (as text or as a number) or the text form of the nested list returned by sentiment_analysis()

    A placeholder cell becomes 3. Any other cell becomes the most frequent first character of the
    chunk labels, converted to int. Nothing is returned.
    '''
    for i in range(df.shape[0]):
        for j in range(df.shape[1]):
            cell = df.iloc[i, j]
            # A column that only holds placeholders is read back as float, so compare the text form
            if str(cell) == '2.5':
                # No speech for this cell: use the fixed level 3
                df.iloc[i, j] = 3
                continue
            # Evaluate the string to be able to treat it as a python data structure
            chunk_results = ast.literal_eval(str(cell))[0]
            # Collect the leading character of the label of EVERY chunk before counting
            labels = [result['label'][0] for result in chunk_results]
            # NOTE(review): a speech with no kept chunk is treated like a missing speech (3) - confirm
            df.iloc[i, j] = int(most_frequent(labels)) if labels else 3

### Application

In [667]:
####### Equity #######
# Folder in which the sentiment analysis files are stored
path = '/Users/charlesnicholas/Documents/Natixis/SentimentAnalysis'

## For the training data
# Extract the speeches from the data
speeches = extract_speeches(equity)
# File name of the stored sentiment analysis
name = 'train_equity_sentiment_analysis'
# Creating the analysis takes days to run, so the call stays commented out
#sentiments = create_sentiment_file(speeches, path, name)
# The analysis file has already been created, so read it from the disk and label its columns
sentiments = read_sentiment_file(f'{path}/{name}.csv').add_prefix('Speech_')
# Attach the sentiment columns and drop the raw speeches
s_equity = equity.join(sentiments).drop(columns='speech')

## For the test data
# Extract the speeches from the data
speeches = extract_speeches(e)
# File name of the stored sentiment analysis
name = 'equity_sentiment_analysis_true_test'
# Creating the analysis takes hours to run, so the call stays commented out
#sentiments = create_sentiment_file(speeches, path, name)
# The analysis file has already been created, so read it from the disk and label its columns
sentiments = read_sentiment_file(f'{path}/{name}.csv').add_prefix('Speech_')
# Attach the sentiment columns and drop the raw speeches
s_e = e.join(sentiments).drop(columns='speech')

In [668]:
####### Volatility #######
# Folder in which the sentiment analysis files are stored
path = '/Users/charlesnicholas/Documents/Natixis/SentimentAnalysis'

## For the training data
# Extract the speeches from the data
speeches = extract_speeches(volatility)
# File name of the stored sentiment analysis
name = 'train_volatility_sentiment_analysis'
# Creating the analysis takes days to run, so the call stays commented out
#sentiments = create_sentiment_file(speeches, path, name)
# The analysis file has already been created, so read it from the disk and label its columns
sentiments = read_sentiment_file(f'{path}/{name}.csv').add_prefix('Speech_')
# Attach the sentiment columns and drop the raw speeches
s_volatility = volatility.join(sentiments).drop(columns='speech')

## For the test data
# Extract the speeches from the data
speeches = extract_speeches(v)
# File name of the stored sentiment analysis
name = 'volatility_sentiment_analysis_true_test'
# Creating the analysis takes hours to run, so the call stays commented out
#sentiments = create_sentiment_file(speeches, path, name)
# The analysis file has already been created, so read it from the disk and label its columns
sentiments = read_sentiment_file(f'{path}/{name}.csv').add_prefix('Speech_')
# Attach the sentiment columns and drop the raw speeches
s_v = v.join(sentiments).drop(columns='speech')

## <font color = 'blue'>Model training</font>

### <font color = 'blue'>Regression</font>

### Functions

In [506]:
def s_reg_predictor(data, model, area, path, name='pred_reg', response='n', random_state=None):
    '''
    This function fits a model to the prices plus the speech sentiments, prints the RMSE on a hold-out
    split and offers the possibility to save the predictions for the evaluation data in a .txt file
    if one is content with that RMSE
    ----------
    Parameters:
    data takes a DataFrame with the columns stock, target_classif and target_reg; all remaining columns
    are used as features next to the expanded stock prices
    model is the model class used to make the predictions. It is instantiated without arguments and
    must have a fit() and predict() method
    area takes the data that should be predicted from after the model has been trained; it needs a
    stock column and the same remaining feature columns as data
    path is the folder to save the predictions in; it is created if it does not exist yet
    name is the name of the txt file to which the predictions can be saved
    response decides whether the results should be saved (y) or not (n), the default is not
    random_state is forwarded to train_test_split; the default None keeps the split random,
    an int makes the split repeatable
    '''

    # Expand the per-row price lists into Day_<position> columns
    stock = pd.DataFrame(data.stock.to_list()).add_prefix('Day_')
    # Everything that is neither the raw prices nor a target is a feature; add the expanded prices
    features = data.drop(columns=['stock', 'target_classif', 'target_reg']).join(stock)
    features = features.astype(float)
    # Regression target as a one-column frame
    target = pd.DataFrame(data.target_reg)
    # Hold out part of the labelled data to estimate the error
    X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=random_state)
    # Instantiate the model and train it on the training split
    r = model()
    r_model = r.fit(X_train, y_train.values.ravel())
    # Print the RMSE of the predictions on the test split (scikit-learn metrics take y_true first)
    print('RMSE: ', (mean_squared_error(y_test, r_model.predict(X_test)))**0.5)
    # Only write the evaluation predictions when explicitly requested
    if response.lower() == 'y':
        # Build the evaluation features the same way as the training features
        area_stock = pd.DataFrame(area.stock.to_list()).add_prefix('Day_')
        area_features = area.drop(columns=['stock']).join(area_stock)
        # Use exactly the training columns in the training order; a missing column raises a KeyError
        area_features = area_features[features.columns].astype(float)
        reg = list(r_model.predict(area_features))
        # Make sure the target folder exists before opening the file
        os.makedirs(path, exist_ok=True)
        # One prediction per line
        with open(os.path.join(path, name + '.txt'), 'w') as f:
            f.write('\n'.join(map(str, reg)))

### Models

#### Linear regression

In [636]:
# Equity
s_reg_predictor(data=s_equity, model=LinearRegression, area=s_e, path='answer/EURUSDV1M_1w')
# Volatility
s_reg_predictor(data=s_volatility, model=LinearRegression, area=s_v, path='answer/VIX_1w')

RMSE:  0.38228181923859755
RMSE:  0.4101866231697137


#### Extra Trees regression

In [637]:
# Equity
s_reg_predictor(data=s_equity, model=ExtraTreesRegressor, area=s_e, path='answer/EURUSDV1M_1w')
# Volatility
s_reg_predictor(data=s_volatility, model=ExtraTreesRegressor, area=s_v, path='answer/VIX_1w')

RMSE:  0.275192412660448
RMSE:  0.40707032756399897


#### Random Forest regression

In [638]:
# Equity
s_reg_predictor(data=s_equity, model=RandomForestRegressor, area=s_e, path='answer/EURUSDV1M_1w')
# Volatility
s_reg_predictor(data=s_volatility, model=RandomForestRegressor, area=s_v, path='answer/VIX_1w')

RMSE:  0.298707457793211
RMSE:  0.3431408115283878


#### HistGradBoost regression

In [639]:
# Equity
s_reg_predictor(data=s_equity, model=HistGradientBoostingRegressor, area=s_e, path='answer/EURUSDV1M_1w')
# Volatility
s_reg_predictor(data=s_volatility, model=HistGradientBoostingRegressor, area=s_v, path='answer/VIX_1w')

RMSE:  0.3297811314049579
RMSE:  0.3753017486714521


#### XGBoost regression

In [640]:
# Equity
s_reg_predictor(data=s_equity, model=xgb.XGBRegressor, area=s_e, path='answer/EURUSDV1M_1w')
# Volatility
s_reg_predictor(data=s_volatility, model=xgb.XGBRegressor, area=s_v, path='answer/VIX_1w')

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


RMSE:  0.3441181861640191
RMSE:  0.3508529388924772


#### LightGBModel regression

In [641]:
# Equity
s_reg_predictor(data=s_equity, model=lgb.LGBMRegressor, area=s_e, path='answer/EURUSDV1M_1w')
# Volatility
s_reg_predictor(data=s_volatility, model=lgb.LGBMRegressor, area=s_v, path='answer/VIX_1w')

RMSE:  0.36954371472767017
RMSE:  0.3833764604874184


## <font color = 'blue'>Classification</font>

### Functions

In [476]:
def s_classif_predictor(data, model, area, path, name='pred_classif', response='n', random_state=None):
    '''
    This function fits a model to the prices plus the speech sentiments, prints the accuracy on a
    hold-out split and offers the possibility to save the predictions for the evaluation data in a
    .txt file if one is content with that accuracy
    ----------
    Parameters:
    data takes a DataFrame with the columns stock, target_classif and target_reg; all remaining columns
    are used as features next to the expanded stock prices
    model is the model class used to make the predictions. It is instantiated without arguments and
    must have a fit(), predict() and get_params() method
    area takes the data that should be predicted from after the model has been trained; it needs a
    stock column and the same remaining feature columns as data
    path is the folder to save the predictions in; it is created if it does not exist yet
    name is the name of the txt file to which the predictions can be saved
    response decides whether the results should be saved (y) or not (n), the default is not
    random_state is forwarded to train_test_split; the default None keeps the split random,
    an int makes the split repeatable
    '''

    # Expand the per-row price lists into Day_<position> columns
    stock = pd.DataFrame(data.stock.to_list()).add_prefix('Day_')
    # Everything that is neither the raw prices nor a target is a feature; add the expanded prices
    features = data.drop(columns=['stock', 'target_classif', 'target_reg']).join(stock)
    features = features.astype(float)
    # Classification target as a one-column frame
    target = pd.DataFrame(data.target_classif)
    # Hold out part of the labelled data to estimate the accuracy
    X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=random_state)
    # Instantiate the model once; any model exposing a `solver` parameter is switched to liblinear,
    # which gave the best result for logistic regression
    c = model()
    if 'solver' in c.get_params():
        c.set_params(solver='liblinear')
    # Train the model on the training split
    c_model = c.fit(X_train, y_train.values.ravel())
    # Print the accuracy of the predictions on the test split (scikit-learn metrics take y_true first)
    print('Accuracy: ', accuracy_score(y_test, c_model.predict(X_test)))
    # Only write the evaluation predictions when explicitly requested
    if response.lower() == 'y':
        # Build the evaluation features the same way as the training features
        area_stock = pd.DataFrame(area.stock.to_list()).add_prefix('Day_')
        area_features = area.drop(columns=['stock']).join(area_stock)
        # Use exactly the training columns in the training order; a missing column raises a KeyError
        area_features = area_features[features.columns].astype(float)
        classif = list(c_model.predict(area_features))
        # Make sure the target folder exists before opening the file
        os.makedirs(path, exist_ok=True)
        # One prediction per line
        with open(os.path.join(path, name + '.txt'), 'w') as f:
            f.write('\n'.join(map(str, classif)))

### Models

#### Logistic regression (best for leaderboard)

In [642]:
# Equity
s_classif_predictor(data=s_equity, model=LogisticRegression, area=s_e, path='answer/EURUSDV1M_1w')
# Volatility
s_classif_predictor(data=s_volatility, model=LogisticRegression, area=s_v, path='answer/VIX_1w')

Accuracy:  0.6656050955414012
Accuracy:  0.5955414012738853


#### Extra Trees classification

In [643]:
# Equity
s_classif_predictor(s_equity, ExtraTreesClassifier, s_e, 'answer/EURUSDV1M_1w')
# Volatility (evaluation frame must be the sentiment-enriched s_v, not the raw v that still holds the speech column)
s_classif_predictor(s_volatility, ExtraTreesClassifier, s_v, 'answer/VIX_1w')

Accuracy:  0.697452229299363
Accuracy:  0.6656050955414012


#### Random Forest classification

In [644]:
# Equity
s_classif_predictor(data=s_equity, model=RandomForestClassifier, area=s_e, path='answer/EURUSDV1M_1w')
# Volatility
s_classif_predictor(data=s_volatility, model=RandomForestClassifier, area=s_v, path='answer/VIX_1w')

Accuracy:  0.6910828025477707
Accuracy:  0.7165605095541401


#### HistGradBoost classification

In [645]:
# Equity
s_classif_predictor(data=s_equity, model=HistGradientBoostingClassifier, area=s_e, path='answer/EURUSDV1M_1w')
# Volatility
s_classif_predictor(data=s_volatility, model=HistGradientBoostingClassifier, area=s_v, path='answer/VIX_1w')

Accuracy:  0.6528662420382165
Accuracy:  0.7133757961783439


#### XGBoost classification

In [646]:
# Equity
s_classif_predictor(data=s_equity, model=xgb.XGBClassifier, area=s_e, path='answer/EURUSDV1M_1w')
# Volatility
s_classif_predictor(data=s_volatility, model=xgb.XGBClassifier, area=s_v, path='answer/VIX_1w')

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Accuracy:  0.6751592356687898
Accuracy:  0.6751592356687898


#### LightGBModel classification

In [647]:
# Equity
s_classif_predictor(data=s_equity, model=lgb.LGBMClassifier, area=s_e, path='answer/EURUSDV1M_1w')
# Volatility
s_classif_predictor(data=s_volatility, model=lgb.LGBMClassifier, area=s_v, path='answer/VIX_1w')

Accuracy:  0.6878980891719745
Accuracy:  0.6815286624203821


# <font color = 'red'>Submissions</font>

## <font color = 'blue'>Regression</font>

## Using the training set with train_test_split

#### Extra Trees regression

In [507]:
# Equity
s_reg_predictor(data=s_equity, model=ExtraTreesRegressor, area=s_e, path='answer/EURUSDV1M_1w', response='y')
# Volatility
s_reg_predictor(data=s_volatility, model=ExtraTreesRegressor, area=s_v, path='answer/VIX_1w', response='y')

RMSE:  0.27040698623802467
RMSE:  0.39239190744334584


#### HistGradBoost regression

In [537]:
# Equity
s_reg_predictor(data=s_equity, model=HistGradientBoostingRegressor, area=s_e, path='answer/EURUSDV1M_1w', response='y')
# Volatility
s_reg_predictor(data=s_volatility, model=HistGradientBoostingRegressor, area=s_v, path='answer/VIX_1w', response='y')

RMSE:  0.3097169805320025
RMSE:  0.38427795675241827


#### LightGBModel regression

In [626]:
# Equity
s_reg_predictor(data=s_equity, model=lgb.LGBMRegressor, area=s_e, path='answer/EURUSDV1M_1w', response='y')
# Volatility
s_reg_predictor(data=s_volatility, model=lgb.LGBMRegressor, area=s_v, path='answer/VIX_1w', response='y')

RMSE:  0.30504865016237187
RMSE:  0.3742112881704846


#### XGBoost regression

In [651]:
# Equity
s_reg_predictor(data=s_equity, model=xgb.XGBRegressor, area=s_e, path='answer/EURUSDV1M_1w', response='y')
# Volatility
s_reg_predictor(data=s_volatility, model=xgb.XGBRegressor, area=s_v, path='answer/VIX_1w', response='y')

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


RMSE:  0.3676016769676427
RMSE:  0.3472002394613814


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


#### Linear regression

In [669]:
# Equity
s_reg_predictor(data=s_equity, model=LinearRegression, area=s_e, path='answer/EURUSDV1M_1w', response='y')
# Volatility
s_reg_predictor(data=s_volatility, model=LinearRegression, area=s_v, path='answer/VIX_1w', response='y')

RMSE:  0.3239155089816116
RMSE:  0.3952095009206132


#### Just taking the last element of the stock price series (best for leader board :))

In [697]:
# Equity: the prediction is simply the last value of each price series, one per line
last_entry = list(e['stock'].apply(lambda series: series[-1]))
with open('/Users/charlesnicholas/Documents/Natixis/starting_kit_final/answer/EURUSDV1M_1w/pred_reg.txt', 'w') as f:
    f.write('\n'.join(str(value) for value in last_entry))
# Volatility: same rule, written to the VIX answer folder
last_entry = list(v['stock'].apply(lambda series: series[-1]))
with open('/Users/charlesnicholas/Documents/Natixis/starting_kit_final/answer/VIX_1w/pred_reg.txt', 'w') as f:
    f.write('\n'.join(str(value) for value in last_entry))

## Using the entire training set to train

### Function

In [508]:
def s_reg_predictor_all(data, model, area, path, name='pred_reg', response='n'):
    '''
    This function fits a regression model on the whole of data (no train/test split, so no RMSE
    is reported) and offers the possibility to save its predictions for area in a .txt file
    ----------
    Parameters:
    data takes a DataFrame with a 'stock' column of price sequences, the 'target_classif' and
        'target_reg' columns, and any further feature columns (they must be castable to float)
    model is the model used to make the predictions. It is called without arguments and the
        result must have a fit() and predict() method
    area takes the data that should be predicted from after the model has been trained. It needs
        a 'stock' column and every feature column of data; other columns are ignored
    path is the folder to save the predictions in (it is created if it does not exist)
    name is the name of the txt file to which the predictions can be saved
    response decides whether the results should be saved (y) or not (n), the default is not
    ----------
    Raises:
    KeyError if area lacks a feature column the model was trained on
    '''

    def _expand(frame, non_features):
        # Renumber the rows first: the expanded price columns get a fresh 0..n-1 index, so
        # joining them onto a frame with any other index would silently produce NaN rows
        frame = frame.reset_index(drop=True)
        prices = pd.DataFrame(frame['stock'].to_list()).add_prefix('Day_')
        return frame.drop(columns=non_features).join(prices)

    # Features = every non-target column plus one 'Day_<i>' column per price in the series
    features = _expand(data, ['stock', 'target_classif', 'target_reg']).astype(float)
    # Regression target as a flat array, which is the shape fit() is given
    target = data['target_reg'].to_numpy().ravel()
    # Instantiate the model and train it on the full training data
    r_model = model().fit(features, target)

    # Nothing more to do unless the caller asked for the predictions to be saved
    if response.lower() != 'y':
        return

    area_features = _expand(area, ['stock'])
    missing = [col for col in features.columns if col not in area_features.columns]
    if missing:
        raise KeyError(f'area is missing feature columns used for training: {missing}')
    # Present the columns in exactly the order the model was fitted on
    area_features = area_features[features.columns].astype(float)
    reg = list(r_model.predict(area_features))

    # Write one prediction per line to <path>/<name>.txt
    if path:
        os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, name + '.txt'), 'w') as f:
        f.write('\n'.join(map(str, reg)))

### Models

#### Linear regression

In [529]:
# Train linear regression on each full training set (equity, then volatility) and,
# because response='y', write the predictions for the matching evaluation data
for train_frame, eval_frame, out_dir in (
    (s_equity, s_e, 'answer/EURUSDV1M_1w'),
    (s_volatility, s_v, 'answer/VIX_1w'),
):
    s_reg_predictor_all(train_frame, LinearRegression, eval_frame, out_dir, response='y')

## <font color = 'blue'>Classification</font>

## Using the training set with train_test_split

#### Logistic regression

In [670]:
# Same logistic-regression call for both datasets: equity first, then volatility
for train_frame, eval_frame, out_dir in (
    (s_equity, s_e, 'answer/EURUSDV1M_1w'),
    (s_volatility, s_v, 'answer/VIX_1w'),
):
    s_classif_predictor(train_frame, LogisticRegression, eval_frame, out_dir, response='y')

Accuracy:  0.6719745222929936
Accuracy:  0.6050955414012739


#### Extra Trees classifier

In [531]:
# Same Extra Trees call for both datasets: equity first, then volatility
for train_frame, eval_frame, out_dir in (
    (s_equity, s_e, 'answer/EURUSDV1M_1w'),
    (s_volatility, s_v, 'answer/VIX_1w'),
):
    s_classif_predictor(train_frame, ExtraTreesClassifier, eval_frame, out_dir, response='y')

Accuracy:  0.6592356687898089
Accuracy:  0.6528662420382165


## Using the entire training set to train

### Function

In [527]:
def s_classif_predictor_all(data, model, area, path, name='pred_classif', response='n'):
    '''
    This function fits a classifier on the whole of data (no train/test split, so no accuracy
    is reported) and offers the possibility to save its predictions for area in a .txt file
    ----------
    Parameters:
    data takes a DataFrame with a 'stock' column of price sequences, the 'target_classif' and
        'target_reg' columns, and any further feature columns (they must be castable to float)
    model is the model used to make the predictions. It is called without arguments and the
        result must have a fit() and predict() method. If it also exposes get_params() and that
        lists a 'solver' parameter, the model is built with solver='liblinear' instead
    area takes the data that should be predicted from after the model has been trained. It needs
        a 'stock' column and every feature column of data; other columns are ignored
    path is the folder to save the predictions in (it is created if it does not exist)
    name is the name of the txt file to which the predictions can be saved
    response decides whether the results should be saved (y) or not (n), the default is not
    ----------
    Raises:
    KeyError if area lacks a feature column the model was trained on
    '''

    def _expand(frame, non_features):
        # Renumber the rows first: the expanded price columns get a fresh 0..n-1 index, so
        # joining them onto a frame with any other index would silently produce NaN rows
        frame = frame.reset_index(drop=True)
        prices = pd.DataFrame(frame['stock'].to_list()).add_prefix('Day_')
        return frame.drop(columns=non_features).join(prices)

    # Features = every non-target column plus one 'Day_<i>' column per price in the series
    features = _expand(data, ['stock', 'target_classif', 'target_reg']).astype(float)
    # Classification target as a flat array, which is the shape fit() is given
    target = data['target_classif'].to_numpy().ravel()

    # Instantiate the model; models without get_params() are simply used as they are
    c = model()
    params = c.get_params() if hasattr(c, 'get_params') else {}
    if 'solver' in params:
        # liblinear gave the best result for logistic regression.
        # NOTE(review): any other estimator with a 'solver' parameter is also forced to
        # 'liblinear' here, which it may not accept - confirm before passing such a model.
        c = model(solver='liblinear')
    # Train the model on the full training data
    c_model = c.fit(features, target)

    # Nothing more to do unless the caller asked for the predictions to be saved
    if response.lower() != 'y':
        return

    area_features = _expand(area, ['stock'])
    missing = [col for col in features.columns if col not in area_features.columns]
    if missing:
        raise KeyError(f'area is missing feature columns used for training: {missing}')
    # Present the columns in exactly the order the model was fitted on
    area_features = area_features[features.columns].astype(float)
    classif = list(c_model.predict(area_features))

    # Write one prediction per line to <path>/<name>.txt
    if path:
        os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, name + '.txt'), 'w') as f:
        f.write('\n'.join(map(str, classif)))

### Models

#### Random Forest classification

In [530]:
# Train a random forest on each full training set (equity, then volatility) and,
# because response='y', write the predictions for the matching evaluation data
for train_frame, eval_frame, out_dir in (
    (s_equity, s_e, 'answer/EURUSDV1M_1w'),
    (s_volatility, s_v, 'answer/VIX_1w'),
):
    s_classif_predictor_all(train_frame, RandomForestClassifier, eval_frame, out_dir, response='y')

In [None]:
# Using the entire dataset to train produces worse results