# Machine Learning Module

This module was created to find the most accurate machine learning model that allows to predict the direction of the movement of any financial asset in the next minute, using as input intraday data of the last 10 days. 

The module is composed by two sections.

* **Feature selection**
    
    This process is is made using Lasso regularization
    
    

* **Fitting Machine Learning Models**
    
    Among the models used in this section will have
    
    
    * Logistic Regression Classifier
    * XGBoost Classifier
    * Decission Tree Classifier
    * Support Vector Machine

First, we need to install and import the libraries

In [None]:
#pip install xgboost

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import datetime
import xgboost as xgb

from xgboost import XGBClassifier
from datetime import datetime

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

### Reading data and split the dataset in train and test

In [2]:
path = 'C:/Users/abrah/Desktop/DISSERTATION/Data/'
total_results = pd.DataFrame()

In [63]:
#symbol = "EWW" #ETF replicating MEXICO IPC
#symbol = "AMXL.MX" #AMÉRICA MÓVIL MÉXICO
#symbol = "WALMEX.MX" #WALMART DE MEXICO
symbol = "GFNORTEO.MX" #GRUPO FINANCIERO BANORTE MÉXICO

In [64]:
data_for_model = pd.read_csv(path + symbol+'_last_month_with_ti.csv')

### Let's split data into training and test sets

In [65]:
X = data_for_model[['Close','Volume','Force Index','Money Flow Index','SMA 14','EMA 14','ADX',
                    'CCI','Stochastic Oscilator','PPO','ROC','ATR']]
X = StandardScaler().fit_transform(X)

#X = data_for_model[['Close','Volume','Force Index','Money Flow Index','SMA 14','EMA 14','ADX']]

Y = data_for_model['Movement to Predict'].to_numpy(dtype='int')

In [66]:
#max(accuracies)

In [67]:
# split data into train and test sets
seed_split = 25
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed_split)

In [68]:
sum(y_test)/len(y_test)

0.4247787610619469

In [69]:
X_train.shape, X_test.shape

((5876, 12), (1469, 12))

## Feature Selection

### Possible options for feature selection

In [70]:
#estimator = LogisticRegression(random_state=50,max_iter=100000)
#selector = RFE(estimator, n_features_to_select=7, step=1)
#selector.fit(X_train, y_train)
#selector.get_support()
#X_train = pd.DataFrame(X_train)

In [71]:
#estimator = RandomForestRegressor(n_estimators=100, max_depth=10)
#selector = RFE(estimator, n_features_to_select=7, step=1)
#selector.fit(X_train, y_train)
#selector.get_support()
#X_train = pd.DataFrame(X_train)

In [72]:
#estimator = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train)
#selector = SelectFromModel(estimator, prefit=True)
#X_train = pd.DataFrame(X_train)

In [73]:
#estimator = ExtraTreesClassifier(n_estimators=50).fit(X_train, y_train)
#estimator.feature_importances_
#selector = SelectFromModel(estimator, prefit=True)
#X_train = pd.DataFrame(X_train)

In [74]:
#estimator = AdaBoostRegressor(random_state=0, n_estimators=50).fit(X_train, y_train)
#selector = SelectFromModel(estimator, prefit=True)
#selector.get_support()
#X_train = pd.DataFrame(X_train)

### Lasso for feature selection

In [75]:
estimator = Lasso(alpha=0.01).fit(X_train, y_train)
selector = SelectFromModel(estimator, prefit=True)
selector.get_support()
X_train = pd.DataFrame(X_train)

In [76]:
selected_feat = X_train.columns[(selector.get_support())]
print('total features: {}'.format((X_train.shape[1])))
print(f'selected features: {len(selected_feat)}')
print(selected_feat)

total features: 12
selected features: 3
Int64Index([2, 8, 11], dtype='int64')


In [77]:
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)
X_train_selected.shape, X_test_selected.shape

((5876, 3), (1469, 3))

## Fitting Machine Learning Models

### Let's create a list of models

In [78]:
seed = 42
list_of_models = [XGBClassifier(seed=seed), 
                  LogisticRegression(random_state=seed, solver='liblinear',max_iter=10000), 
                  DecisionTreeClassifier(random_state=seed),
                  SVC(random_state=seed,kernel='sigmoid')]

### Predictions without feature selection

In [79]:
results_without_fs= pd.DataFrame({"Symbol":[],"Feature selection":[], "Model":[], 
                                  "Cross Val Score":[],"Accuracy":[],"F1-Score":[],"AUC":[]})

for i in range(len(list_of_models)):
    
    model = list_of_models[i]
    #print(y_train)
    model.fit(X_train, y_train)


    # make predictions for test data
    y_pred = model.predict(X_test)
    #print(y_pred)
    
    # evaluate predictions
    scores = cross_val_score(model, X_train, y_train, cv=5)
    accuracy = accuracy_score(y_test, y_pred)
    f1_score_final = f1_score(y_test, y_pred, average='macro',zero_division=1)
    roc_auc_score_final = roc_auc_score(y_test, y_pred)
    new_row =[symbol,"No",
              list_of_models[i].__class__.__name__,
              round((scores.mean()*100),2),
              round((accuracy*100),2),
              round((f1_score_final*100),2),
              round((roc_auc_score_final*100),2)]
    results_without_fs.loc[i] = new_row

print(results_without_fs)
total_results = total_results.append(results_without_fs)

        Symbol Feature selection                   Model  Cross Val Score  \
0  GFNORTEO.MX                No           XGBClassifier            54.22   
1  GFNORTEO.MX                No      LogisticRegression            56.84   
2  GFNORTEO.MX                No  DecisionTreeClassifier            53.66   
3  GFNORTEO.MX                No                     SVC            50.71   

   Accuracy  F1-Score    AUC  
0     56.50     54.39  54.50  
1     57.79     43.70  51.43  
2     54.12     53.24  53.27  
3     48.74     47.50  47.50  


### Predictions with feature selection: Lasso

In [80]:
results_with_fs= pd.DataFrame({"Symbol":[],"Feature selection":[], "Model":[], 
                                  "Cross Val Score":[],"Accuracy":[],"F1-Score":[],"AUC":[]})

for i in range(len(list_of_models)):
    
    
    model = list_of_models[i]
    model.fit(X_train_selected, y_train)


    # make predictions for test data
    y_pred = model.predict(X_test_selected)
    #print(y_pred)
    
    # evaluate predictions
    scores = cross_val_score(model, X_train_selected, y_train, cv=5)
    accuracy = accuracy_score(y_test, y_pred)
    f1_score_final = f1_score(y_test, y_pred, average='macro',zero_division=1)
    roc_auc_score_final = roc_auc_score(y_test, y_pred)
    new_row =[symbol,"Yes",
              list_of_models[i].__class__.__name__,
              round((scores.mean()*100),2),
              round((accuracy*100),2),
              round((f1_score_final*100),2),
              round((roc_auc_score_final*100),2)]
    results_with_fs.loc[i] = new_row
    
print(results_with_fs)
total_results = total_results.append(results_with_fs)

        Symbol Feature selection                   Model  Cross Val Score  \
0  GFNORTEO.MX               Yes           XGBClassifier            55.00   
1  GFNORTEO.MX               Yes      LogisticRegression            57.13   
2  GFNORTEO.MX               Yes  DecisionTreeClassifier            52.01   
3  GFNORTEO.MX               Yes                     SVC            51.38   

   Accuracy  F1-Score    AUC  
0     53.57     49.71  50.55  
1     58.13     43.69  51.69  
2     52.42     51.67  51.72  
3     50.65     49.45  49.45  


In [81]:
total_results

Unnamed: 0,Symbol,Feature selection,Model,Cross Val Score,Accuracy,F1-Score,AUC
0,EWW,No,XGBClassifier,54.27,54.25,50.06,50.72
1,EWW,No,LogisticRegression,58.53,58.66,38.15,50.08
2,EWW,No,DecisionTreeClassifier,52.58,50.7,49.38,49.38
3,EWW,No,SVC,50.8,50.23,48.67,48.67
0,EWW,Yes,XGBClassifier,54.23,54.65,47.7,49.78
1,EWW,Yes,LogisticRegression,58.82,58.39,37.31,49.73
2,EWW,Yes,DecisionTreeClassifier,51.04,53.65,52.06,52.06
3,EWW,Yes,SVC,52.78,50.9,49.18,49.19
0,AMXL.MX,No,XGBClassifier,68.66,69.57,50.46,52.06
1,AMXL.MX,No,LogisticRegression,71.44,72.64,42.54,50.02


In [82]:
total_results.to_csv(path +'total_results.csv')