### Finance data analysis: binary classification
This project uses machine learning techniques to perform binary classification on financial data and predict whether a stock is worth buying.
Dataset: 200+ Financial Indicators of US stocks (2014-2018).
The last column of each dataset, 'class', holds a binary label for each stock. From a trading perspective, 1 identifies the stocks that a hypothetical trader should buy at the start of the year and sell at the end of the year for a profit, while 0 identifies the stocks that a hypothetical trader should not buy, since their value will decrease, meaning a loss of capital.

I use these datasets to train several machine learning models:

-Logistic Regression

-Support Vector Machine

-Decision Tree

Before training each model, I apply 5-fold cross-validation to find its best parameters.


In [None]:
#Import the libraries
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler

In [None]:
def getData(file, file2, year):
    """Load a train/test pair of CSV files and return standardized arrays.

    Parameters:
        file:  path of the CSV holding the training set.
        file2: path of the CSV holding the test set.
        year:  year of the training set; any value other than 2014 triggers
               random oversampling of the training data.

    Returns:
        (traindata, trainlabel, testdata, testlabel) where the data arrays
        are float and the label arrays are int.
    """
    train_raw = np.array(pd.read_csv(file))
    test_raw = np.array(pd.read_csv(file2))

    # Features skip column 0 (the ID) and the last four columns; the label
    # is read from the last column.
    feature_cols = slice(1, -4)
    Xtr, Ytr = train_raw[:, feature_cols], train_raw[:, -1]
    Xte, Yte = test_raw[:, feature_cols], test_raw[:, -1]

    # Rebalance the classes by random oversampling for every training year
    # except 2014 (presumably 2014 is already balanced -- TODO confirm).
    if year != 2014:
        sampler = RandomOverSampler(random_state=0)
        Xtr, Ytr = sampler.fit_resample(Xtr, Ytr.astype(np.int32))

    # Standardize the features: the scaler is fitted on the training set only
    # and then applied unchanged to the test set.
    scaler = preprocessing.StandardScaler().fit(Xtr)
    traindata = np.array(scaler.transform(Xtr)).astype(float)
    testdata = np.array(scaler.transform(Xte)).astype(float)

    trainlabel = np.array(Ytr).astype(int)
    testlabel = np.array(Yte).astype(int)
    return traindata, trainlabel, testdata, testlabel

In [None]:
#Function to evaluate my classifiers
def evaluate(Yte,Ypred):
    """Return the accuracy score of the predictions Ypred against the labels Yte."""
    score = accuracy_score(Yte, Ypred)
    return score

In [None]:
#Logistic Regression
def LogRegression(Xtr,Ytr,Xte,Yte):
    """Tune a logistic regression by grid search and score it on the test set.

    A 5-fold cross-validated grid search over ``C`` (seven log-spaced values
    from 1e-3 to 1e3) with an L2 penalty is fitted on (Xtr, Ytr). The best
    estimator then predicts Xte, the selected parameters are printed, and
    the result of ``evaluate(Yte, predictions)`` is returned.
    """
    search_space = {
        "C": np.logspace(-3, 3, 7),
        "penalty": ["l2"],
    }
    base_model = LogisticRegression(solver='lbfgs', max_iter=10000)

    # 5 fold cross validation
    search = GridSearchCV(base_model, search_space, cv=5)
    search.fit(Xtr, Ytr)

    best_model = search.best_estimator_
    Ypred = best_model.predict(Xte)
    print("best parameters:", search.best_params_)
    return evaluate(Yte, Ypred)

In [None]:
#Decision Tree
def decision_tree(Xtr,Ytr,Xte,Yte):
    """Tune a decision tree by grid search and score it on the test set.

    A 5-fold cross-validated grid search over ``max_depth`` (1, 11, 21, 31,
    41) is fitted on (Xtr, Ytr). The best estimator then predicts Xte, the
    selected parameters are printed, and the result of
    ``evaluate(Yte, predictions)`` is returned.
    """
    depth_candidates = np.arange(1, 50, 10)
    base_model = DecisionTreeClassifier(random_state=1)

    # 5 fold cross validation
    search = GridSearchCV(
        base_model,
        {'max_depth': depth_candidates},
        verbose=1,
        cv=5,
    )
    search.fit(Xtr, Ytr)

    best_model = search.best_estimator_
    Ypred = best_model.predict(Xte)
    print("best parameters:", search.best_params_)
    return evaluate(Yte, Ypred)

In [None]:
#Support Vector Machine
def SVM(Xtr,Ytr,Xte,Yte):
    """Tune an RBF-kernel SVM by grid search and score it on the test set.

    A 5-fold cross-validated grid search over ``C`` (1, 10, 100, 1000) and
    ``gamma`` (0.01, 0.1, 1, 10) is fitted on (Xtr, Ytr). The best estimator
    then predicts Xte, the selected parameters are printed, and the result
    of ``evaluate(Yte, predictions)`` is returned.
    """
    search_space = {
        'C': np.logspace(0, 3, 4),
        'gamma': np.logspace(-2, 1, 4),
    }
    base_model = SVC(kernel='rbf')

    # 5 fold cross validation
    search = GridSearchCV(base_model, search_space, verbose=1, cv=5)
    search.fit(Xtr, Ytr)

    best_model = search.best_estimator_
    Ypred = best_model.predict(Xte)
    print("best parameters:", search.best_params_)
    return evaluate(Yte, Ypred)

In [None]:
def training_models(traindata,trainlabel,testdata,testlabel):
    """Train and score the three classifiers on one train/test split.

    The decision tree, the logistic regression and the SVM are run in that
    order on the same arguments.

    Returns:
        A 3-tuple (acc_tree, acc_lr, acc_svm) with the value returned by
        each classifier function.
    """
    split = (traindata, trainlabel, testdata, testlabel)
    classifiers = (decision_tree, LogRegression, SVM)
    return tuple(run(*split) for run in classifiers)

In [None]:
# Analysis 2014-2015: fit the models on the 2014 data, evaluate on the 2015 data.
file1 = 'Data Preprocessing/2014_preprocessed.csv'
file2 = 'Data Preprocessing/2015_preprocessed.csv'
traindata, trainlabel, testdata, testlabel = getData(file1, file2, 2014)

print("Training models and evaluations")
# To measure training-set accuracy instead, pass traindata/trainlabel in
# place of testdata/testlabel in the call below.
acc_tree, acc_lr, acc_svm = training_models(traindata, trainlabel, testdata, testlabel)

for caption, score in (
    ("Accurancy tree: ", acc_tree),
    ("Accurancy lr: ", acc_lr),
    ("Accurancy svm: ", acc_svm),
):
    print(caption, score)

In [None]:
# Analysis 2015-2016: fit the models on the 2015 data, evaluate on the 2016 data.
file3 = 'Data Preprocessing/2015_preprocessed.csv'
file4 = 'Data Preprocessing/2016_preprocessed.csv'
traindata, trainlabel, testdata, testlabel = getData(file3, file4, 2015)

print("Training models and evaluations")
# To measure training-set accuracy instead, pass traindata/trainlabel in
# place of testdata/testlabel in the call below.
acc_tree, acc_lr, acc_svm = training_models(traindata, trainlabel, testdata, testlabel)

for caption, score in (
    ("Accurancy tree: ", acc_tree),
    ("Accurancy lr: ", acc_lr),
    ("Accurancy svm: ", acc_svm),
):
    print(caption, score)

In [None]:
# Analysis 2016-2017: fit the models on the 2016 data, evaluate on the 2017 data.
file5 = 'Data Preprocessing/2016_preprocessed.csv'
file6 = 'Data Preprocessing/2017_preprocessed.csv'
traindata, trainlabel, testdata, testlabel = getData(file5, file6, 2016)

print("Training models and evaluations")
# To measure training-set accuracy instead, pass traindata/trainlabel in
# place of testdata/testlabel in the call below.
acc_tree, acc_lr, acc_svm = training_models(traindata, trainlabel, testdata, testlabel)

for caption, score in (
    ("Accurancy tree: ", acc_tree),
    ("Accurancy lr: ", acc_lr),
    ("Accurancy svm: ", acc_svm),
):
    print(caption, score)

In [None]:
# Analysis 2017-2018: fit the models on the 2017 data, evaluate on the 2018 data.
file7 = 'Data Preprocessing/2017_preprocessed.csv'
file8 = 'Data Preprocessing/2018_preprocessed.csv'
traindata, trainlabel, testdata, testlabel = getData(file7, file8, 2017)

print("Training models and evaluations")
# To measure training-set accuracy instead, pass traindata/trainlabel in
# place of testdata/testlabel in the call below.
acc_tree, acc_lr, acc_svm = training_models(traindata, trainlabel, testdata, testlabel)

for caption, score in (
    ("Accurancy tree: ", acc_tree),
    ("Accurancy lr: ", acc_lr),
    ("Accurancy svm: ", acc_svm),
):
    print(caption, score)

In [None]:
# Compare the algorithms by the mean of their per-run scores.
# NOTE(review): the values below are hard-coded, presumably the rounded
# accuracy percentages of the four year-pair runs above -- confirm.
lr = np.array([46, 52, 49, 52])
svm = np.array([41, 66, 27, 31])
tree = np.array([47, 62, 41, 44])

# Printed in the order: logistic regression, SVM, decision tree.
for scores in (lr, svm, tree):
    print(np.mean(scores))