# Importing libraries

In [1]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, auc, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings('ignore')
import datetime

# Importing data set

In [2]:
#Importing the dataset
df = pd.read_csv('000001.XSHE.csv')
df.head()       #df - raw dataset

Unnamed: 0.1,Unnamed: 0,open,close,high,low,volume,money,avg,high_limit,low_limit,pre_close,paused,factor
0,2005-01-04,1.41,1.4,1.41,1.39,8204283,11465603.0,1.4,1.56,1.27,1.41,0,0.214624
1,2005-01-05,1.4,1.39,1.41,1.36,15013006,20718559.0,1.38,1.54,1.26,1.4,0,0.214624
2,2005-01-06,1.4,1.4,1.41,1.38,12423677,17333840.0,1.4,1.53,1.25,1.39,0,0.214624
3,2005-01-07,1.41,1.4,1.42,1.39,8788185,12302853.0,1.4,1.54,1.26,1.4,0,0.214624
4,2005-01-10,1.4,1.41,1.41,1.37,12263592,17111498.0,1.4,1.54,1.26,1.4,0,0.214624


# Data preprocessing

In [3]:
#Renaming the unnamed columns
newdf = df.rename(columns={'Unnamed: 0':'Date','volume':'volume(x10^8)','money':'money(x10^8)'})

for i in range(df.shape[0]):
    newdf['volume(x10^8)'] = df['volume']/(10**8)
    newdf['money(x10^8)']  = df['money']/(10**8)

## i) Checking and Handling Missing values 

In [4]:
#Checking for missing values
start = datetime.date(2005,1,1) 
K = (2023-2005)*365 + 4 #No.of days
res = []                #List to store days from Jan 1 2005 to Jan 1 2023
for day in range(K):
	date = (start + datetime.timedelta(days = day)).isoformat()
	res.append(date)
aclist = list(newdf['Date'])     #List of days actually present in data set
id=0
msgdata = []     # List of Missing data and their indices
for i in res:
    msgele = []
    if i not in aclist:
        msgele.append(i)
        msgele.append(id)
        print(f"{i}(index-{id}) has missing data")
        msgdata.append(msgele)
    id+=1

2005-01-01(index-0) has missing data
2005-01-02(index-1) has missing data
2005-01-03(index-2) has missing data
2005-01-08(index-7) has missing data
2005-01-09(index-8) has missing data
2005-01-15(index-14) has missing data
2005-01-16(index-15) has missing data
2005-01-22(index-21) has missing data
2005-01-23(index-22) has missing data
2005-01-29(index-28) has missing data
2005-01-30(index-29) has missing data
2005-02-05(index-35) has missing data
2005-02-06(index-36) has missing data
2005-02-07(index-37) has missing data
2005-02-08(index-38) has missing data
2005-02-09(index-39) has missing data
2005-02-10(index-40) has missing data
2005-02-11(index-41) has missing data
2005-02-12(index-42) has missing data
2005-02-13(index-43) has missing data
2005-02-14(index-44) has missing data
2005-02-15(index-45) has missing data
2005-02-19(index-49) has missing data
2005-02-20(index-50) has missing data
2005-02-26(index-56) has missing data
2005-02-27(index-57) has missing data
2005-03-05(index-

2014-07-13(index-3480) has missing data
2014-07-19(index-3486) has missing data
2014-07-20(index-3487) has missing data
2014-07-26(index-3493) has missing data
2014-07-27(index-3494) has missing data
2014-08-02(index-3500) has missing data
2014-08-03(index-3501) has missing data
2014-08-09(index-3507) has missing data
2014-08-10(index-3508) has missing data
2014-08-16(index-3514) has missing data
2014-08-17(index-3515) has missing data
2014-08-23(index-3521) has missing data
2014-08-24(index-3522) has missing data
2014-08-30(index-3528) has missing data
2014-08-31(index-3529) has missing data
2014-09-06(index-3535) has missing data
2014-09-07(index-3536) has missing data
2014-09-08(index-3537) has missing data
2014-09-13(index-3542) has missing data
2014-09-14(index-3543) has missing data
2014-09-20(index-3549) has missing data
2014-09-21(index-3550) has missing data
2014-09-27(index-3556) has missing data
2014-09-28(index-3557) has missing data
2014-10-01(index-3560) has missing data


2020-04-04(index-5572) has missing data
2020-04-05(index-5573) has missing data
2020-04-06(index-5574) has missing data
2020-04-11(index-5579) has missing data
2020-04-12(index-5580) has missing data
2020-04-18(index-5586) has missing data
2020-04-19(index-5587) has missing data
2020-04-25(index-5593) has missing data
2020-04-26(index-5594) has missing data
2020-05-01(index-5599) has missing data
2020-05-02(index-5600) has missing data
2020-05-03(index-5601) has missing data
2020-05-04(index-5602) has missing data
2020-05-05(index-5603) has missing data
2020-05-09(index-5607) has missing data
2020-05-10(index-5608) has missing data
2020-05-16(index-5614) has missing data
2020-05-17(index-5615) has missing data
2020-05-23(index-5621) has missing data
2020-05-24(index-5622) has missing data
2020-05-30(index-5628) has missing data
2020-05-31(index-5629) has missing data
2020-06-06(index-5635) has missing data
2020-06-07(index-5636) has missing data
2020-06-13(index-5642) has missing data


In [5]:
#Handling missing values

#Calculating mean of non-missing values
open_mean = newdf['open'].mean()
close_mean = newdf['close'].mean()
high_mean = newdf['high'].mean()
low_mean = newdf['low'].mean()
volume_mean = newdf['volume(x10^8)'].mean()
money_mean = newdf['money(x10^8)'].mean()
avg_mean = newdf['avg'].mean()
highlimit_mean = newdf['high_limit'].mean()
lowlimit_mean = newdf['low_limit'].mean()
preclose_mean = newdf['pre_close'].mean()
paused_mean = newdf['paused'].mean()
factor_mean = newdf['factor'].mean()

In [6]:
# Function to insert row in the dataframe
def Insert_row_(row_number, df, row_value):
    # Slice the upper half of the dataframe
    df1 = df[0:row_number]
  
    # Store the result of lower half of the dataframe
    df2 = df[row_number:]
  
    # Insert the row in the upper half dataframe
    df1.loc[row_number]=row_value
  
    # Concat the two dataframes
    df_result = pd.concat([df1, df2])
  
    # Reassign the index labels
    df_result.index = [*range(df_result.shape[0])]
  
    # Return the updated dataframe
    return df_result

record=[]
newdf1 = newdf

for data in msgdata:
    record.append(data[0])
    record.append(open_mean)
    record.append(close_mean)
    record.append(high_mean)
    record.append(low_mean)
    record.append(volume_mean)
    record.append(money_mean)
    record.append(avg_mean)
    record.append(highlimit_mean)
    record.append(lowlimit_mean)
    record.append(preclose_mean)
    record.append(paused_mean)
    record.append(factor_mean)
    newdf1 = Insert_row_(data[1],newdf1,record)
    record=[]

In [7]:
# Reformatting dates
newdf1['Date'] = pd.to_datetime(newdf1['Date'],format='%Y-%m-%d')
newdf1['Date'] = newdf1['Date'].dt.strftime('%d-%m-%Y')
newdf1.head()

Unnamed: 0,Date,open,close,high,low,volume(x10^8),money(x10^8),avg,high_limit,low_limit,pre_close,paused,factor
0,01-01-2005,8.138029,8.146838,8.269845,8.018455,1.110067,9.68249,8.144026,57979.041792,7.348496,8.144223,0.033836,0.570852
1,02-01-2005,8.138029,8.146838,8.269845,8.018455,1.110067,9.68249,8.144026,57979.041792,7.348496,8.144223,0.033836,0.570852
2,03-01-2005,8.138029,8.146838,8.269845,8.018455,1.110067,9.68249,8.144026,57979.041792,7.348496,8.144223,0.033836,0.570852
3,04-01-2005,1.41,1.4,1.41,1.39,0.082043,0.114656,1.4,1.56,1.27,1.41,0.0,0.214624
4,05-01-2005,1.4,1.39,1.41,1.36,0.15013,0.207186,1.38,1.54,1.26,1.4,0.0,0.214624


## ii) Checking and Handling Null values 

In [8]:
#Checking for null values
for i in newdf1.columns:
    if(newdf1[i].isnull().sum()!=0):      
        print(f'Column {i} has {newdf1[i].isnull().sum()} null value(s)')
    else:
        print(f'Column {i} has no null value(s)')

Column Date has no null value(s)
Column open has no null value(s)
Column close has no null value(s)
Column high has no null value(s)
Column low has no null value(s)
Column volume(x10^8) has no null value(s)
Column money(x10^8) has no null value(s)
Column avg has no null value(s)
Column high_limit has no null value(s)
Column low_limit has no null value(s)
Column pre_close has no null value(s)
Column paused has no null value(s)
Column factor has no null value(s)


In [9]:
#Handling null values
for i in newdf1.columns:
    if(newdf1[i].isnull().sum()!=0):      
        for j in range(newdf1.shape[0]):
            if(newdf1[i][j].isna()==True):
                if(i=='open'):
                    newdf1[i][j] = open_mean
                elif(i=='close'):
                    newdf1[i][j] = close_mean
                elif(i=='high'):
                    newdf1[i][j] = high_mean
                elif(i=='low'):
                    newdf1[i][j] = low_mean
                elif(i=='volume(x10^8)'):
                    newdf1[i][j] = volume_mean
                elif(i=='money(x10^8)'):
                    newdf1[i][j] = money_mean
                elif(i=='avg'):
                    newdf1[i][j] = avg_mean
                elif(i=='high_limit'):
                    newdf1[i][j] = highlimit_mean
                elif(i=='low_limit'):
                    newdf1[i][j] = lowlimit_mean
                elif(i=='pre_close'):
                    newdf1[i][j] = preclose_mean
                elif(i=='paused'):
                    newdf1[i][j] = paused_mean
                else:
                    newdf1[i][j] = factor_mean
    else:
        continue
        
newdf1.head()

Unnamed: 0,Date,open,close,high,low,volume(x10^8),money(x10^8),avg,high_limit,low_limit,pre_close,paused,factor
0,01-01-2005,8.138029,8.146838,8.269845,8.018455,1.110067,9.68249,8.144026,57979.041792,7.348496,8.144223,0.033836,0.570852
1,02-01-2005,8.138029,8.146838,8.269845,8.018455,1.110067,9.68249,8.144026,57979.041792,7.348496,8.144223,0.033836,0.570852
2,03-01-2005,8.138029,8.146838,8.269845,8.018455,1.110067,9.68249,8.144026,57979.041792,7.348496,8.144223,0.033836,0.570852
3,04-01-2005,1.41,1.4,1.41,1.39,0.082043,0.114656,1.4,1.56,1.27,1.41,0.0,0.214624
4,05-01-2005,1.4,1.39,1.41,1.36,0.15013,0.207186,1.38,1.54,1.26,1.4,0.0,0.214624


## iii) Checking duplicate values 

In [10]:
#Checking for duplicate values

#In terms of stock, the values would be duplicates only when date as well as other attributes are same too.
#Same attributes for different dates doesn't imply duplicate values

if(newdf['Date'].duplicated().sum()!=0):      
    print(f'Data set has {newdf[i].duplicated().sum()} duplicate value(s)')
else:
    print('Data set has no duplicate value(s)')

Data set has no duplicate value(s)


## iv) Checking garbage values 

In [11]:
#Checking for garbage values
for i in newdf.columns :
    print(newdf[i].value_counts())
    print('//'*10)

2005-01-04    1
2016-12-30    1
2017-01-10    1
2017-01-09    1
2017-01-06    1
             ..
2011-01-12    1
2011-01-13    1
2011-01-14    1
2011-01-17    1
2022-12-30    1
Name: Date, Length: 4374, dtype: int64
////////////////////
5.39     53
1.35     29
6.16     28
1.24     26
4.87     23
         ..
14.97     1
5.70      1
14.65     1
14.39     1
12.76     1
Name: open, Length: 1455, dtype: int64
////////////////////
5.39     51
1.24     27
6.16     26
1.35     26
1.33     22
         ..
15.52     1
15.48     1
15.43     1
9.85      1
12.47     1
Name: close, Length: 1456, dtype: int64
////////////////////
5.39     51
1.35     29
6.16     27
1.24     23
1.37     19
         ..
12.94     1
13.70     1
13.51     1
12.02     1
12.73     1
Name: high, Length: 1457, dtype: int64
////////////////////
5.39     53
1.33     29
1.32     26
6.16     22
1.24     20
         ..
11.10     1
10.08     1
13.09     1
14.90     1
12.65     1
Name: low, Length: 1406, dtype: int64
/////////////////

## v) Extracting required subset of data(8 years data) 

In [12]:
#Extracting required subset of data from dataset

start_index = list(newdf1['Date']).index('01-01-2010')
end_index = list(newdf1['Date']).index('01-01-2018')

temp_date_list = list(newdf1['Date'][start_index:end_index+1])    # List of dates from Jan 1 2010 to Jan 1 2018
#print(temp_date_list)

required_df = newdf1.loc[newdf1['Date'].isin(temp_date_list)]
required_df = required_df.reset_index()
required_df = required_df.drop(columns=['index'])
required_df    #This is our required subset

Unnamed: 0,Date,open,close,high,low,volume(x10^8),money(x10^8),avg,high_limit,low_limit,pre_close,paused,factor
0,01-01-2010,8.138029,8.146838,8.269845,8.018455,1.110067,9.682490,8.144026,57979.041792,7.348496,8.144223,0.033836,0.570852
1,02-01-2010,8.138029,8.146838,8.269845,8.018455,1.110067,9.682490,8.144026,57979.041792,7.348496,8.144223,0.033836,0.570852
2,03-01-2010,8.138029,8.146838,8.269845,8.018455,1.110067,9.682490,8.144026,57979.041792,7.348496,8.144223,0.033836,0.570852
3,04-01-2010,7.550000,7.300000,7.570000,7.290000,0.785628,5.802495,7.380000,8.260000,6.750000,7.500000,0.000000,0.307935
4,05-01-2010,7.310000,7.170000,7.360000,7.010000,1.807197,12.934770,7.160000,8.030000,6.570000,7.300000,0.000000,0.307935
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2918,28-12-2017,12.110000,12.040000,12.270000,11.870000,1.703524,20.529445,12.050000,13.330000,10.900000,12.120000,0.000000,0.911657
2919,29-12-2017,12.040000,12.130000,12.240000,11.940000,1.078163,13.032220,12.090000,13.250000,10.840000,12.040000,0.000000,0.911657
2920,30-12-2017,8.138029,8.146838,8.269845,8.018455,1.110067,9.682490,8.144026,57979.041792,7.348496,8.144223,0.033836,0.570852
2921,31-12-2017,8.138029,8.146838,8.269845,8.018455,1.110067,9.682490,8.144026,57979.041792,7.348496,8.144223,0.033836,0.570852


## vi) Processing extreme values 

In [13]:
#Handling with outliers(extreme values)

def preprocess_with_dmam_and_zscore(x, n):
    # Calculate the median of the sequence
    xm = np.median(x)
    
    # Calculate the absolute differences from the median
    dm = np.abs(x - xm)
    
    # Calculate the median absolute deviation (DMAD)
    dmad = np.median(dm)
    
    # Calculate z-scores for each value
    z_scores = (x - np.mean(x)) / np.std(x)
    
    processed_x = np.empty_like(x)  # Create an empty array to store processed values
    
    for i, (xi, z_score) in enumerate(zip(x, z_scores)):
        if xi >= xm + n * dmad:
            processed_x[i] = xm + n * dmad
        elif xi <= xm - n * dmad:
            processed_x[i] = xm - n * dmad
        else:
            processed_x[i] = xi
    #print(type(processed_x))
    return processed_x

ppdf = required_df
for col in ppdf:
    if(col=='Date'):
        continue
    else:
        ppdf[col] = preprocess_with_dmam_and_zscore(ppdf[col],1)
ppdf.head()       # We have processed the extreme values 

Unnamed: 0,Date,open,close,high,low,volume(x10^8),money(x10^8),avg,high_limit,low_limit,pre_close,paused,factor
0,01-01-2010,8.138029,8.146838,8.269845,8.018455,1.110067,9.68249,8.144026,12.56,7.348496,8.144223,0.0,0.570852
1,02-01-2010,8.138029,8.146838,8.269845,8.018455,1.110067,9.68249,8.144026,12.56,7.348496,8.144223,0.0,0.570852
2,03-01-2010,8.138029,8.146838,8.269845,8.018455,1.110067,9.68249,8.144026,12.56,7.348496,8.144223,0.0,0.570852
3,04-01-2010,7.55,7.3,7.57,7.29,0.819264,6.341564,7.38,8.26,6.75,7.5,0.0,0.40382
4,05-01-2010,7.31,7.29,7.38,7.19,1.400869,12.93477,7.288052,8.03,6.576991,7.3,0.0,0.40382


## vii) Creating new column to identify stock trend direction (Target variable) 

In [14]:

no_of_cols = ppdf.shape[0]     #No. of columns
trend_direction=[]
for n in range(no_of_cols):
    i = ppdf['close'][n]
    j = ppdf['pre_close'][n]
    ratio = ((i-j)/j)
    if(ratio>0):
        trend_direction.append(1)
    else:
        trend_direction.append(0)
        
ppdf['Trend Direction'] = trend_direction
ppdf.head()     # This is the preprocessed data set without feature selection

Unnamed: 0,Date,open,close,high,low,volume(x10^8),money(x10^8),avg,high_limit,low_limit,pre_close,paused,factor,Trend Direction
0,01-01-2010,8.138029,8.146838,8.269845,8.018455,1.110067,9.68249,8.144026,12.56,7.348496,8.144223,0.0,0.570852,1
1,02-01-2010,8.138029,8.146838,8.269845,8.018455,1.110067,9.68249,8.144026,12.56,7.348496,8.144223,0.0,0.570852,1
2,03-01-2010,8.138029,8.146838,8.269845,8.018455,1.110067,9.68249,8.144026,12.56,7.348496,8.144223,0.0,0.570852,1
3,04-01-2010,7.55,7.3,7.57,7.29,0.819264,6.341564,7.38,8.26,6.75,7.5,0.0,0.40382,0
4,05-01-2010,7.31,7.29,7.38,7.19,1.400869,12.93477,7.288052,8.03,6.576991,7.3,0.0,0.40382,0


# Feature selection and Dropping irrelevant columns

In [15]:
#Splitting target class 
X = ppdf.drop(columns=['Trend Direction','Date']) 
y = ppdf['Trend Direction']

#Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## i) SVM-RFE method 

In [17]:
# SVM-RFE feature selection
def svm_rfe_feature_selection(X, y, n_features_to_select):
    svm = SVC(kernel="linear")
    rfe = RFE(estimator=svm, n_features_to_select=n_features_to_select, step=1)
    rfe.fit(X, y)
    selected_features = X.columns[rfe.support_]
    return selected_features

# Set the number of features to select for SVM-RFE
n_features_to_select_svm = 7
selected_features_svm = svm_rfe_feature_selection(X, y, n_features_to_select_svm)
print("Selected features using SVM-RFE:")
selected_features_svm

Selected features using SVM-RFE:


Index(['open', 'close', 'high', 'avg', 'low_limit', 'pre_close', 'factor'], dtype='object')

In [18]:
svmdf = ppdf
svmdf = svmdf.drop(columns=['high','low','volume(x10^8)','money(x10^8)','paused'])
svmdf.head()      # This is the data set for model with SVM-RFE feature selection

Unnamed: 0,Date,open,close,avg,high_limit,low_limit,pre_close,factor,Trend Direction
0,01-01-2010,8.138029,8.146838,8.144026,12.56,7.348496,8.144223,0.570852,1
1,02-01-2010,8.138029,8.146838,8.144026,12.56,7.348496,8.144223,0.570852,1
2,03-01-2010,8.138029,8.146838,8.144026,12.56,7.348496,8.144223,0.570852,1
3,04-01-2010,7.55,7.3,7.38,8.26,6.75,7.5,0.40382,0
4,05-01-2010,7.31,7.29,7.288052,8.03,6.576991,7.3,0.40382,0


## ii) RF based Feature Selection method 

In [19]:
# Random Forest-based feature selection
def rf_feature_selection(X, y, threshold):
    rf = RandomForestClassifier(n_estimators=50, random_state=20)
    rf.fit(X, y)
    feature_importances = rf.feature_importances_
    selected_features = X.columns[feature_importances > threshold]
    return selected_features

# Set the threshold for feature selection using Random Forest
threshold_rf = 0.05  # Adjust this threshold as needed
selected_features_rf = rf_feature_selection(X, y, threshold_rf)
print("\nSelected features using Random Forest:")
print(selected_features_rf)


Selected features using Random Forest:
Index(['open', 'close', 'high', 'low', 'avg', 'high_limit', 'pre_close'], dtype='object')


In [20]:
rfdf = ppdf
rfdf = rfdf.drop(columns=['factor','low_limit','volume(x10^8)','money(x10^8)','paused'])
rfdf.head()      # This is the data set for model with RF based feature selection

Unnamed: 0,Date,open,close,high,low,avg,high_limit,pre_close,Trend Direction
0,01-01-2010,8.138029,8.146838,8.269845,8.018455,8.144026,12.56,8.144223,1
1,02-01-2010,8.138029,8.146838,8.269845,8.018455,8.144026,12.56,8.144223,1
2,03-01-2010,8.138029,8.146838,8.269845,8.018455,8.144026,12.56,8.144223,1
3,04-01-2010,7.55,7.3,7.57,7.29,7.38,8.26,7.5,0
4,05-01-2010,7.31,7.29,7.38,7.19,7.288052,8.03,7.3,0


In [21]:
#SVM-RFE feature selection applied training and test sets 
SVM_X = svmdf.drop(columns=['Trend Direction','Date']) 
SVM_y = svmdf['Trend Direction']
SVM_X_train, SVM_X_test, SVM_y_train, SVM_y_test = train_test_split(SVM_X, SVM_y, test_size=0.2)

#RF based feature selection applied training and test sets 
RF_X = rfdf.drop(columns=['Trend Direction','Date']) 
RF_y = rfdf['Trend Direction']
RF_X_train, RF_X_test, RF_y_train, RF_y_test = train_test_split(RF_X, RF_y, test_size=0.2)

# Training the model and Fitting the data

## i) SVM 

In [22]:
#SVM models

svm_model_1 = SVC(kernel='rbf',probability=True) #SVM model trained with data with no feature selection
svm_model_2 = SVC(kernel='rbf',probability=True) #SVM model trained with data with SVM-RFE feature selection
svm_model_3 = SVC(kernel='rbf',probability=True) #SVM model trained with data with RF based feature selection

svm_model_1.fit(X_train, y_train)
svm_model_2.fit(SVM_X_train, SVM_y_train)
svm_model_3.fit(RF_X_train, RF_y_train)

svm_predictions_1 = svm_model_1.predict(X_test)
svm_predictions_2 = svm_model_2.predict(SVM_X_test)
svm_predictions_3 = svm_model_3.predict(RF_X_test)

## ii) RF 

In [23]:
#RF models

rf_model_1 = RandomForestClassifier(n_estimators=100, random_state=42) #RF model trained with data with no feature selection
rf_model_2 = RandomForestClassifier(n_estimators=100, random_state=42) #RF model trained with data with SVM-RFE feature selection
rf_model_3 = RandomForestClassifier(n_estimators=100, random_state=42) #RF model trained with data with RF based feature selection

rf_model_1.fit(X_train, y_train)
rf_model_2.fit(SVM_X_train, SVM_y_train)
rf_model_3.fit(RF_X_train, RF_y_train)

rf_predictions_1 = rf_model_1.predict(X_test)
rf_predictions_2 = rf_model_2.predict(SVM_X_test)
rf_predictions_3 = rf_model_3.predict(RF_X_test)

## iii) ANN 

In [24]:
#ANN models

X1 = X_train.values  
y1 = y_train.values.reshape(-1, 1)  

X2 = SVM_X_train.values  
y2 = SVM_y_train.values.reshape(-1, 1)  

X3 = RF_X_train.values  
y3 = RF_y_train.values.reshape(-1, 1)  

# ANN model trained with data with no feature selection
nn_model_1 = MLPClassifier(hidden_layer_sizes=(X1.shape[1], X1.shape[1] // 2), activation='logistic', solver='sgd', learning_rate='constant', learning_rate_init=0.1, max_iter=1000)
nn_model_1.fit(X1, y1)

# ANN model trained with data with SVM-RFE feature selection
nn_model_2 = MLPClassifier(hidden_layer_sizes=(X2.shape[1], X2.shape[1] // 2), activation='logistic', solver='sgd', learning_rate='constant', learning_rate_init=0.1, max_iter=1000)
nn_model_2.fit(X2, y2)

# ANN model trained with data with RF based feature selection
nn_model_3 = MLPClassifier(hidden_layer_sizes=(X3.shape[1], X3.shape[1] // 2), activation='logistic', solver='sgd', learning_rate='constant', learning_rate_init=0.1, max_iter=1000)
nn_model_3.fit(X3, y3)


y_pred_nn_1 = nn_model_1.predict(X_test)
y_pred_nn_2 = nn_model_2.predict(SVM_X_test)
y_pred_nn_3 = nn_model_3.predict(RF_X_test)


# Model evaluation

## i) Model without feature selection 

In [25]:
#Models trained with data with no feature selection

print("NO FEATURE SELECTION")

svm_accuracy_1 = accuracy_score(y_test, svm_predictions_1)
print("SVM Model Accuracy:", svm_accuracy_1)

svm_test_proba_1 = svm_model_1.predict_proba(X_test)[:, 1]
svm_auc_1 = roc_auc_score(y_test, svm_test_proba_1)
print("SVM Model ROC AUC Score:", svm_auc_1)

rf_accuracy_1 = accuracy_score(y_test, rf_predictions_1)
print("Random Forest Model Accuracy:", rf_accuracy_1)

rf_test_proba_1 = rf_model_1.predict_proba(X_test)[:, 1]
rf_auc_1 = roc_auc_score(y_test, rf_test_proba_1)
print("Random Forest Model ROC AUC Score:", rf_auc_1)

nn_accuracy_1 = accuracy_score(y_test, np.round(y_pred_nn_1))
print("ANN Model Accuracy:", nn_accuracy_1)

roc_auc_nn_1 = roc_auc_score(y_test, y_pred_nn_1)
print("ANN Model ROC AUC Score:", roc_auc_nn_1)


NO FEATURE SELECTION
SVM Model Accuracy: 0.9264957264957265
SVM Model ROC AUC Score: 0.9833302160291779
Random Forest Model Accuracy: 0.9777777777777777
Random Forest Model ROC AUC Score: 0.9982874310296456
ANN Model Accuracy: 0.9145299145299145
ANN Model ROC AUC Score: 0.9138221266248948


## ii) Model with SVM-RFE feature selection 

In [26]:
#Models trained with data with SVM-RFE feature selection

print("SVM-RFE FEATURE SELECTION")

svm_accuracy_2 = accuracy_score(SVM_y_test, svm_predictions_2)
print("SVM Model Accuracy:", svm_accuracy_2)

svm_test_proba_2 = svm_model_2.predict_proba(SVM_X_test)[:, 1]
svm_auc_2 = roc_auc_score(SVM_y_test, svm_test_proba_2)
print("SVM Model ROC AUC Score:", svm_auc_2)

rf_accuracy_2 = accuracy_score(SVM_y_test, rf_predictions_2)
print("Random Forest Model Accuracy:", rf_accuracy_2)

rf_test_proba_2 = rf_model_2.predict_proba(SVM_X_test)[:, 1]
rf_auc_2 = roc_auc_score(SVM_y_test, rf_test_proba_2)
print("Random Forest Model ROC AUC Score:", rf_auc_2)

nn_accuracy_2 = accuracy_score(SVM_y_test, np.round(y_pred_nn_2))
print("ANN Model Accuracy:", nn_accuracy_2)

roc_auc_nn_2 = roc_auc_score(SVM_y_test, y_pred_nn_2)
print("ANN Model ROC AUC Score:", roc_auc_nn_2)


SVM-RFE FEATURE SELECTION
SVM Model Accuracy: 0.9316239316239316
SVM Model ROC AUC Score: 0.9903512880562061
Random Forest Model Accuracy: 0.9811965811965812
Random Forest Model ROC AUC Score: 0.9989988290398126
ANN Model Accuracy: 0.9025641025641026
ANN Model ROC AUC Score: 0.8982142857142857


## iii) Model with RF based Feature selection 

In [27]:
#Models trained with data with RF based feature selection

print("RF BASED FEATURE SELECTION")

svm_accuracy_3 = accuracy_score(RF_y_test, svm_predictions_3)
print("SVM Model Accuracy:", svm_accuracy_3)

svm_test_proba_3 = svm_model_3.predict_proba(RF_X_test)[:, 1]
svm_auc_3 = roc_auc_score(RF_y_test, svm_test_proba_3)
print("SVM Model ROC AUC Score:", svm_auc_3)

rf_accuracy_3 = accuracy_score(RF_y_test, rf_predictions_3)
print("Random Forest Model Accuracy:", rf_accuracy_3)

rf_test_proba_3 = rf_model_3.predict_proba(RF_X_test)[:, 1]
rf_auc_3 = roc_auc_score(RF_y_test, rf_test_proba_3)
print("Random Forest Model ROC AUC Score:", rf_auc_3)

nn_accuracy_3 = accuracy_score(RF_y_test, np.round(y_pred_nn_3))
print("ANN Model Accuracy:", nn_accuracy_3)

roc_auc_nn_3 = roc_auc_score(RF_y_test, y_pred_nn_3)
print("ANN Model ROC AUC Score:", roc_auc_nn_3)


RF BASED FEATURE SELECTION
SVM Model Accuracy: 0.9418803418803419
SVM Model ROC AUC Score: 0.993932379349046
Random Forest Model Accuracy: 0.9675213675213675
Random Forest Model ROC AUC Score: 0.9958964646464648
ANN Model Accuracy: 0.9145299145299145
ANN Model ROC AUC Score: 0.9131944444444444


## Additional evaluation metrics 

In [28]:
#Additional evaluation metrics defined in base paper

# Function to calculate annualized return
def annualized_return(P, n, Rf, sigma_p):
    Rp = ((1 + P) ** (250 / n) - 1) * 100
    Sharpe = (Rp - Rf) / sigma_p
    return Rp, Sharpe

# Function to calculate win rate
def win_rate(profitable_transactions, total_transactions):
    return profitable_transactions / total_transactions

# Function to calculate profit loss ratio
def profit_loss_ratio(total_profit, total_loss):
    return total_profit / total_loss

# Calculate additional metrics for each model

#NO FETURE SELECTION :-

#SVM :-
svm_ann_return_1, svm_sharpe_ratio_1 = annualized_return(svm_accuracy_1, len(y_test), 0.022, 0.15)
svm_win_rate_1 = win_rate(np.sum(y_test == 1), len(y_test))
svm_profit_loss_ratio_1 = profit_loss_ratio(np.sum(y_test == 1), np.sum(y_test == 0))

#RF :-
rf_ann_return_1, rf_sharpe_ratio_1 = annualized_return(rf_accuracy_1, len(y_test), 0.022, 0.15)
rf_win_rate_1 = win_rate(np.sum(y_test == 1), len(y_test))
rf_profit_loss_ratio_1 = profit_loss_ratio(np.sum(y_test == 1), np.sum(y_test == 0))

#ANN :-
nn_ann_return_1, nn_sharpe_ratio_1 = annualized_return(nn_accuracy_1, len(y_test), 0.022, 0.15)
nn_win_rate_1 = win_rate(np.sum(y_test == 1), len(y_test))
nn_profit_loss_ratio_1 = profit_loss_ratio(np.sum(y_test == 1), np.sum(y_test == 0))

#SVM-RFE FETURE SELECTION :-

#SVM :-
svm_ann_return_2, svm_sharpe_ratio_2 = annualized_return(svm_accuracy_2, len(SVM_y_test), 0.022, 0.15)
svm_win_rate_2 = win_rate(np.sum(SVM_y_test == 1), len(SVM_y_test))
svm_profit_loss_ratio_2 = profit_loss_ratio(np.sum(SVM_y_test == 1), np.sum(SVM_y_test == 0))

#RF :-
rf_ann_return_2, rf_sharpe_ratio_2 = annualized_return(rf_accuracy_2, len(SVM_y_test), 0.022, 0.15)
rf_win_rate_2 = win_rate(np.sum(y_test == 1), len(SVM_y_test))
rf_profit_loss_ratio_2 = profit_loss_ratio(np.sum(SVM_y_test == 1), np.sum(SVM_y_test == 0))

#ANN :-
nn_ann_return_2, nn_sharpe_ratio_2 = annualized_return(nn_accuracy_2, len(SVM_y_test), 0.022, 0.15)
nn_win_rate_2 = win_rate(np.sum(SVM_y_test == 1), len(SVM_y_test))
nn_profit_loss_ratio_2 = profit_loss_ratio(np.sum(SVM_y_test == 1), np.sum(SVM_y_test == 0))

#RF BASED FETURE SELECTION :-

#SVM :-
svm_ann_return_3, svm_sharpe_ratio_3 = annualized_return(svm_accuracy_3, len(RF_y_test), 0.022, 0.15)
svm_win_rate_3 = win_rate(np.sum(RF_y_test == 1), len(RF_y_test))
svm_profit_loss_ratio_3 = profit_loss_ratio(np.sum(RF_y_test == 1), np.sum(RF_y_test == 0))

#RF :-
rf_ann_return_3, rf_sharpe_ratio_3 = annualized_return(rf_accuracy_3, len(RF_y_test), 0.022, 0.15)
rf_win_rate_3 = win_rate(np.sum(RF_y_test == 1), len(RF_y_test))
rf_profit_loss_ratio_3 = profit_loss_ratio(np.sum(RF_y_test == 1), np.sum(RF_y_test == 0))

#ANN :-
nn_ann_return_3, nn_sharpe_ratio_3 = annualized_return(nn_accuracy_3, len(RF_y_test), 0.022, 0.15)
nn_win_rate_3 = win_rate(np.sum(RF_y_test == 1), len(RF_y_test))
nn_profit_loss_ratio_3 = profit_loss_ratio(np.sum(RF_y_test == 1), np.sum(RF_y_test == 0))

print("RESULTS :-")

print("\nANNUALIZED RETURN :-")

print("Of SVM model without feature selection = ",svm_ann_return_1)
print("Of SVM model with SVM-RFE feature selection = ",svm_ann_return_2)
print("Of SVM model with RF based feature selection = ",svm_ann_return_3)

print("\nOf RF model without feature selection = ",rf_ann_return_1)
print("Of RF model with SVM-RFE feature selection = ",rf_ann_return_2)
print("Of RF model with RF based feature selection = ",rf_ann_return_3)

print("\nOf ANN model without feature selection = ",nn_ann_return_1)
print("Of ANN model with SVM-RFE feature selection = ",nn_ann_return_2)
print("Of ANN model with RF based feature selection = ",nn_ann_return_3)

print("\nSHARPE RATIO :-")

print("Of SVM model without feature selection = ",svm_sharpe_ratio_1)
print("Of SVM model with SVM-RFE feature selection = ",svm_sharpe_ratio_2)
print("Of SVM model with RF based feature selection = ",svm_sharpe_ratio_3)

print("\nOf RF model without feature selection = ",rf_sharpe_ratio_1)
print("Of RF model with SVM-RFE feature selection = ",rf_sharpe_ratio_2)
print("Of RF model with RF based feature selection = ",rf_sharpe_ratio_3)

print("\nOf ANN model without feature selection = ",nn_sharpe_ratio_1)
print("Of ANN model with SVM-RFE feature selection = ",nn_sharpe_ratio_2)
print("Of ANN model with RF based feature selection = ",nn_sharpe_ratio_3)

print("\nWIN RATE :-")

print("Of SVM model without feature selection = ",svm_win_rate_1)
print("Of SVM model with SVM-RFE feature selection = ",svm_win_rate_2)
print("Of SVM model with RF based feature selection = ",svm_win_rate_3)

print("\nOf RF model without feature selection = ",rf_win_rate_1)
print("Of RF model with SVM-RFE feature selection = ",rf_win_rate_2)
print("Of RF model with RF based feature selection = ",rf_win_rate_3)

print("\nOf ANN model without feature selection = ",nn_win_rate_1)
print("Of ANN model with SVM-RFE feature selection = ",nn_win_rate_2)
print("Of ANN model with RF based feature selection = ",nn_win_rate_3)

print("\nPROFIT-LOSS RATIO :-")

print("Of SVM model without feature selection = ",svm_profit_loss_ratio_1)
print("Of SVM model with SVM-RFE feature selection = ",svm_profit_loss_ratio_2)
print("Of SVM model with RF based feature selection = ",svm_profit_loss_ratio_3)

print("\nOf RF model without feature selection = ",rf_profit_loss_ratio_1)
print("Of RF model with SVM-RFE feature selection = ",rf_profit_loss_ratio_2)
print("Of RF model with RF based feature selection = ",rf_profit_loss_ratio_3)

print("\nOf ANN model without feature selection = ",nn_profit_loss_ratio_1)
print("Of ANN model with SVM-RFE feature selection = ",nn_profit_loss_ratio_2)
print("Of ANN model with RF based feature selection = ",nn_profit_loss_ratio_3)


RESULTS :-

ANNUALIZED RETURN :-
Of SVM model without feature selection =  32.341407083330395
Of SVM model with SVM-RFE feature selection =  32.49184127430329
Of SVM model with RF based feature selection =  32.79202500824332

Of RF model without feature selection =  33.835577779029435
Of RF model with SVM-RFE feature selection =  33.934396117567054
Of RF model with RF based feature selection =  33.53853449080919

Of ANN model without feature selection =  31.98949984118895
Of ANN model with SVM-RFE feature selection =  31.636330834788893
Of ANN model with RF based feature selection =  31.98949984118895

SHARPE RATIO :-
Of SVM model without feature selection =  215.46271388886933
Of SVM model with SVM-RFE feature selection =  216.4656084953553
Of SVM model with RF based feature selection =  218.46683338828882

Of RF model without feature selection =  225.42385186019627
Of RF model with SVM-RFE feature selection =  226.0826407837804
Of RF model with RF based feature selection =  223.44356