In [1]:
import os
import datetime

import numpy as np
import pandas as pd
import pandas_datareader.data as web
import matplotlib.pyplot as plt
%matplotlib inline

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


from sklearn.model_selection import train_test_split
# --------scalers
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# --------cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
# -------- classification
# *** Logistic Regression
from sklearn.linear_model import LogisticRegression
# *** KNN
from sklearn.neighbors import KNeighborsClassifier
# *** Decision Tree; Random Forest
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# *** Naive Bayes
from sklearn.naive_bayes import GaussianNB
# *** SVM classifier
from sklearn.svm import SVC
# --------  metrics:
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import make_scorer


import xgboost as xgb

In [2]:
sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
sp500Symbols = sorted(list(sp500['Symbol']))

sp400 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_400_companies')[0]
sp400Symbols = sorted(list(sp400['Ticker symbol']))[359:]

sp600 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_600_companies')[1]
sp600Symbols = sorted(list(sp600['Ticker symbol']))

spSymbols = sp500Symbols + sp400Symbols + sp600Symbols

testSymbols = ['ABC']
currentDate = datetime.date.today()

הרכשת נתונים

In [3]:
def get_financial_data(symbol):
    driver = webdriver.Chrome(executable_path="../chromedriver")
    driver.implicitly_wait(10)
    url = 'https://seekingalpha.com/symbol/' + symbol + '/income-statement'
    driver.get(url)

    # Getting a list of the dates
    dates_row = driver.find_element_by_class_name('dates-row')
    dates_list = dates_row.find_elements_by_tag_name("li")
    for i in range(len(dates_list)):
        dates_list[i] = dates_list[i].get_attribute('innerHTML')
    
    elem = driver.find_element_by_id('financials-tab')
    abs_html = elem.get_attribute('innerHTML')
    
    # Changing to YoY view
    view_arrow = driver.find_elements_by_class_name('select2-selection__arrow')[1]
    view_arrow.click()
    yoy_button = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
        (By.XPATH, "//li[contains(text(),'YoY Growth')]")))
    yoy_button.click()
    
    elem = driver.find_element_by_id('financials-tab')
    yoy_html = elem.get_attribute('innerHTML')
    
    driver.quit()
    return (pd.read_html(abs_html), pd.read_html(yoy_html), dates_list)

def clean_income_statement(statement):
    if (statement[-30:] == "  Created with Highstock 6.1.4"):
        return statement[:len(statement) - 30]
    else:
        return statement

def clean_data(df):
    df.replace('-',np.nan,inplace=True)
    df.dropna(how='all', inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    df['Income Statement'] = df['Income Statement'].apply(clean_income_statement)
    df.set_index('Income Statement', inplace=True)
    df.dropna(how='all', inplace=True)
    return df[~df.index.duplicated(keep='last')]

In [None]:
for symbol in sp400Symbols:
    abs_tables, yoy_tables, dates = get_financial_data(symbol)

    # Taking care of Absolute tables
    for table in abs_tables:
        table.columns = dates
        
    abs_df = pd.concat(abs_tables)
    
    abs_df = clean_data(abs_df)
    
    abs_df.to_csv(f'Stocks_Data\sp400\{symbol}_Absolute.csv')
    
    # Taking care of YoY tables
    for table in yoy_tables:
        table.columns = dates
        
    yoy_df = pd.concat(yoy_tables)

    yoy_df = clean_data(yoy_df)
    
    yoy_df.to_csv(f'Stocks_Data\sp400\{symbol}_YoY.csv')

טיפול בנתונים

In [3]:
def agg_df():
    yoy_agg_df = pd.DataFrame()
    abs_agg_df = pd.DataFrame()
    agg_errors = []

    for folder in os.listdir('Stocks_Data'):
        folder_path = os.path.join('Stocks_Data',folder)
        for file in os.listdir(folder_path):
            symbol, view = file.split('_')
            file_path = os.path.join(f'Stocks_Data\{folder}',file)
            next_df = pd.read_csv(file_path)
            next_df.set_index('Income Statement',inplace=True)
            next_df = next_df.T
            next_df['Symbol'] = [symbol for x in next_df.index]
            if view[:3] == 'YoY':
                try:
                    yoy_agg_df = pd.concat([yoy_agg_df, next_df.iloc[:-1]])
                except:
                    agg_errors.append(file)
                    continue
            else:
                try:
                    abs_agg_df = pd.concat([abs_agg_df, next_df.iloc[:-1]])
                except:
                    agg_errors.append(file)
                    continue
    
    abs_agg_df.index.name = 'Date'
    yoy_agg_df.index.name = 'Date'
    
    return abs_agg_df, yoy_agg_df, agg_errors

def check_prices(df :pd.DataFrame):
    df[['Price Before', 'Price After','Change']] = np.nan
    df.index = pd.to_datetime(df.index)
    df.reset_index(inplace=True)
    
    prices_errors = []
    symbol = ""
    for index, row in df.iterrows():
        try:
            if symbol != row['Symbol']:
                print(row['Symbol'], end="\r")
                symbol = row['Symbol']
                start_date = row['Date']
                stock_data = web.DataReader(symbol, 'yahoo', start_date, currentDate)
                stock_data_avg = stock_data.groupby(pd.Grouper(freq='MS'))['Close'].mean()
            price_before = stock_data_avg.loc[row['Date']]
            df.loc[index, 'Price Before'] = price_before
            price_after = stock_data_avg.loc[row['Date'] + pd.DateOffset(months=1)]
            df.loc[index, 'Price After'] = price_after
            if price_after > price_before * 1.01:
                df.loc[index, 'Change'] = 1
            else:
                df.loc[index, 'Change'] = 0
        except:
            prices_errors.append(row['Symbol'])
            continue
            
    df.set_index('Date',inplace=True)
    return df, prices_errors

def cols_to_numeric(df :pd.DataFrame):
    for col in df.columns:
        if col == "Symbol":
            continue
        df.loc[:,col] = pd.to_numeric(df[col])
    return

In [None]:
abs_agg_df, yoy_agg_df, agg_errors = agg_df()
yoy_agg_df, prices_errors = check_prices(yoy_agg_df)
yoy_agg_df = yoy_agg_df[~pd.isna(yoy_agg_df['Change'])]

abs_agg_df.to_csv('Absolute_Aggregate.csv')
yoy_agg_df.to_csv('YoY_Aggregate.csv')

In [None]:
agg_errors

In [None]:
prices_errors

In [4]:
abs_agg_df = pd.read_csv('Absolute_Aggregate.csv')
abs_agg_df.set_index('Date',inplace=True)

yoy_agg_df = pd.read_csv('YoY_Aggregate.csv')
yoy_agg_df.set_index('Date',inplace=True)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [6]:
abs_agg_df_filtered = abs_agg_df.dropna(axis = 1,thresh=2250)

In [7]:
# Dealing with minus values, cutting off the bracekts and adding "-" sign
abs_agg_df_filtered.replace(r'\(\$?(\d*,?\d*\.?\d*)\)', r'-\1',regex=True, inplace=True)
# Cutting off '$', '%' and ',' signs
abs_agg_df_filtered.replace(r'\$|\%|\,', '',regex=True, inplace=True)
# Changing "NM" to "0"
abs_agg_df_filtered.replace(r'^NM$', '0',regex=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [8]:
cols_to_numeric(abs_agg_df_filtered)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [9]:
abs_agg_df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4511 entries, Dec 2016 to Dec 2020
Data columns (total 38 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Total Revenues                          4509 non-null   float64
 1   Total Operating Expenses                4221 non-null   float64
 2   Operating Income                        4226 non-null   float64
 3   Net Interest Expenses                   3936 non-null   float64
 4   EBT, Excl. Unusual Items                4511 non-null   float64
 5   EBT, Incl. Unusual Items                4511 non-null   float64
 6   Income Tax Expense                      4412 non-null   float64
 7   Earnings From Continuing Operations     4511 non-null   float64
 8   Net Income to Company                   4511 non-null   float64
 9   Net Income                              4511 non-null   float64
 10  NI to Common Incl Extra Items           4226 non-null 

In [93]:
yoy_agg_df_filtered = yoy_agg_df.dropna(axis = 1,thresh=3200)

In [94]:
yoy_agg_df_filtered.replace(r'\$|\%|\,', '',regex=True, inplace=True)
yoy_agg_df_filtered.replace(r'\(\$?(\d*,?\d*\.?\d*)\)', r'-\1',regex=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [95]:
cols_to_numeric(yoy_agg_df_filtered)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [96]:
yoy_agg_df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3568 entries, 2017-12-01 to 2020-12-01
Data columns (total 16 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Total Revenues                          3565 non-null   float64
 1   Total Operating Expenses                3335 non-null   float64
 2   Operating Income                        3202 non-null   float64
 3   EBT, Excl. Unusual Items                3377 non-null   float64
 4   EBT, Incl. Unusual Items                3201 non-null   float64
 5   Revenue Per Share                       3563 non-null   float64
 6   Basic Weighted Average Shares Outst.    3566 non-null   float64
 7   Diluted Weighted Average Shares Outst.  3566 non-null   float64
 8   Normalized Basic EPS                    3368 non-null   float64
 9   Normalized Diluted EPS                  3368 non-null   float64
 10  EBITDA                                  3223 non-n

In [97]:
final_yoy_agg_df = yoy_agg_df_filtered.dropna()

In [98]:
final_yoy_agg_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2858 entries, 2017-12-01 to 2020-12-01
Data columns (total 16 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Total Revenues                          2858 non-null   float64
 1   Total Operating Expenses                2858 non-null   float64
 2   Operating Income                        2858 non-null   float64
 3   EBT, Excl. Unusual Items                2858 non-null   float64
 4   EBT, Incl. Unusual Items                2858 non-null   float64
 5   Revenue Per Share                       2858 non-null   float64
 6   Basic Weighted Average Shares Outst.    2858 non-null   float64
 7   Diluted Weighted Average Shares Outst.  2858 non-null   float64
 8   Normalized Basic EPS                    2858 non-null   float64
 9   Normalized Diluted EPS                  2858 non-null   float64
 10  EBITDA                                  2858 non-n

# Machine Learning

In [99]:
def load_dataset(df :pd.DataFrame, label_column :str, non_relevant_cols :list):
    TRAINING_FEATURES = df.columns[df.columns != label_column]
    TARGET_FEATURE = label_column

    X = df[TRAINING_FEATURES]
    X.drop(non_relevant_cols, axis=1, inplace=True)
    y = df[TARGET_FEATURE]
    return X, y

In [100]:
X, y = load_dataset(final_yoy_agg_df, 'Change', ['Symbol', 'Price Before', 'Price After'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


## Random Split

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

In [102]:
X_train

Unnamed: 0_level_0,Total Revenues,Total Operating Expenses,Operating Income,"EBT, Excl. Unusual Items","EBT, Incl. Unusual Items",Revenue Per Share,Basic Weighted Average Shares Outst.,Diluted Weighted Average Shares Outst.,Normalized Basic EPS,Normalized Diluted EPS,EBITDA,Normalized Net Income
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-12-01,14.47,12.96,23.49,-10.56,-23.60,13.68,0.73,0.96,-13.81,-14.78,25.72,-13.51
2019-02-01,2.02,-25.50,-6.26,-4.82,23.36,12.25,-9.12,-9.23,4.78,4.87,-4.60,-4.82
2017-06-01,6.04,2.02,11.52,11.07,17.10,8.22,-2.01,-1.92,13.36,13.10,11.30,11.07
2017-12-01,5.44,-2.37,15.22,7.00,15.22,8.43,-2.76,-2.73,10.08,9.91,14.73,7.00
2018-12-01,3.22,-9.83,7.63,8.65,8.94,5.42,-2.11,-2.20,10.41,10.55,8.05,8.06
...,...,...,...,...,...,...,...,...,...,...,...,...
2018-03-01,-0.44,15.63,-19.71,-18.40,-15.85,-0.56,0.12,-0.92,-18.60,-17.63,-11.90,-18.40
2019-12-01,-21.24,-41.24,-11.07,6.19,-48.09,-19.44,-2.22,-2.22,9.01,9.01,-2.64,6.19
2021-01-01,-13.41,-13.69,-86.92,-87.47,-98.53,-11.61,-2.02,-2.05,-87.22,-87.23,-63.37,-87.47
2019-12-01,8.21,11.06,-5.05,-2.86,0.19,16.79,-7.34,-7.73,4.44,5.16,0.06,-2.86


## Split by Date

In [103]:
new_X_train = X[X.index <= '2019-12-31']
new_X_test = X[X.index > '2019-12-31']
new_y_train = y[y.index <= '2019-12-31']
new_y_test = y[y.index > '2019-12-31']

In [104]:
new_X_train

Unnamed: 0_level_0,Total Revenues,Total Operating Expenses,Operating Income,"EBT, Excl. Unusual Items","EBT, Incl. Unusual Items",Revenue Per Share,Basic Weighted Average Shares Outst.,Diluted Weighted Average Shares Outst.,Normalized Basic EPS,Normalized Diluted EPS,EBITDA,Normalized Net Income
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-12-01,1.28,4.23,-8.80,-6.85,-30.13,-3.23,4.58,4.60,-9.67,-10.25,1.27,-6.23
2018-12-01,10.59,12.06,4.87,-22.06,70.94,9.30,1.24,1.26,-25.90,-24.59,9.10,-24.17
2019-12-01,7.07,4.98,15.78,18.46,-27.38,6.66,0.35,0.41,19.94,19.08,8.93,19.89
2017-12-01,0.90,5.85,-1.63,-0.93,618.55,-0.55,1.46,1.27,-3.32,-2.95,0.11,-1.95
2018-12-01,-32.85,-37.37,-30.22,-53.79,-71.54,-33.11,0.39,0.41,-54.32,-54.14,-33.57,-54.02
...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-01,13.33,1.29,65.46,148.64,495.45,12.12,1.07,1.14,146.30,145.94,25.50,148.64
2019-12-01,6.33,2.76,12.77,14.44,14.12,5.54,0.75,0.54,13.58,13.75,10.04,14.44
2017-12-01,8.57,1.18,19.40,21.80,24.19,9.86,-1.17,-1.02,23.13,23.07,15.91,21.75
2018-12-01,9.76,12.96,8.18,8.92,10.82,11.34,-1.40,-1.27,10.48,10.72,10.63,9.10


### Scalers

In [105]:
def scale_features(X_train, X_test, scale_type):
    X_train_scaled = pd.DataFrame()
    if scale_type == 'minmax':
        scaler = MinMaxScaler(feature_range=(-1, 1))
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.fit_transform(X_test)
    elif scale_type == 'standard':
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.fit_transform(X_test)
    return X_train_scaled, X_test_scaled

In [106]:
# Random split - scaling
X_train_minmax_scaled, X_test_minmax_scaled = scale_features(X_train, X_test, 'minmax')
X_train_standard_scaled, X_test_standard_scaled = scale_features(X_train, X_test, 'standard')

# Split by date - scaling
new_X_train_minmax_scaled, new_X_test_minmax_scaled = scale_features(new_X_train, new_X_test, 'minmax')
new_X_train_standard_scaled, new_X_test_standard_scaled = scale_features(new_X_train, new_X_test, 'standard')

## Classification Models

### Logistic Regression

#### without scaling

In [107]:
# Random split - test
LR_classification_model = LogisticRegression().fit(X_train, y_train)
y_pred = LR_classification_model.predict(X_test)

# Split by date - test
LR_classification_model = LogisticRegression().fit(new_X_train, new_y_train)
new_y_pred = LR_classification_model.predict(new_X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [108]:
# Random split - result
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

accuracy is: 0.6136363636363636
precision is: 0.6223021582733813
recall is: 0.969187675070028
f1 is: 0.7579408543263966
[[  5 210]
 [ 11 346]]


In [109]:
# Split by date - result
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

accuracy is: 0.5345821325648416
precision is: 0.582010582010582
recall is: 0.7932692307692307
f1 is: 0.6714140386571719
[[ 41 237]
 [ 86 330]]


#### mimax scaling

In [110]:
# Random split - test
LR_classification_minmax_model = LogisticRegression().fit(X_train_minmax_scaled, y_train)
y_pred = LR_classification_minmax_model.predict(X_test_minmax_scaled)

# Split by date - test
LR_classification_minmax_model = LogisticRegression().fit(new_X_train_minmax_scaled, new_y_train)
new_y_pred = LR_classification_minmax_model.predict(new_X_test_minmax_scaled)

In [111]:
# Random split - result
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

accuracy is: 0.6188811188811189
precision is: 0.6221441124780316
recall is: 0.9915966386554622
f1 is: 0.7645788336933045
[[  0 215]
 [  3 354]]


In [112]:
# Split by date - result
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

accuracy is: 0.590778097982709
precision is: 0.5973451327433629
recall is: 0.9735576923076923
f1 is: 0.7404021937842779
[[  5 273]
 [ 11 405]]


#### standard scaling

In [113]:
# Random split - test
LR_classification_standard_model = LogisticRegression().fit(X_train_standard_scaled, y_train)
y_pred = LR_classification_standard_model.predict(X_test_standard_scaled)

# Split by date - test
LR_classification_standard_model = LogisticRegression().fit(new_X_train_standard_scaled, new_y_train)
new_y_pred = LR_classification_standard_model.predict(new_X_test_standard_scaled)

In [114]:
# Random split - result
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

accuracy is: 0.5979020979020979
precision is: 0.6169429097605893
recall is: 0.938375350140056
f1 is: 0.7444444444444445
[[  7 208]
 [ 22 335]]


In [115]:
# Split by date - result
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

accuracy is: 0.5677233429394812
precision is: 0.5914826498422713
recall is: 0.9014423076923077
f1 is: 0.7142857142857142
[[ 19 259]
 [ 41 375]]


### K-Nearest Neighbors

In [116]:
def find_best_k_for_KNN(X_train, y_train):
    parameters = {'n_neighbors':range(3,16,2)}
    knn = KNeighborsClassifier()
    clf = GridSearchCV(knn, parameters,scoring=make_scorer(metrics.precision_score))
    clf.fit(X_train, y_train)
    
    best_K = clf.best_params_['n_neighbors']
    
    return clf, best_K

#### without scaling

In [117]:
# Random split - test
KNN_classification_model, best_K = find_best_k_for_KNN(X_train, y_train)
y_pred = KNN_classification_model.predict(X_test)

# Split by date - test
KNN_classification_model, new_best_K = find_best_k_for_KNN(new_X_train, new_y_train)
new_y_pred = KNN_classification_model.predict(new_X_test)

In [118]:
# Random split - result
print("best K is:",best_K)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best K is: 9
accuracy is: 0.5716783216783217
precision is: 0.6465968586387435
recall is: 0.6918767507002801
f1 is: 0.6684709066305818
[[ 80 135]
 [110 247]]


In [119]:
# Split by date - result
print("best K is:",new_best_K)
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

best K is: 11
accuracy is: 0.537463976945245
precision is: 0.6004228329809725
recall is: 0.6826923076923077
f1 is: 0.6389201349831273
[[ 89 189]
 [132 284]]


#### mimax scaling

In [120]:
# Random split - test
KNN_classification_minmax_model, best_K = find_best_k_for_KNN(X_train_minmax_scaled, y_train)
y_pred = KNN_classification_minmax_model.predict(X_test_minmax_scaled)

# Split by date - test
KNN_classification_model, new_best_K = find_best_k_for_KNN(new_X_train_minmax_scaled, new_y_train)
new_y_pred = KNN_classification_model.predict(new_X_test_minmax_scaled)

In [121]:
# Random split - result
print("best K is:",best_K)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best K is: 13
accuracy is: 0.6171328671328671
precision is: 0.6214788732394366
recall is: 0.988795518207283
f1 is: 0.7632432432432433
[[  0 215]
 [  4 353]]


In [122]:
# Split by date - result
print("best K is:",new_best_K)
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

best K is: 13
accuracy is: 0.5951008645533141
precision is: 0.5982532751091703
recall is: 0.9879807692307693
f1 is: 0.7452402538531279
[[  2 276]
 [  5 411]]


#### standard scaling

In [123]:
# Random split - test
KNN_classification_standard_model, best_K = find_best_k_for_KNN(X_train_standard_scaled, y_train)
y_pred = KNN_classification_standard_model.predict(X_test_standard_scaled)

# Split by date - test
KNN_classification_model, new_best_K = find_best_k_for_KNN(new_X_train_standard_scaled, new_y_train)
new_y_pred = KNN_classification_model.predict(new_X_test_standard_scaled)

In [124]:
# Random split - result
print("best K is:",best_K)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best K is: 15
accuracy is: 0.6171328671328671
precision is: 0.6468085106382979
recall is: 0.8515406162464986
f1 is: 0.7351874244256349
[[ 49 166]
 [ 53 304]]


In [125]:
# Split by date - result
print("best K is:",new_best_K)
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

best K is: 9
accuracy is: 0.5590778097982709
precision is: 0.6041666666666666
recall is: 0.7668269230769231
f1 is: 0.6758474576271186
[[ 69 209]
 [ 97 319]]


### Decision Tree

In [126]:
def find_best_decision_tree_params(X_train, y_train):
    parameters = {'max_depth':range(2,11), 'min_samples_split':range(5,21)}
    dt = DecisionTreeClassifier()
    clf = GridSearchCV(dt, parameters,scoring=make_scorer(metrics.precision_score))
    clf.fit(X_train, y_train)
    
    best_max_depth = clf.best_params_['max_depth']
    best_min_samples_split = clf.best_params_['min_samples_split']
    
    return clf, best_max_depth, best_min_samples_split

#### without scaling

In [127]:
# Random split - test
DT_classification_model, best_max_depth, best_min_samples_split = find_best_decision_tree_params(X_train, y_train)
y_pred = DT_classification_model.predict(X_test)

# Split by date - test
DT_classification_model, new_best_max_depth, new_best_min_samples_split = find_best_decision_tree_params(new_X_train, new_y_train)
new_y_pred = DT_classification_model.predict(new_X_test)

In [128]:
# Random split - result
print("best max depth is:",best_max_depth)
print("best min samples split is:",best_min_samples_split)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best max depth is: 8
best min samples split is: 14
accuracy is: 0.5821678321678322
precision is: 0.6520618556701031
recall is: 0.7086834733893558
f1 is: 0.6791946308724832
[[ 80 135]
 [104 253]]


In [129]:
# Split by date - result
print("best max depth is:",new_best_max_depth)
print("best min samples split is:",new_best_min_samples_split)
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

best max depth is: 5
best min samples split is: 15
accuracy is: 0.5244956772334294
precision is: 0.583984375
recall is: 0.71875
f1 is: 0.6443965517241379
[[ 65 213]
 [117 299]]


#### mimax scaling

In [130]:
# Random split - test
DT_classification_minmax_model, best_max_depth, best_min_samples_split = find_best_decision_tree_params(X_train_minmax_scaled, y_train)
y_pred = DT_classification_minmax_model.predict(X_test_minmax_scaled)

# Split by date - test
DT_classification_minmax_model, new_best_max_depth, best_min_samples_split = find_best_decision_tree_params(new_X_train_minmax_scaled, new_y_train)
new_y_pred = DT_classification_minmax_model.predict(new_X_test_minmax_scaled)

In [131]:
# Random split - result
print("best max depth is:",best_max_depth)
print("best min samples split is:",best_min_samples_split)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best max depth is: 8
best min samples split is: 14
accuracy is: 0.6223776223776224
precision is: 0.6239015817223199
recall is: 0.9943977591036415
f1 is: 0.7667386609071275
[[  1 214]
 [  2 355]]


In [132]:
# Split by date - result
print("best max depth is:",new_best_max_depth)
print("best min samples split is:",new_best_min_samples_split)
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

best max depth is: 5
best min samples split is: 15
accuracy is: 0.4020172910662824
precision is: 0.6666666666666666
recall is: 0.004807692307692308
f1 is: 0.00954653937947494
[[277   1]
 [414   2]]


#### standard scaling

In [133]:
# Random split - test
DT_classification_standard_model, best_max_depth, best_min_samples_split = find_best_decision_tree_params(X_train_standard_scaled, y_train)
y_pred = DT_classification_standard_model.predict(X_test_standard_scaled)

# Split by date - test
DT_classification_standard_model, new_best_max_depth, best_min_samples_split = find_best_decision_tree_params(new_X_train_standard_scaled, new_y_train)
new_y_pred = DT_classification_standard_model.predict(new_X_test_standard_scaled)

In [134]:
# Random split - result
print("best max depth is:",best_max_depth)
print("best min samples split is:",best_min_samples_split)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best max depth is: 8
best min samples split is: 14
accuracy is: 0.5944055944055944
precision is: 0.6361655773420479
recall is: 0.8179271708683473
f1 is: 0.715686274509804
[[ 48 167]
 [ 65 292]]


In [135]:
# Split by date - result
print("best max depth is:",new_best_max_depth)
print("best min samples split is:",new_best_min_samples_split)
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

best max depth is: 5
best min samples split is: 15
accuracy is: 0.4899135446685879
precision is: 0.5654008438818565
recall is: 0.6442307692307693
f1 is: 0.6022471910112359
[[ 72 206]
 [148 268]]


### Random Forest

In [136]:
def find_best_random_forest_params(X_train, y_train):
    parameters = {'n_estimators':range(50,551,100)}
    rf = RandomForestClassifier()
    clf = GridSearchCV(rf, parameters,scoring=make_scorer(metrics.precision_score))
    clf.fit(X_train, y_train)
    
    best_n_estimators = clf.best_params_['n_estimators']
    
    return clf, best_n_estimators

#### without scaling

In [137]:
# Random split - test
RF_classification_model, best_n_estimators = find_best_random_forest_params(X_train, y_train)
y_pred = RF_classification_model.predict(X_test)

# Split by date - test
RF_classification_model, new_best_n_estimators = find_best_random_forest_params(new_X_train, new_y_train)
new_y_pred = RF_classification_model.predict(new_X_test)

In [138]:
# Random split - result
print("best number of estimators is:",best_n_estimators)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best number of estimators is: 50
accuracy is: 0.6031468531468531
precision is: 0.6600985221674877
recall is: 0.7507002801120448
f1 is: 0.7024901703800787
[[ 77 138]
 [ 89 268]]


In [139]:
# Split by date - result
print("best number of estimators is:",new_best_n_estimators)
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

best number of estimators is: 150
accuracy is: 0.5461095100864554
precision is: 0.6124721603563474
recall is: 0.6610576923076923
f1 is: 0.6358381502890172
[[104 174]
 [141 275]]


#### mimax scaling

In [140]:
# Random split - test
RF_classification_minmax_model, best_n_estimators = find_best_random_forest_params(X_train_minmax_scaled, y_train)
y_pred = RF_classification_minmax_model.predict(X_test_minmax_scaled)

# Split by date - test
RF_classification_minmax_model, new_best_n_estimators = find_best_random_forest_params(new_X_train_minmax_scaled, new_y_train)
new_y_pred = RF_classification_minmax_model.predict(new_X_test_minmax_scaled)

In [141]:
# Random split - result
print("best number of estimators is:",best_n_estimators)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best number of estimators is: 50
accuracy is: 0.5891608391608392
precision is: 0.6173076923076923
recall is: 0.8991596638655462
f1 is: 0.7320410490307868
[[ 16 199]
 [ 36 321]]


In [142]:
# Split by date - result
print("best number of estimators is:",new_best_n_estimators)
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

best number of estimators is: 50
accuracy is: 0.40057636887608067
precision is: 0.5
recall is: 0.002403846153846154
f1 is: 0.004784688995215312
[[277   1]
 [415   1]]


#### standard scaling

In [143]:
# Random split - test
RF_classification_standard_model, best_n_estimators = find_best_random_forest_params(X_train_standard_scaled, y_train)
y_pred = RF_classification_standard_model.predict(X_test_standard_scaled)

# Split by date - test
RF_classification_standard_model, new_best_n_estimators = find_best_random_forest_params(new_X_train_standard_scaled, new_y_train)
new_y_pred = RF_classification_standard_model.predict(new_X_test_standard_scaled)

In [144]:
# Random split - result
print("best number of estimators is:",best_n_estimators)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best number of estimators is: 350
accuracy is: 0.6188811188811189
precision is: 0.6349514563106796
recall is: 0.9159663865546218
f1 is: 0.75
[[ 27 188]
 [ 30 327]]


In [145]:
# Split by date - result
print("best number of estimators is:",new_best_n_estimators)
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

best number of estimators is: 250
accuracy is: 0.5432276657060519
precision is: 0.6078431372549019
recall is: 0.6706730769230769
f1 is: 0.6377142857142857
[[ 98 180]
 [137 279]]


### Naive Bayes

#### without scaling

In [146]:
# Random split - test
NB_classification_model = GaussianNB().fit(X_train, y_train)
y_pred = NB_classification_model.predict(X_test)

# Split by date - test
NB_classification_model = GaussianNB().fit(new_X_train, new_y_train)
new_y_pred = NB_classification_model.predict(new_X_test)

In [147]:
# Random split - result
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

accuracy is: 0.6136363636363636
precision is: 0.6273408239700374
recall is: 0.938375350140056
f1 is: 0.7519640852974185
[[ 16 199]
 [ 22 335]]


In [148]:
# Split by date - result
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

accuracy is: 0.5951008645533141
precision is: 0.5988286969253295
recall is: 0.9831730769230769
f1 is: 0.7443130118289354
[[  4 274]
 [  7 409]]


#### mimax scaling

In [149]:
# Random split - test
NB_classification_minmax_model = GaussianNB().fit(X_train_minmax_scaled, y_train)
y_pred = NB_classification_minmax_model.predict(X_test_minmax_scaled)

# Split by date - test
NB_classification_minmax_model = GaussianNB().fit(new_X_train_minmax_scaled, new_y_train)
new_y_pred = NB_classification_minmax_model.predict(new_X_test_minmax_scaled)

In [150]:
# Random split - result
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

accuracy is: 0.3758741258741259
precision is: 0.5
recall is: 0.0028011204481792717
f1 is: 0.005571030640668524
[[214   1]
 [356   1]]


In [151]:
# Split by date - result
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

accuracy is: 0.40057636887608067
precision is: 0.0
recall is: 0.0
f1 is: 0.0
[[278   0]
 [416   0]]


  _warn_prf(average, modifier, msg_start, len(result))


#### standard scaling

In [152]:
# Random split - test
NB_classification_standard_model = GaussianNB().fit(X_train_standard_scaled, y_train)
y_pred = NB_classification_standard_model.predict(X_test_standard_scaled)

# Split by date - test
NB_classification_standard_model = GaussianNB().fit(new_X_train_standard_scaled, new_y_train)
new_y_pred = NB_classification_standard_model.predict(new_X_test_standard_scaled)

In [153]:
# Random split - result
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

accuracy is: 0.40384615384615385
precision is: 0.6428571428571429
recall is: 0.10084033613445378
f1 is: 0.17433414043583537
[[195  20]
 [321  36]]


In [154]:
# Split by date - result
print("accuracy is:",metrics.accuracy_score(new_y_test, new_y_pred))
print("precision is:",metrics.precision_score(new_y_test, new_y_pred))
print("recall is:",metrics.recall_score(new_y_test, new_y_pred))
print("f1 is:",metrics.f1_score(new_y_test, new_y_pred))
print(metrics.confusion_matrix(new_y_test, new_y_pred))

accuracy is: 0.38760806916426516
precision is: 0.47593582887700536
recall is: 0.21394230769230768
f1 is: 0.29519071310116085
[[180  98]
 [327  89]]


In [None]:
xg_reg = xgb.XGBClassifier(verbosity = 0, booster = 'gbtree', objective = 'binary:logistic', min_child_weight = 2,
                           max_delta_step = 2 ,learning_rate = 0.9, n_estimators = 8, tree_method = 'hist',
                           max_depth = 7)
# dart
# gbtree
# gblinear

xg_reg.fit(X_train,y_train)

y_pred = xg_reg.predict(X_test)

print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
metrics.confusion_matrix(y_test, y_pred)

In [None]:
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
metrics.confusion_matrix(y_test, y_pred)

# xg_reg = xgb.XGBClassifier(verbosity = 0, booster = 'gbtree', objective = 'binary:logistic', min_child_weight = 1,
#                            max_delta_step = 0.4 ,learning_rate = 0.9, n_estimators = 9, tree_method = 'hist',
#                            max_depth = 6)
# accuracy is: 0.585
# precision is: 0.6745098039215687
# recall is: 0.6745098039215687
# f1 is: 0.6745098039215687
# array([[ 62,  83],
#        [ 83, 172]], dtype=int64)

In [None]:
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
metrics.confusion_matrix(y_test, y_pred)

# xg_reg = xgb.XGBClassifier(objective ='reg:pseudohubererror',min_child_weight = 1.1, max_delta_step = 1,
#                            n_estimators = 9)
# accuracy is: 0.6225
# precision is: 0.6573426573426573
# recall is: 0.7800829875518672
# f1 is: 0.713472485768501
# array([[ 61,  98],
#        [ 53, 188]], dtype=int64)

# xg_reg = xgb.XGBClassifier(objective ='reg:logistic',min_child_weight = 1.9, max_delta_step = 0.4, learning_rate = 0.9,
#                 n_estimators = 15, tree_method = 'hist', max_depth = 10)
# accuracy is: 0.625
# precision is: 0.6666666666666666
# recall is: 0.7551867219917012
# f1 is: 0.708171206225681
# array([[ 68,  91],
#        [ 59, 182]], dtype=int64)