In [1]:
import os
import datetime

import numpy as np
import pandas as pd
import pandas_datareader.data as web
import matplotlib.pyplot as plt
%matplotlib inline

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


from sklearn.model_selection import train_test_split
# --------scalers
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# --------cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
# -------- classification
# *** Logistic Regression
from sklearn.linear_model import LogisticRegression
# *** KNN
from sklearn.neighbors import KNeighborsClassifier
# *** Decision Tree; Random Forest
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# *** Naive Bayes
from sklearn.naive_bayes import GaussianNB
# *** SVM classifier
from sklearn.svm import SVC
# --------  metrics:
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import make_scorer


import xgboost as xgb

In [2]:
sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
sp500Symbols = sorted(list(sp500['Symbol']))

sp400 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_400_companies')[0]
sp400Symbols = sorted(list(sp400['Ticker symbol']))[359:]

sp600 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_600_companies')[1]
sp600Symbols = sorted(list(sp600['Ticker symbol']))

spSymbols = sp500Symbols + sp400Symbols + sp600Symbols

testSymbols = ['ABC']
currentDate = datetime.date.today()

הרכשת נתונים

In [3]:
def get_financial_data(symbol):
    driver = webdriver.Chrome(executable_path="../chromedriver")
    driver.implicitly_wait(10)
    url = 'https://seekingalpha.com/symbol/' + symbol + '/income-statement'
    driver.get(url)

    # Getting a list of the dates
    dates_row = driver.find_element_by_class_name('dates-row')
    dates_list = dates_row.find_elements_by_tag_name("li")
    for i in range(len(dates_list)):
        dates_list[i] = dates_list[i].get_attribute('innerHTML')
    
    elem = driver.find_element_by_id('financials-tab')
    abs_html = elem.get_attribute('innerHTML')
    
    # Changing to YoY view
    view_arrow = driver.find_elements_by_class_name('select2-selection__arrow')[1]
    view_arrow.click()
    yoy_button = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
        (By.XPATH, "//li[contains(text(),'YoY Growth')]")))
    yoy_button.click()
    
    elem = driver.find_element_by_id('financials-tab')
    yoy_html = elem.get_attribute('innerHTML')
    
    driver.quit()
    return (pd.read_html(abs_html), pd.read_html(yoy_html), dates_list)

def clean_income_statement(statement):
    if (statement[-30:] == "  Created with Highstock 6.1.4"):
        return statement[:len(statement) - 30]
    else:
        return statement

def clean_data(df):
    df.replace('-',np.nan,inplace=True)
    df.dropna(how='all', inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    df['Income Statement'] = df['Income Statement'].apply(clean_income_statement)
    df.set_index('Income Statement', inplace=True)
    df.dropna(how='all', inplace=True)
    return df[~df.index.duplicated(keep='last')]

In [None]:
for symbol in sp400Symbols:
    abs_tables, yoy_tables, dates = get_financial_data(symbol)

    # Taking care of Absolute tables
    for table in abs_tables:
        table.columns = dates
        
    abs_df = pd.concat(abs_tables)
    
    abs_df = clean_data(abs_df)
    
    abs_df.to_csv(f'Stocks_Data\sp400\{symbol}_Absolute.csv')
    
    # Taking care of YoY tables
    for table in yoy_tables:
        table.columns = dates
        
    yoy_df = pd.concat(yoy_tables)

    yoy_df = clean_data(yoy_df)
    
    yoy_df.to_csv(f'Stocks_Data\sp400\{symbol}_YoY.csv')

טיפול בנתונים

In [4]:
def agg_df():
    yoy_agg_df = pd.DataFrame()
    abs_agg_df = pd.DataFrame()
    agg_errors = []

    for folder in os.listdir('Stocks_Data'):
        folder_path = os.path.join('Stocks_Data',folder)
        for file in os.listdir(folder_path):
            symbol, view = file.split('_')
            file_path = os.path.join(f'Stocks_Data\{folder}',file)
            next_df = pd.read_csv(file_path)
            next_df.set_index('Income Statement',inplace=True)
            next_df = next_df.T
            next_df['Symbol'] = [symbol for x in next_df.index]
            if view[:3] == 'YoY':
                try:
                    yoy_agg_df = pd.concat([yoy_agg_df, next_df.iloc[:-1]])
                except:
                    agg_errors.append(file)
                    continue
            else:
                try:
                    abs_agg_df = pd.concat([abs_agg_df, next_df.iloc[:-1]])
                except:
                    agg_errors.append(file)
                    continue
    
    abs_agg_df.index.name = 'Date'
    yoy_agg_df.index.name = 'Date'
    
    return abs_agg_df, yoy_agg_df, agg_errors

def check_prices(df :pd.DataFrame):
    df[['Price Before', 'Price After','Change']] = np.nan
    df.index = pd.to_datetime(df.index)
    df.reset_index(inplace=True)
    
    prices_errors = []
    symbol = ""
    for index, row in df.iterrows():
        try:
            if symbol != row['Symbol']:
                print(row['Symbol'], end="\r")
                symbol = row['Symbol']
                start_date = row['Date']
                stock_data = web.DataReader(symbol, 'yahoo', start_date, currentDate)
                stock_data_avg = stock_data.groupby(pd.Grouper(freq='MS'))['Close'].mean()
            price_before = stock_data_avg.loc[row['Date']]
            df.loc[index, 'Price Before'] = price_before
            price_after = stock_data_avg.loc[row['Date'] + pd.DateOffset(months=1)]
            df.loc[index, 'Price After'] = price_after
            if price_after > price_before * 1.01:
                df.loc[index, 'Change'] = 1
            else:
                df.loc[index, 'Change'] = 0
        except:
            prices_errors.append(row['Symbol'])
            continue
            
    df.set_index('Date',inplace=True)
    return df, prices_errors

def cols_to_numeric(df :pd.DataFrame):
    for col in df.columns:
        if col == "Symbol":
            continue
        df.loc[:,col] = pd.to_numeric(df[col])
    return

In [None]:
abs_agg_df, yoy_agg_df, agg_errors = agg_df()
yoy_agg_df, prices_errors = check_prices(yoy_agg_df)
yoy_agg_df = yoy_agg_df[~pd.isna(yoy_agg_df['Change'])]

abs_agg_df.to_csv('Absolute_Aggregate.csv')
yoy_agg_df.to_csv('YoY_Aggregate.csv')

In [None]:
agg_errors

In [None]:
prices_errors

In [5]:
abs_agg_df = pd.read_csv('Absolute_Aggregate.csv')
abs_agg_df.set_index('Date',inplace=True)

yoy_agg_df = pd.read_csv('YoY_Aggregate.csv')
yoy_agg_df.set_index('Date',inplace=True)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [6]:
abs_agg_df_filtered = abs_agg_df.dropna(axis = 1,thresh=2250)

In [7]:
# Dealing with minus values, cutting off the bracekts and adding "-" sign
abs_agg_df_filtered.replace(r'\(\$?(\d*,?\d*\.?\d*)\)', r'-\1',regex=True, inplace=True)
# Cutting off '$', '%' and ',' signs
abs_agg_df_filtered.replace(r'\$|\%|\,', '',regex=True, inplace=True)
# Changing "NM" to "0"
abs_agg_df_filtered.replace(r'^NM$', '0',regex=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [8]:
cols_to_numeric(abs_agg_df_filtered)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [9]:
abs_agg_df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4511 entries, Dec 2016 to Dec 2020
Data columns (total 38 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Total Revenues                          4509 non-null   float64
 1   Total Operating Expenses                4221 non-null   float64
 2   Operating Income                        4226 non-null   float64
 3   Net Interest Expenses                   3936 non-null   float64
 4   EBT, Excl. Unusual Items                4511 non-null   float64
 5   EBT, Incl. Unusual Items                4511 non-null   float64
 6   Income Tax Expense                      4412 non-null   float64
 7   Earnings From Continuing Operations     4511 non-null   float64
 8   Net Income to Company                   4511 non-null   float64
 9   Net Income                              4511 non-null   float64
 10  NI to Common Incl Extra Items           4226 non-null 

In [10]:
yoy_agg_df_filtered = yoy_agg_df.dropna(axis = 1,thresh=3100)

In [11]:
yoy_agg_df_filtered.replace(r'\$|\%|\,', '',regex=True, inplace=True)
yoy_agg_df_filtered.replace(r'\(\$?(\d*,?\d*\.?\d*)\)', r'-\1',regex=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [12]:
cols_to_numeric(yoy_agg_df_filtered)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [13]:
yoy_agg_df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3568 entries, 2017-12-01 to 2020-12-01
Data columns (total 25 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Total Revenues                          3565 non-null   float64
 1   Total Operating Expenses                3335 non-null   float64
 2   Operating Income                        3202 non-null   float64
 3   EBT, Excl. Unusual Items                3377 non-null   float64
 4   EBT, Incl. Unusual Items                3201 non-null   float64
 5   Earnings From Continuing Operations     3174 non-null   float64
 6   Net Income to Company                   3156 non-null   float64
 7   Net Income                              3148 non-null   float64
 8   Revenue Per Share                       3563 non-null   float64
 9   Basic EPS                               3146 non-null   float64
 10  Basic EPS - Continuing Ops              3159 non-n

In [14]:
yoy_agg_test = yoy_agg_df_filtered.dropna()

In [15]:
yoy_agg_test

Unnamed: 0_level_0,Total Revenues,Total Operating Expenses,Operating Income,"EBT, Excl. Unusual Items","EBT, Incl. Unusual Items",Earnings From Continuing Operations,Net Income to Company,Net Income,Revenue Per Share,Basic EPS,...,Normalized Basic EPS,Normalized Diluted EPS,EBITDA,EBITA,EBIT,Normalized Net Income,Symbol,Price Before,Price After,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-12-01,1.28,4.23,-8.80,-6.85,-30.13,-30.31,-30.31,-30.31,-3.23,-34.28,...,-9.67,-10.25,1.27,-8.89,-8.80,-6.23,ACC,42.088500,39.309524,0.0
2018-12-01,10.59,12.06,4.87,-22.06,70.94,69.88,69.88,69.61,9.30,68.95,...,-25.90,-24.59,9.10,5.02,4.87,-24.17,ACC,42.463684,43.227618,1.0
2019-12-01,7.07,4.98,15.78,18.46,-27.38,-27.17,-27.17,-27.44,6.66,-27.97,...,19.94,19.08,8.93,15.33,15.78,19.89,ACC,46.831904,46.125238,0.0
2020-12-01,-7.68,-2.04,-29.03,-69.96,-19.34,-19.49,-19.49,-14.32,-7.90,-15.81,...,-63.01,-63.31,-13.68,-28.58,-29.03,-63.11,ACC,42.595000,41.870000,0.0
2017-12-01,0.90,5.85,-1.63,-0.93,618.55,4679.43,4679.43,3153.05,-0.55,3183.32,...,-3.32,-2.95,0.11,-1.63,-1.63,-1.95,ACHC,32.072500,33.693334,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-01,-0.82,-4.22,-5.39,-5.91,-6.35,-7.35,-7.35,-7.35,0.19,-6.44,...,-4.99,-4.77,-7.56,-7.74,-5.39,-5.91,ZBRA,377.428181,401.101055,1.0
2017-12-01,8.57,1.18,19.40,21.80,24.19,5.25,5.25,5.24,9.86,6.24,...,23.13,23.07,15.91,18.54,19.40,21.75,ZTS,72.019499,75.964762,1.0
2018-12-01,9.76,12.96,8.18,8.92,10.82,65.20,65.20,65.28,11.34,67.96,...,10.48,10.72,10.63,10.82,8.18,9.10,ZTS,86.605790,84.647619,0.0
2019-12-01,7.47,10.68,10.89,11.75,6.57,5.34,5.34,5.04,8.56,5.99,...,12.52,12.68,14.06,13.96,10.89,11.34,ZTS,126.498095,136.842857,1.0


# Machine Learning

In [16]:
def load_dataset(df :pd.DataFrame, label_column :str, non_relevant_cols :list):
    TRAINING_FEATURES = df.columns[df.columns != label_column]
    TARGET_FEATURE = label_column

    X = df[TRAINING_FEATURES]
    X.drop(non_relevant_cols, axis=1, inplace=True)
    y = df[TARGET_FEATURE]
    return X, y

In [17]:
X, y = load_dataset(yoy_agg_test, 'Change', ['Symbol', 'Price Before', 'Price After'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

In [19]:
X_train

Unnamed: 0_level_0,Total Revenues,Total Operating Expenses,Operating Income,"EBT, Excl. Unusual Items","EBT, Incl. Unusual Items",Earnings From Continuing Operations,Net Income to Company,Net Income,Revenue Per Share,Basic EPS,...,Basic Weighted Average Shares Outst.,Diluted EPS,Diluted EPS - Continuing Ops,Diluted Weighted Average Shares Outst.,Normalized Basic EPS,Normalized Diluted EPS,EBITDA,EBITA,EBIT,Normalized Net Income
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-05-01,7.47,10.35,7.36,11.01,11.01,108.43,108.43,108.43,10.50,114.33,...,-2.72,112.82,112.82,-2.45,14.43,13.75,5.23,7.04,7.36,11.01
2020-09-01,-8.42,5.43,-58.59,-59.70,-57.59,-57.38,-57.38,-57.44,-8.09,-57.31,...,-0.36,-56.95,-56.95,-1.05,-59.70,-59.37,-32.03,-58.59,-58.59,-59.80
2018-12-01,0.72,-0.67,-12.39,-15.12,-19.25,-14.66,-14.66,-14.64,2.29,-13.34,...,-1.53,-12.61,-12.61,-2.31,-13.73,-13.16,-6.52,-11.98,-12.39,-15.11
2020-12-01,-5.37,-7.46,-14.42,-15.28,-42.06,-48.66,-48.66,-51.29,-4.92,-50.98,...,-0.49,-50.83,-50.83,-0.51,-16.36,-16.30,-12.54,-14.00,-14.42,-16.84
2017-12-01,3.31,7.12,17.39,23.23,26.08,26.63,26.63,26.63,3.68,27.05,...,-0.36,26.98,26.98,-0.29,23.67,23.69,13.22,17.39,17.39,23.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-01,-0.70,9.55,-24.66,115.17,222.47,202.96,202.96,207.99,-0.62,206.25,...,-0.09,210.00,210.00,0.28,115.04,117.26,-5.25,-15.44,-24.66,116.35
2017-12-01,-3.54,-7.63,1.74,1.51,3.25,33.13,33.13,39.45,-1.89,41.56,...,-1.72,41.38,41.38,-1.72,7.97,7.97,1.99,1.61,1.74,6.14
2017-12-01,9.84,9.97,43.54,28.68,20.38,-3.55,-3.55,-3.55,8.94,-3.34,...,0.78,-4.55,-4.55,1.20,28.70,25.86,36.48,43.54,43.54,28.68
2017-12-01,13.52,13.29,-1.65,0.01,3.97,3.21,3.21,3.21,14.94,4.30,...,-1.24,4.30,4.30,-1.08,1.00,0.96,2.07,-1.58,-1.65,0.01


### Scalers

In [20]:
def scale_features(X_train, X_test, scale_type):
    X_train_scaled = pd.DataFrame()
    if scale_type == 'minmax':
        scaler = MinMaxScaler(feature_range=(-1, 1))
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.fit_transform(X_test)
    elif scale_type == 'standard':
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.fit_transform(X_test)
    return scaler, X_train_scaled, X_test_scaled

In [21]:
minmax_scaler, X_train_minmax_scaled, X_test_minmax_scaled = scale_features(X_train, X_test, 'minmax')

standard_scaler, X_train_standard_scaled, X_test_standard_scaled = scale_features(X_train, X_test, 'standard')

## Classification Models

### Logistic Regression

#### without scaling

In [22]:
LR_classification_model = LogisticRegression().fit(X_train, y_train)
y_pred = LR_classification_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

accuracy is: 0.5850091407678245
precision is: 0.6145124716553289
recall is: 0.8262195121951219
f1 is: 0.7048114434330298
[[ 49 170]
 [ 57 271]]


#### mimax scaling

In [26]:
LR_classification_minmax_model = LogisticRegression().fit(X_train_minmax_scaled, y_train)
y_pred = LR_classification_minmax_model.predict(X_test_minmax_scaled)

In [27]:
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

accuracy is: 0.5978062157221207
precision is: 0.5989010989010989
recall is: 0.9969512195121951
f1 is: 0.7482837528604119
[[  0 219]
 [  1 327]]


#### standard scaling

In [28]:
LR_classification_standard_model = LogisticRegression().fit(X_train_standard_scaled, y_train)
y_pred = LR_classification_standard_model.predict(X_test_standard_scaled)

In [29]:
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

accuracy is: 0.5959780621572212
precision is: 0.6059405940594059
recall is: 0.9329268292682927
f1 is: 0.7346938775510204
[[ 20 199]
 [ 22 306]]


### K-Nearest Neighbors

In [44]:
def find_best_k_for_KNN(X_train, y_train):
    parameters = {'n_neighbors':range(3,16,2)}
    knn = KNeighborsClassifier()
    clf = GridSearchCV(knn, parameters,scoring=make_scorer(metrics.precision_score))
    clf.fit(X_train, y_train)
    
    best_K = clf.best_params_['n_neighbors']
    
    return clf, best_K

#### without scaling

In [45]:
KNN_classification_model, best_K = find_best_k_for_KNN(X_train, y_train)
y_pred = KNN_classification_model.predict(X_test)

In [46]:
print("best K is:",best_K)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best K is: 13
accuracy is: 0.5612431444241316
precision is: 0.6105527638190955
recall is: 0.7408536585365854
f1 is: 0.6694214876033057
[[ 64 155]
 [ 85 243]]


#### mimax scaling

In [58]:
KNN_classification_minmax_model, best_K = find_best_k_for_KNN(X_train_minmax_scaled, y_train)
y_pred = KNN_classification_minmax_model.predict(X_test_minmax_scaled)

In [59]:
print("best K is:",best_K)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best K is: 13
accuracy is: 0.6014625228519196
precision is: 0.6011029411764706
recall is: 0.9969512195121951
f1 is: 0.7499999999999999
[[  2 217]
 [  1 327]]


#### standard scaling

In [60]:
KNN_classification_standard_model, best_K = find_best_k_for_KNN(X_train_standard_scaled, y_train)
y_pred = KNN_classification_standard_model.predict(X_test_standard_scaled)

In [61]:
print("best K is:",best_K)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best K is: 5
accuracy is: 0.5118829981718465
precision is: 0.5905044510385756
recall is: 0.6067073170731707
f1 is: 0.5984962406015037
[[ 81 138]
 [129 199]]


### Decision Tree

In [54]:
def find_best_decision_tree_params(X_train, y_train):
    parameters = {'max_depth':range(2,11), 'min_samples_split':range(5,21)}
    dt = DecisionTreeClassifier()
    clf = GridSearchCV(dt, parameters,scoring=make_scorer(metrics.precision_score))
    clf.fit(X_train, y_train)
    
    best_max_depth = clf.best_params_['max_depth']
    best_min_samples_split = clf.best_params_['min_samples_split']
    
    return clf, best_max_depth, best_min_samples_split

#### without scaling

In [68]:
DT_classification_model, best_max_depth, best_min_samples_split = find_best_decision_tree_params(X_train, y_train)
y_pred = DT_classification_model.predict(X_test)

In [69]:
print("best max depth is:",best_max_depth)
print("best min samples split is:",best_min_samples_split)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best max depth is: 10
best min samples split is: 20
accuracy is: 0.5685557586837294
precision is: 0.6474358974358975
recall is: 0.6158536585365854
f1 is: 0.63125
[[109 110]
 [126 202]]


#### mimax scaling

In [70]:
DT_classification_minmax_model, best_max_depth, best_min_samples_split = find_best_decision_tree_params(X_train_minmax_scaled, y_train)
y_pred = DT_classification_minmax_model.predict(X_test_minmax_scaled)

In [71]:
print("best max depth is:",best_max_depth)
print("best min samples split is:",best_min_samples_split)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best max depth is: 10
best min samples split is: 19
accuracy is: 0.4040219378427788
precision is: 1.0
recall is: 0.006097560975609756
f1 is: 0.012121212121212121
[[219   0]
 [326   2]]


#### standard scaling

In [72]:
DT_classification_standard_model, best_max_depth, best_min_samples_split = find_best_decision_tree_params(X_train_standard_scaled, y_train)
y_pred = DT_classification_standard_model.predict(X_test_standard_scaled)

In [73]:
print("best max depth is:",best_max_depth)
print("best min samples split is:",best_min_samples_split)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best max depth is: 10
best min samples split is: 18
accuracy is: 0.4954296160877514
precision is: 0.6007751937984496
recall is: 0.4725609756097561
f1 is: 0.5290102389078498
[[116 103]
 [173 155]]


### Random Forest

In [74]:
def find_best_random_forest_params(X_train, y_train):
    parameters = {'n_estimators':range(50,551,100)}
    rf = RandomForestClassifier()
    clf = GridSearchCV(rf, parameters,scoring=make_scorer(metrics.precision_score))
    clf.fit(X_train, y_train)
    
    best_n_estimators = clf.best_params_['n_estimators']
    
    return clf, best_n_estimators

#### without scaling

In [75]:
RF_classification_model, best_n_estimators = find_best_random_forest_params(X_train, y_train)
y_pred = RF_classification_model.predict(X_test)

In [76]:
print("best number of estimators is:",best_max_depth)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best number of estimators is: 10
accuracy is: 0.5850091407678245
precision is: 0.6222760290556901
recall is: 0.7835365853658537
f1 is: 0.6936572199730094
[[ 63 156]
 [ 71 257]]


#### mimax scaling

In [77]:
RF_classification_minmax_model, best_n_estimators = find_best_random_forest_params(X_train_minmax_scaled, y_train)
y_pred = RF_classification_minmax_model.predict(X_test_minmax_scaled)

In [78]:
print("best number of estimators is:",best_max_depth)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best number of estimators is: 10
accuracy is: 0.5996343692870201
precision is: 0.600739371534196
recall is: 0.9908536585365854
f1 is: 0.7479861910241657
[[  3 216]
 [  3 325]]


#### standard scaling

In [79]:
RF_classification_standard_model, best_n_estimators = find_best_random_forest_params(X_train_standard_scaled, y_train)
y_pred = RF_classification_standard_model.predict(X_test_standard_scaled)

In [80]:
print("best number of estimators is:",best_max_depth)
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

best number of estimators is: 10
accuracy is: 0.5484460694698354
precision is: 0.622356495468278
recall is: 0.6280487804878049
f1 is: 0.6251896813353566
[[ 94 125]
 [122 206]]


### Naïve Bayes

#### without scaling

In [81]:
NB_classification_model = GaussianNB().fit(X_train, y_train)
y_pred = NB_classification_model.predict(X_test)

In [82]:
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

accuracy is: 0.6014625228519196
precision is: 0.6011029411764706
recall is: 0.9969512195121951
f1 is: 0.7499999999999999
[[  2 217]
 [  1 327]]


#### mimax scaling

In [83]:
NB_classification_minmax_model = GaussianNB().fit(X_train_minmax_scaled, y_train)
y_pred = NB_classification_minmax_model.predict(X_test_minmax_scaled)

In [84]:
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

accuracy is: 0.396709323583181
precision is: 0.0
recall is: 0.0
f1 is: 0.0
[[217   2]
 [328   0]]


#### standard scaling

In [85]:
NB_classification_standard_model = GaussianNB().fit(X_train_standard_scaled, y_train)
y_pred = NB_classification_standard_model.predict(X_test_standard_scaled)

In [86]:
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

accuracy is: 0.44972577696526506
precision is: 0.5849056603773585
recall is: 0.28353658536585363
f1 is: 0.38193018480492813
[[153  66]
 [235  93]]


In [None]:
xg_reg = xgb.XGBClassifier(verbosity = 0, booster = 'gbtree', objective = 'binary:logistic', min_child_weight = 2,
                           max_delta_step = 2 ,learning_rate = 0.9, n_estimators = 8, tree_method = 'hist',
                           max_depth = 7)
# dart
# gbtree
# gblinear

xg_reg.fit(X_train,y_train)

y_pred = xg_reg.predict(X_test)

print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
metrics.confusion_matrix(y_test, y_pred)

In [None]:
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
metrics.confusion_matrix(y_test, y_pred)

# xg_reg = xgb.XGBClassifier(verbosity = 0, booster = 'gbtree', objective = 'binary:logistic', min_child_weight = 1,
#                            max_delta_step = 0.4 ,learning_rate = 0.9, n_estimators = 9, tree_method = 'hist',
#                            max_depth = 6)
# accuracy is: 0.585
# precision is: 0.6745098039215687
# recall is: 0.6745098039215687
# f1 is: 0.6745098039215687
# array([[ 62,  83],
#        [ 83, 172]], dtype=int64)

In [None]:
print("accuracy is:",metrics.accuracy_score(y_test, y_pred))
print("precision is:",metrics.precision_score(y_test, y_pred))
print("recall is:",metrics.recall_score(y_test, y_pred))
print("f1 is:",metrics.f1_score(y_test, y_pred))
metrics.confusion_matrix(y_test, y_pred)

# xg_reg = xgb.XGBClassifier(objective ='reg:pseudohubererror',min_child_weight = 1.1, max_delta_step = 1,
#                            n_estimators = 9)
# accuracy is: 0.6225
# precision is: 0.6573426573426573
# recall is: 0.7800829875518672
# f1 is: 0.713472485768501
# array([[ 61,  98],
#        [ 53, 188]], dtype=int64)

# xg_reg = xgb.XGBClassifier(objective ='reg:logistic',min_child_weight = 1.9, max_delta_step = 0.4, learning_rate = 0.9,
#                 n_estimators = 15, tree_method = 'hist', max_depth = 10)
# accuracy is: 0.625
# precision is: 0.6666666666666666
# recall is: 0.7551867219917012
# f1 is: 0.708171206225681
# array([[ 68,  91],
#        [ 59, 182]], dtype=int64)

In [None]:
pd.DataFrame(X_train_standard_scaled).fillna(float('inf'),inplace=True)

In [None]:
X_train