## Version 20240815

## 00 Importing Modules & Mounting Drive

In [None]:
!pip install ydata-profiling

import pathlib
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import os
import numpy as np
import pandas as pd
import sklearn
import pickle
import plotly.express as px
import plotly.graph_objects as go
import xgboost as xgb

from scipy import stats as sm
from IPython.display import Image
from graphviz import Source
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score, accuracy_score, average_precision_score, f1_score, precision_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
from ydata_profiling import ProfileReport
from tabulate import tabulate
from time import sleep

%matplotlib inline

In [None]:
'''from google.colab import drive
drive.mount('/content/drive') #'''

In [None]:
'''MAIN_PATH = str(pathlib.Path().resolve())
WORK_PATH = MAIN_PATH + '/drive/MyDrive/Workspace'
SOURCE_PATH = WORK_PATH + '/00_Data_Source'
CACHE_PATH = WORK_PATH + '/00_Cache_Data' #'''

## 01 Choosing & Importing Dataset

### 01.00 Importing Data from Zipfile

In [None]:
'''filename = 'Diseases_And_Symptoms.zip' # replace this

url = 'https://github.com/azzindani/00_Data_Source/raw/main/'+ filename
http_response = urlopen(url)
zipfile = ZipFile(BytesIO(http_response.read()))
zipfile.extractall() #'''

In [None]:
os.listdir()

In [None]:
'''df = pd.read_csv(os.listdir()[1], encoding = 'ISO-8859-1')#, sep = ';')
df.shape #'''

### 01.01 Importing Main Data

In [None]:
# Read the main dataset straight from the GitHub data-source repository.
filename = 'Bank_Customer_Churn.csv' # replace this

url = f'https://github.com/azzindani/00_Data_Source/raw/main/{filename}'
df = pd.read_csv(
  url,
  encoding = 'ISO-8859-1',
  #sep = ';',
)
df.shape #'''

In [None]:
df.nunique()

In [None]:
df.head(2)

In [None]:
df.info()

In [None]:
# List the distinct values of every object (text) column so that spelling
# variants and placeholder values can be spotted by eye.
object_columns = [column for column in df.columns if df[column].dtypes == 'object']

for column in object_columns:
  distinct_values = df[column].unique()
  print(column)
  print('-' * 100)
  print(distinct_values)
  print('=' * 100)

### 01.02 Importing Geo Data

In [None]:
'''geo_path = 'https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json' # replace this

gdf = gpd.read_file(geo_path)
gdf.head(2) #'''

In [None]:
'''gdf = gdf.rename(columns = {'name' : 'State'})
gdf = gdf[['State', 'geometry']]
gdf.head(2) #'''

### 01.03 Importing Additional Data (for enrichment)

### 01.04 Dataframe Back Up

In [None]:
df_bu = df.copy()

## 02 Data Structuring

### 02.01 Selecting & Dropping Variables

In [None]:
# Columns listed here are removed from the working dataframe.
column_list = [] # fill this

df = df.drop(columns = column_list)
df.head(2) #'''

### 02.02 Cleaning Text Object

#### 02.02.01 Convert Header to Proper Text

In [None]:
'''for x in df.columns:
  y = x.title()
  df = df.rename(columns = {x : y}) #'''

#### 02.02.02 Strip Abnormal Spaces

In [None]:
'''for x in df.columns:
  if df[x].dtypes == 'object':
    try:
      df[x] = df[x].str.strip()
    except:
      pass #'''

#### 02.02.03 Convert Object Content to Proper Text

In [None]:
'''for x in df.columns:
  if df[x].dtypes == 'object':
    for a in df[x].unique():
      b = a.title()
      df[x] = df[x].replace(a, b)
  else:
    pass #'''

### 02.03 Converting Data

#### 02.03.01 Convert to date

In [None]:
df.head(2)

In [None]:
# Columns to parse as datetimes. A column that is absent or cannot be parsed
# is reported and left untouched instead of being skipped silently.
column_list = ['Activity Period Start Date'] # fill this

for x in column_list:
  if x not in df.columns:
    print('Skipped (column not found) :', x)
    continue
  try:
    df[x] = pd.to_datetime(df[x])
  except (ValueError, TypeError) as error:
    # to_datetime signals unparseable content with ValueError / TypeError.
    print('Skipped (not convertible to datetime) :', x, '-', error) #'''

#### 02.03.02 Convert to integer

In [None]:
column_list = [] # fill this

for x in column_list:
  df[x] = df[x].astype('int') #'''

#### 02.03.03 Convert to Object (if necessary)

In [None]:
column_list = [] # fill this

for x in column_list:
  df[x] = df[x].astype('str') #'''

#### 02.03.04 Replace 0 with NaN (if necessary)

In [None]:
#df = df.replace(0, np.nan)

#### 02.03.05 Filling NaN with 0

In [None]:
'''for column in df.columns:
  if df[column].dtype == 'float64' or df[column].dtype == 'int64':
    df[column] = df[column].fillna(0)
    print(column)
  else:
    pass #'''

#### 02.03.06 Dropping Nan

In [None]:
#df = df.dropna()

## 03 Data Cleaning

### 03.01 Replacing Variable Component

In [None]:
'''value_dict = {} # fill this

column_name = ''

df[column_name] = df[column_name].replace(value_dict) #'''

### 03.02 Add New Variable

#### 03.02.01 Add by Math Calculation

In [None]:
'''new_var = '' # fill this
obj_var1 = '' # fill this
obj_var2 = '' # fill this

df[new_var] = df[obj_var1] * df [obj_var2] #'''

#### 03.02.02 Add by Replacing "Other"

In [None]:
'''column_name = '' # fill this

df[column_name].value_counts() #'''

In [None]:
'''value_thres = 1300

replace_list = []

df_dict = df[column_name].value_counts().to_dict()
for i in df_dict:
  if df_dict[i] < value_thres:
    replace_list.append(i)

replace_list #'''

In [None]:
'''df[column_name] = df[column_name].copy().replace(to_replace = replace_list, value = 'Other') #'''

### 03.03 Inaccuracies

In [None]:
# Columns in which the text value '0' marks an invalid record.
column_list = [] # fill this

# Only work on columns that exist; report the rest instead of hiding the error.
present_columns = [x for x in column_list if x in df.columns]
missing_columns = [x for x in column_list if x not in df.columns]

if missing_columns:
  print('Skipped (column not found) :', missing_columns)

if present_columns:
  df[present_columns] = df[present_columns].replace('0', np.nan)
  # Drop only the rows that are invalid in the selected columns; NaN values in
  # unrelated columns are handled in section 03.05.
  df = df.dropna(subset = present_columns)

### 03.04 Handling

##### Data Distribution Check (Before)

In [None]:
df.head(2)

In [None]:
df.info()

In [None]:
'''x = 'current_month_debit' # replace this
y = 'current_month_balance' # replace this
color = 'occupation' # replace this

fig = px.histogram(
  df,
  x = x,
  y = y,
  color = color,
  marginal = 'box',
  hover_data = df.columns
)

fig.show() #'''

In [None]:
'''x = 'current_month_debit' # replace this
y = 'current_month_balance' # replace this
color = 'occupation' # replace this

fig = px.scatter(
  df,
  x = x,
  y = y,
  color = color,
  marginal_x = 'histogram',
  marginal_y = 'box',
  hover_data = df.columns
)

fig.show() #'''

#### 03.04.01 Using IQR (Interquartile Range)

In [None]:
def iqr_thres(dataframe, column, th1 = 0.25, th3 = 0.75):
  """Return the (lower, upper) outlier fences of `dataframe[column]`.

  The fences lie 1.5 times the inter-quantile spread below the `th1`
  quantile and above the `th3` quantile.
  """
  series = dataframe[column]
  low_quantile, high_quantile = series.quantile(th1), series.quantile(th3)
  margin = 1.5 * (high_quantile - low_quantile)

  return low_quantile - margin, high_quantile + margin

In [None]:
def check_outliers_iqr(dataframe, column, th1 = 0.25, th3 = 0.75):
  """Tell whether `dataframe[column]` holds values outside its IQR fences.

  Parameters:
    dataframe : pandas DataFrame containing `column`.
    column : name of the numeric column to inspect.
    th1, th3 : lower / upper quantiles forwarded to `iqr_thres`.

  Returns:
    True when at least one value is below the lower or above the upper fence.
  """
  lower_limit, upper_limit = iqr_thres(dataframe, column, th1, th3)
  values = dataframe[column]
  # Test the outlier mask itself. Reducing the filtered frame with `.any()`
  # would depend on the truthiness of every cell of the outlier rows and
  # misses e.g. an outlier whose row contains only zeros.
  return bool(((values > upper_limit) | (values < lower_limit)).any())

In [None]:
def replace_iqr(dataframe, columns, th1 = 0.25, th3 = 0.75, replace = False):
  """Report IQR outliers per column and optionally blank them out.

  For every int64 / float64 column in `columns` the IQR fences are computed
  with `iqr_thres`, and a summary table is printed. With `replace = True`
  the outliers are overwritten with NaN in `dataframe` itself (in place).

  Parameters:
    dataframe : pandas DataFrame, modified in place when `replace` is true.
    columns : iterable of column names to inspect.
    th1, th3 : lower / upper quantiles used for the fences.
    replace : when true, replace outliers with NaN.

  Returns None; the result is the printed table.
  """
  data = []

  for column in columns:
    if not (dataframe[column].dtypes == 'int64' or dataframe[column].dtypes == 'float64'):
      continue
    # NOTE(review): 'Outcome' is always skipped - presumably a target column
    # name carried over from another dataset; confirm it is still wanted.
    if column == 'Outcome':
      continue

    lower_limit, upper_limit = iqr_thres(dataframe, column, th1, th3)
    above = dataframe[column] > upper_limit
    below = dataframe[column] < lower_limit
    # Flag and limits now share the same quantiles (th1 / th3).
    outliers_ = bool((above | below).any())
    count = None

    if outliers_:
      count = int((above | below).sum())
      if replace:
        # When the lower fence is negative only the upper side is blanked
        # (kept from the original behaviour).
        if lower_limit >= 0:
          dataframe.loc[below, column] = np.nan
        dataframe.loc[above, column] = np.nan

    # Re-evaluate on the (possibly modified) column with fresh fences.
    new_lower, new_upper = iqr_thres(dataframe, column, th1, th3)
    remaining = (dataframe[column] > new_upper) | (dataframe[column] < new_lower)
    outliers_status = bool(remaining.any())
    data.append([outliers_, outliers_status, count, column, lower_limit, upper_limit ])

  table = tabulate(data, headers = ['Outliers (Previously)', 'Outliers', 'Count', 'Column', 'Lower Limit', 'Upper Limit'], tablefmt = 'rst', numalign = 'right')
  print('Removing Outliers using IQR')
  print(table)

In [None]:
column_list = [] # fill this

replace_iqr(
  dataframe = df,
  columns = column_list,
  replace = True
)
df = df.dropna()#'''

#### 03.04.02 Using Standard Deviation

In [None]:
def std_thres(dataframe, column):
  """Return the (lower, upper) limits at mean -/+ 3 standard deviations."""
  series = dataframe[column]
  center = series.mean()
  spread = 3 * series.std()

  return center - spread, center + spread

In [None]:
def check_outliers_std(dataframe, column):
  """Tell whether `dataframe[column]` has values beyond 3 standard deviations.

  Parameters:
    dataframe : pandas DataFrame containing `column`.
    column : name of the numeric column to inspect.

  Returns:
    True when at least one value lies outside the limits from `std_thres`.
  """
  lower_limit, upper_limit = std_thres(dataframe, column)
  values = dataframe[column]
  # Test the outlier mask itself. Reducing the filtered frame with `.any()`
  # would depend on the truthiness of every cell of the outlier rows.
  return bool(((values > upper_limit) | (values < lower_limit)).any())

In [None]:
def replace_std(dataframe, columns, replace = False):
  """Report 3-sigma outliers per column and optionally blank them out.

  For every int64 / float64 column in `columns` the limits are computed with
  `std_thres`, and a summary table is printed. With `replace = True` the
  outliers are overwritten with NaN in `dataframe` itself (in place).

  Parameters:
    dataframe : pandas DataFrame, modified in place when `replace` is true.
    columns : iterable of column names to inspect.
    replace : when true, replace outliers with NaN.

  Returns None; the result is the printed table.
  """
  data = []

  for column in columns:
    if not (dataframe[column].dtypes == 'int64' or dataframe[column].dtypes == 'float64'):
      continue
    # NOTE(review): 'Outcome' is always skipped - presumably a target column
    # name carried over from another dataset; confirm it is still wanted.
    if column == 'Outcome':
      continue

    lower_limit, upper_limit = std_thres(dataframe, column)
    above = dataframe[column] > upper_limit
    below = dataframe[column] < lower_limit
    # Decide from the mask of this column only, not from other cells.
    outliers_ = bool((above | below).any())
    count = None

    if outliers_:
      count = int((above | below).sum())
      if replace:
        # When the lower limit is negative only the upper side is blanked
        # (kept from the original behaviour).
        if lower_limit >= 0:
          dataframe.loc[below, column] = np.nan
        dataframe.loc[above, column] = np.nan

    # Re-evaluate on the (possibly modified) column with fresh limits.
    new_lower, new_upper = std_thres(dataframe, column)
    remaining = (dataframe[column] > new_upper) | (dataframe[column] < new_lower)
    outliers_status = bool(remaining.any())
    data.append([outliers_, outliers_status, count, column, lower_limit, upper_limit])

  table = tabulate(data, headers = ['Outlier (Previously)', 'Outliers', 'Count', 'Column', 'Lower Limit', 'Upper Limit'], tablefmt = 'rst', numalign = 'right')
  print('Removing Outliers using 3 Standard Deviation')
  print(table)

In [None]:
column_list = [] # fill this

replace_std(
  dataframe = df,
  columns = column_list,
  replace = True
)
df = df.dropna()#'''

### 03.05 Handling Missing / Zeros / Null
##### For numerical columns, filling missing values with the median is usually more robust than using the mean or mode

#### 03.05.01 Detecting Zero Value

In [None]:
# Collect the numeric (int64 / float64) columns that contain at least one zero
# and print how many zeros each of them holds.
zero_columns = [] # fill this

for column in df.columns:
  if df[column].dtypes == 'int64' or df[column].dtypes == 'float64':
    zero_count = (df[column] == 0).sum()
    if zero_count != 0:
      print(column, ':', str(zero_count))
      zero_columns.append(column)

zero_columns #'''

#### 03.05.02 Detecting NaN / Not Available Values

In [None]:
# Collect the object columns that contain missing entries, where "missing"
# means either a real NaN or the placeholder text '-'.
nan_columns = [] # fill this

for x in df.columns:
  if df[x].dtypes == 'object':
    # NaN never compares equal to anything (not even itself), so it has to be
    # detected with isna() rather than `== np.nan`.
    missing_count = int((df[x].isna() | (df[x] == '-')).sum())
    if missing_count != 0:
      print(x, ':', str(missing_count))
      nan_columns.append(x)

nan_columns #'''

#### 03.05.03 Replacing Zero with Mean (for numerical value if median value == 0), if necessary

In [None]:
'''column_list = [] # fill this

df = df.replace(0, np.nan)

for x in column_list:
  df[x] = df[x].fillna(df[x].mean()) #'''

#### 03.05.04 Replacing Zero with Median (for numerical value if median value != 0), if necessary

In [None]:
'''column_list = [] # fill this

df = df.replace(0, np.nan)

for x in column_list:
  df[x] = df[x].fillna(df[x].median()) #'''

#### 03.05.05 Replacing Zero with Mode (for categorical / object value), if necessary

In [None]:
'''column_list = [] # fill this

df = df.replace(0, np.nan)

for x in column_list:
  df[x] = df[x].fillna(df[x].mode()) #'''

### 03.06 Handling Incomplete Data

### 03.07 Handling Data Biases

### 03.08 Handling Duplicates

In [None]:
#df = df.drop_duplicates()

##### Data Distribution Check (After)

In [None]:
'''x = 'current_month_debit' # replace this
y = 'current_month_balance' # replace this
color = 'occupation' # replace this

fig = px.histogram(
  df,
  x = x,
  y = y,
  color = color,
  marginal = 'box',
  hover_data = df.columns
)

fig.show() #'''

In [None]:
'''x = 'current_month_debit' # replace this
y = 'current_month_balance' # replace this
color = 'occupation' # replace this

fig = px.scatter(
  df,
  x = x,
  y = y,
  color = color,
  marginal_x = 'histogram',
  marginal_y = 'box',
  hover_data = df.columns
)

fig.show() #'''

## 04 Enriching Data
#### Bring in other datasets, from inside or outside the related dataset / business

In [None]:
'''location = 'Country' # replace this

a = df[location].unique()
b = gdf[location].unique()

for i in a:
  if i not in b:
    print(i)

print(50 * '=')

for i in b:
  if i not in a:
    print(i) #'''

In [None]:
# Mapping of location names in `df` to the spelling used by the geo data.
value_dict = {}

# `location` is only assigned by the comparison cell above, which is disabled
# by default; look it up defensively instead of hiding a NameError.
location_column = globals().get('location')

if location_column is None:
  print('Skipped : `location` is not defined (enable the comparison cell first)')
elif location_column not in df.columns:
  print('Skipped (column not found) :', location_column)
elif value_dict:
  df[location_column] = df[location_column].replace(value_dict) #'''

## 05 Data Validation
#### Verifying consistency, quality, and security of data

## 06 Exploratory Data Analysis (Univariate)

In [None]:
# Build a ydata-profiling report for the cleaned dataframe with every listed
# correlation measure switched on.
data_profile = ProfileReport(
  df,
  correlations = {
    'pearson' : {'calculate' : True},
    'spearman' : {'calculate' : True},
    'kendall' : {'calculate' : True},
    'phi_k' : {'calculate' : True},
    'cramers': {'calculate' : True},
  },
)

# Last expression of the cell, so the notebook renders the report.
data_profile #'''

## 07 Select Variable X & Y | Splitting Data

#### 07.01 Data Balancing

In [None]:
'''# Check Before Data Balancing
y_var = 'HeartDisease'

fig, ax = plt.subplots(figsize = (5, 5))
sizes = [count for count in df[y_var].value_counts()]
labels = list(df[y_var].value_counts().index)

ax.pie(
  x = sizes,
  labels = labels,
  autopct = '%1.1f%%',
)
plt.show() #'''

In [None]:
'''n = 20000
append_data = []

for i in df[y_var].unique():
  df_x = df[df[y_var] == i][:n]
  append_data.append(df_x)

append_df = pd.concat(append_data)
append_df.shape #'''

In [None]:
'''# Check After Data Balancing

fig, ax = plt.subplots(figsize = (5, 5))
sizes = [count for count in append_df[y_var].value_counts()]
labels = list(append_df[y_var].value_counts().index)

ax.pie(
  x = sizes,
  labels = labels,
  autopct = '%1.1f%%',
)
plt.show() #'''

In [None]:
#df = append_df

### 07.02 Label Encoding / One Hot Encoding

In [None]:
df.head(2)

In [None]:
df.info()

In [None]:
# Categorical columns that will be label-encoded; cast them to text first so
# every value (including missing ones) becomes a plain string.
columns = ['gender', 'occupation']

df[columns] = df[columns].astype('str')

for column in columns:
  print(df[column].unique())

In [None]:
# Fit one LabelEncoder per categorical column and keep it so the integer
# codes can be mapped back to their labels later.
label_encoders = {}

for column in columns:
  le = LabelEncoder()
  df[column] = le.fit_transform(df[column])
  label_encoders[column] = le

# Print the code -> label legend of every encoded column.
for column, encoder in label_encoders.items():
  labels = encoder.classes_
  print(column)
  print('-' * 50)
  for code, label_name in enumerate(labels):
    print(str(code), ':', label_name)
  print('=' * 50)

### 07.03 Define X & Y variables

In [None]:
df.head(2)

In [None]:
df.info()

In [None]:
# Feature columns. Each name must appear only once: a repeated name would
# put the same column into `x` twice.
x_var = [
  'age',
  'gender',
  'dependents',
  'occupation',
  'customer_nw_category',
  'current_balance',
  'previous_month_end_balance',
  'average_monthly_balance_prevQ',
  'average_monthly_balance_prevQ2',
  'current_month_credit',
  'previous_month_credit',
  'current_month_debit',
  'previous_month_debit',
  'current_month_balance',
  'previous_month_balance',
]
#x_var = df.columns[1:]
# Target column, kept as a list so it can be concatenated with x_var below.
y_var = ['churn']
#y_var = df.columns[0]

x = df[x_var]
y = df[y_var]

sel_var = y_var + x_var
#df = df[sel_var]
df.head(2)

### 07.04 Split Data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
scaler = StandardScaler()
x_train, x_test, y_train, y_test = train_test_split(
  x,
  y,
  test_size = 0.2,
  random_state = 42,
)

print(f'x_train shape : {x_train.shape}')
print(f'x_test shape : {x_test.shape}')
print('=' * 50)
print(f'y_train shape : {y_train.shape}')
print(f'y_test shape : {y_test.shape}')

## 08 Building Machine Learning Model
https://www.geeksforgeeks.org/types-of-machine-learning/

### 08.01 Supervised Machine Learning
Supervised learning means the y variable (target) is already available in the dataset, i.e. a “labelled dataset”.

#### 08.01.01 Classification
Classification predicts categorical target variables, which represent discrete classes or labels.

##### 08.01.01.01 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
def lr(x_train, y_train, x_test, y_test, save = False, matrix = False, unmatch = False):
  """Fit a default LogisticRegression and print its evaluation.

  Prints the training / testing accuracy and a classification report, and
  optionally pickles the fitted model, plots the confusion matrix and lists
  the test rows whose prediction differs from the actual label.

  Parameters:
    x_train, y_train : training features and labels passed to `fit`.
    x_test, y_test : hold-out features and labels used for scoring.
    save : when true, pickle the fitted model to '<title>.sav'.
    matrix : when true, draw the confusion matrix as a heatmap.
    unmatch : when true, print every mismatching test prediction.

  Returns None. Errors are reported on stdout instead of being raised.
  """
  title = 'Logistic Regression Model'

  try:
    model = LogisticRegression()
    model.fit(x_train, y_train)
    y_predicted = model.predict(x_test)
    training_score = model.score(x_train, y_train)
    testing_score = model.score(x_test, y_test)

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('Training Score : ', str(round(training_score * 100, 2)), '%')
    #print('Regression Coefficient :', str(model.coef_))
    #print('Regression Interception :', str(model.intercept_))
    print('Testing Score : ', str(round(testing_score * 100, 2)), '%')
    print('=' * 100)
    print('Classification Report\n', classification_report(y_test, y_predicted))
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if matrix:
      cm = confusion_matrix(y_test, y_predicted)

      print('Confusion Matrix')
      plt.figure(figsize = (5, 3))
      sns.heatmap(cm, annot = True, fmt = 'd')
      plt.xlabel('Predicted')
      plt.ylabel('Actual')
      plt.show()

    if unmatch:
      print('=' * 100)
      print('Unmatched Prediction Result')
      print('-' * 100)
      # Compare with the labels of the rows that were actually predicted
      # (y_test), flattened so a one-column frame also works.
      for predicted, actual in zip(y_predicted, np.ravel(y_test)):
        if predicted != actual:
          print ('Predicted : {0}\nActual: {1}\n'.format(predicted, actual))
  except Exception as error:
    # Best effort: keep the notebook running, but say what went wrong.
    print(title, 'failed :', repr(error))

In [None]:
lr(x_train, y_train, x_test, y_test, save = False, matrix = True, unmatch = False)

##### 08.01.01.02 Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
def svm(x_train, y_train, x_test, y_test, save = False, matrix = False, unmatch = False):
  """Fit a default SVC and print its evaluation.

  Prints the training / testing accuracy and a classification report, and
  optionally pickles the fitted model, plots the confusion matrix and lists
  the test rows whose prediction differs from the actual label.

  Parameters:
    x_train, y_train : training features and labels passed to `fit`.
    x_test, y_test : hold-out features and labels used for scoring.
    save : when true, pickle the fitted model to '<title>.sav'.
    matrix : when true, draw the confusion matrix as a heatmap.
    unmatch : when true, print every mismatching test prediction.

  Returns None. Errors are reported on stdout instead of being raised.
  """
  title = 'Support Vector Machine Model'

  try:
    model = SVC()
    model.fit(x_train, y_train)
    y_predicted = model.predict(x_test)
    training_score = model.score(x_train, y_train)
    testing_score = model.score(x_test, y_test)

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('Training Score : ', str(round(training_score * 100, 2)), '%')
    print('Testing Score : ', str(round(testing_score * 100, 2)), '%')
    print('=' * 100)
    print('Classification Report\n', classification_report(y_test, y_predicted))
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if matrix:
      cm = confusion_matrix(y_test, y_predicted)

      print('Confusion Matrix')
      plt.figure(figsize = (5, 3))
      sns.heatmap(cm, annot = True, fmt = 'd')
      plt.xlabel('Predicted')
      plt.ylabel('Actual')
      plt.show()

    if unmatch:
      print('=' * 100)
      print('Unmatched Prediction Result')
      print('-' * 100)
      # Compare with the labels of the rows that were actually predicted
      # (y_test), flattened so a one-column frame also works.
      for predicted, actual in zip(y_predicted, np.ravel(y_test)):
        if predicted != actual:
          print ('Predicted : {0}\nActual: {1}\n'.format(predicted, actual))
  except Exception as error:
    # Best effort: keep the notebook running, but say what went wrong.
    print(title, 'failed :', repr(error))

In [None]:
svm(x_train, y_train, x_test, y_test, save = False, matrix = True, unmatch = False)

##### 08.01.01.03 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
def rf(x_train, y_train, x_test, y_test, save = False, matrix = False, unmatch = False):
  """Fit a default RandomForestClassifier and print its evaluation.

  Prints the training / testing accuracy and a classification report, and
  optionally pickles the fitted model, plots the confusion matrix and lists
  the test rows whose prediction differs from the actual label.

  Parameters:
    x_train, y_train : training features and labels passed to `fit`.
    x_test, y_test : hold-out features and labels used for scoring.
    save : when true, pickle the fitted model to '<title>.sav'.
    matrix : when true, draw the confusion matrix as a heatmap.
    unmatch : when true, print every mismatching test prediction.

  Returns None. Errors are reported on stdout instead of being raised.
  """
  title = 'Random Forest Model'

  try:
    model = RandomForestClassifier()
    model.fit(x_train, y_train)
    y_predicted = model.predict(x_test)
    training_score = model.score(x_train, y_train)
    testing_score = model.score(x_test, y_test)

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('Training Score : ', str(round(training_score * 100, 2)), '%')
    print('Testing Score : ', str(round(testing_score * 100, 2)), '%')
    print('=' * 100)
    print('Classification Report\n', classification_report(y_test, y_predicted))
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if matrix:
      cm = confusion_matrix(y_test, y_predicted)

      print('Confusion Matrix')
      plt.figure(figsize = (5, 3))
      sns.heatmap(cm, annot = True, fmt = 'd')
      plt.xlabel('Predicted')
      plt.ylabel('Actual')
      plt.show()

    if unmatch:
      print('=' * 100)
      print('Unmatched Prediction Result')
      print('-' * 100)
      # Compare with the labels of the rows that were actually predicted
      # (y_test), flattened so a one-column frame also works.
      for predicted, actual in zip(y_predicted, np.ravel(y_test)):
        if predicted != actual:
          print ('Predicted : {0}\nActual: {1}\n'.format(predicted, actual))
  except Exception as error:
    # Best effort: keep the notebook running, but say what went wrong.
    print(title, 'failed :', repr(error))

In [None]:
rf(x_train, y_train, x_test, y_test, save = False, matrix = True, unmatch = False)

##### 08.01.01.04 Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [None]:
def dtc(x_train, y_train, x_test, y_test, save = False, matrix = False, unmatch = False):
  """Fit a default DecisionTreeClassifier and print its evaluation.

  Prints the training / testing accuracy and a classification report, and
  optionally pickles the fitted model, plots the confusion matrix and lists
  the test rows whose prediction differs from the actual label.

  Parameters:
    x_train, y_train : training features and labels passed to `fit`.
    x_test, y_test : hold-out features and labels used for scoring.
    save : when true, pickle the fitted model to '<title>.sav'.
    matrix : when true, draw the confusion matrix as a heatmap.
    unmatch : when true, print every mismatching test prediction.

  Returns None. Errors are reported on stdout instead of being raised.
  """
  title = 'Decision Tree Classifier Model'

  try:
    model = DecisionTreeClassifier()
    model.fit(x_train, y_train)
    y_predicted = model.predict(x_test)
    training_score = model.score(x_train, y_train)
    testing_score = model.score(x_test, y_test)

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('Training Score : ', str(round(training_score * 100, 2)), '%')
    print('Testing Score : ', str(round(testing_score * 100, 2)), '%')
    print('=' * 100)
    print('Classification Report\n', classification_report(y_test, y_predicted))
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if matrix:
      cm = confusion_matrix(y_test, y_predicted)

      print('Confusion Matrix')
      plt.figure(figsize = (5, 3))
      sns.heatmap(cm, annot = True, fmt = 'd')
      plt.xlabel('Predicted')
      plt.ylabel('Actual')
      plt.show()

    if unmatch:
      print('=' * 100)
      print('Unmatched Prediction Result')
      print('-' * 100)
      # Compare with the labels of the rows that were actually predicted
      # (y_test), flattened so a one-column frame also works.
      for predicted, actual in zip(y_predicted, np.ravel(y_test)):
        if predicted != actual:
          print ('Predicted : {0}\nActual: {1}\n'.format(predicted, actual))
  except Exception as error:
    # Best effort: keep the notebook running, but say what went wrong.
    print(title, 'failed :', repr(error))

In [None]:
dtc(x_train, y_train, x_test, y_test, save = False, matrix = True, unmatch = False)

##### 08.01.01.05 K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def knn(x_train, y_train, x_test, y_test, save = False, matrix = False, unmatch = False):
  """Fit a standard-scaled 5-nearest-neighbours classifier and print its evaluation.

  The scaler and the classifier are combined in one Pipeline, so the scaler
  is fitted on the training data only and the pickled object can be used on
  raw (unscaled) features.

  Parameters:
    x_train, y_train : training features and labels passed to `fit`.
    x_test, y_test : hold-out features and labels used for scoring.
    save : when true, pickle the fitted pipeline to '<title>.sav'.
    matrix : when true, draw the confusion matrix as a heatmap.
    unmatch : when true, print every mismatching test prediction.

  Returns None. Errors are reported on stdout instead of being raised.
  """
  title = 'K-Nearest Neighbors Model'

  try:
    # Scaling lives inside the pipeline; saving only the classifier would
    # produce a model that silently expects pre-scaled input.
    model = Pipeline([
      ('scaler', StandardScaler()),
      ('knn', KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)),
    ])

    model.fit(x_train, y_train)
    y_predicted = model.predict(x_test)
    training_score = model.score(x_train, y_train)
    testing_score = model.score(x_test, y_test)

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('Training Score : ', str(round(training_score * 100, 2)), '%')
    print('Testing Score : ', str(round(testing_score * 100, 2)), '%')
    print('=' * 100)
    print('Classification Report\n', classification_report(y_test, y_predicted))
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if matrix:
      cm = confusion_matrix(y_test, y_predicted)

      print('Confusion Matrix')
      plt.figure(figsize = (5, 3))
      sns.heatmap(cm, annot = True, fmt = 'd')
      plt.xlabel('Predicted')
      plt.ylabel('Actual')
      plt.show()

    if unmatch:
      print('=' * 100)
      print('Unmatched Prediction Result')
      print('-' * 100)
      # Compare with the labels of the rows that were actually predicted
      # (y_test), flattened so a one-column frame also works.
      for predicted, actual in zip(y_predicted, np.ravel(y_test)):
        if predicted != actual:
          print ('Predicted : {0}\nActual: {1}\n'.format(predicted, actual))
  except Exception as error:
    # Best effort: keep the notebook running, but say what went wrong.
    print(title, 'failed :', repr(error))

In [None]:
knn(x_train, y_train, x_test, y_test, save = False, matrix = True, unmatch = False)

##### 08.01.01.06 Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [None]:
def nb(x_train, y_train, x_test, y_test, save = False, matrix = False, unmatch = False):
  """Fit a default GaussianNB and print its evaluation.

  Prints the training / testing accuracy and a classification report, and
  optionally pickles the fitted model, plots the confusion matrix and lists
  the test rows whose prediction differs from the actual label.

  Parameters:
    x_train, y_train : training features and labels passed to `fit`.
    x_test, y_test : hold-out features and labels used for scoring.
    save : when true, pickle the fitted model to '<title>.sav'.
    matrix : when true, draw the confusion matrix as a heatmap.
    unmatch : when true, print every mismatching test prediction.

  Returns None. Errors are reported on stdout instead of being raised.
  """
  title = 'Naive Bayes Model'

  try:
    model = GaussianNB()
    model.fit(x_train, y_train)
    y_predicted = model.predict(x_test)
    training_score = model.score(x_train, y_train)
    testing_score = model.score(x_test, y_test)

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('Training Score : ', str(round(training_score * 100, 2)), '%')
    print('Testing Score : ', str(round(testing_score * 100, 2)), '%')
    print('=' * 100)
    print('Classification Report\n', classification_report(y_test, y_predicted))
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if matrix:
      cm = confusion_matrix(y_test, y_predicted)

      print('Confusion Matrix')
      plt.figure(figsize = (5, 3))
      sns.heatmap(cm, annot = True, fmt = 'd')
      plt.xlabel('Predicted')
      plt.ylabel('Actual')
      plt.show()

    if unmatch:
      print('=' * 100)
      print('Unmatched Prediction Result')
      print('-' * 100)
      # Compare with the labels of the rows that were actually predicted
      # (y_test), flattened so a one-column frame also works.
      for predicted, actual in zip(y_predicted, np.ravel(y_test)):
        if predicted != actual:
          print ('Predicted : {0}\nActual: {1}\n'.format(predicted, actual))
  except Exception as error:
    # Best effort: keep the notebook running, but say what went wrong.
    print(title, 'failed :', repr(error))

In [None]:
nb(x_train, y_train, x_test, y_test, save = False, matrix = True, unmatch = False)

##### 08.01.01.07 K Fold Cross Validation

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

kf = KFold(n_splits = 3)
folds = StratifiedKFold(n_splits = 3)

In [None]:
def get_score(model, x_train, x_test, y_train, y_test):
  """Fit `model` on the training part and return its test score as text.

  The value of `model.score` is turned into a percentage rounded to two
  decimals and suffixed with '%'.
  """
  model.fit(x_train, y_train)
  percentage = round(model.score(x_test, y_test) * 100, 2)
  return str(percentage) + '%'

In [None]:
# Per-fold accuracy of every model, as formatted by get_score.
scores_lr = []
scores_svm = []
scores_rf = []
scores_dtc = []
scores_knn = []
scores_nb = []

try:
  for train_index, test_index in kf.split(x):
    # Build this fold's own train / test parts from the indices. Reusing the
    # global hold-out split would score the same data in every fold.
    fold_x_train, fold_x_test = x.iloc[train_index], x.iloc[test_index]
    fold_y_train, fold_y_test = y.iloc[train_index], y.iloc[test_index]

    scores_lr.append(get_score(LogisticRegression(), fold_x_train, fold_x_test, fold_y_train, fold_y_test))
    scores_svm.append(get_score(SVC(), fold_x_train, fold_x_test, fold_y_train, fold_y_test))
    scores_rf.append(get_score(RandomForestClassifier(), fold_x_train, fold_x_test, fold_y_train, fold_y_test))
    scores_dtc.append(get_score(DecisionTreeClassifier(), fold_x_train, fold_x_test, fold_y_train, fold_y_test))
    scores_knn.append(get_score(KNeighborsClassifier(), fold_x_train, fold_x_test, fold_y_train, fold_y_test))
    scores_nb.append(get_score(GaussianNB(), fold_x_train, fold_x_test, fold_y_train, fold_y_test))
except Exception as error:
  # Best effort: keep the scores gathered so far, but report the failure.
  print('K-fold evaluation failed :', repr(error))

In [None]:
# Show the per-fold score strings collected for each model.
# NOTE(review): the bare `except` also hides a NameError when the score lists
# were never created - confirm that silence is intended.
try:
  print('Logistic Regression Score :', str(scores_lr))
  print('Support Vector Machine Score :', str(scores_svm))
  print('Random Forest Score :', str(scores_rf))
  print('Decision Tree Classifier Score :', str(scores_dtc))
  print('K-Nearest Neighbors Score :', str(scores_knn))
  print('Naive Bayes Score :', str(scores_nb))
except:
  pass

Other Method (Simpler)

In [None]:
# Per-fold accuracy of every model, computed with cross_val_score.
scores_lr = []
scores_svm = []
scores_rf = []
scores_dtc = []
scores_knn = []
scores_nb = []

# Pair every result list with its estimator. The raw scores go into a
# throw-away variable so the model functions defined earlier (lr, svm, rf,
# dtc, knn, nb) are not overwritten by arrays.
cv_targets = [
  (scores_lr, LogisticRegression()),
  (scores_svm, SVC()),
  (scores_rf, RandomForestClassifier(n_estimators = 15)),
  (scores_dtc, DecisionTreeClassifier()),
  (scores_knn, KNeighborsClassifier()),
  (scores_nb, GaussianNB()),
]

for score_list, estimator in cv_targets:
  try:
    cv_result = cross_val_score(estimator, x, y)
  except Exception as error:
    # One failing model must not hide the results of the others.
    print(type(estimator).__name__, 'cross validation failed :', repr(error))
    continue
  score_list.extend(str(round(i * 100, 2)) + '%' for i in cv_result)

In [None]:
# Show the cross-validation score strings collected for each model.
# NOTE(review): the bare `except` also hides a NameError when the score lists
# were never created - confirm that silence is intended.
try:
  print('Logistic Regression Score :', str(scores_lr))
  print('Support Vector Machine Score :', str(scores_svm))
  print('Random Forest Score :', str(scores_rf))
  print('Decision Tree Classifier Score :', str(scores_dtc))
  print('K-Nearest Neighbors Score :', str(scores_knn))
  print('Naive Bayes Score :', str(scores_nb))
except:
  pass

##### 08.01.01.08 Hyper Parameter Tuning
Below is an example using one ML model.

In [None]:
# Manual grid: average 5-fold accuracy of an SVC for every kernel / C pair.
kernels = ['rbf', 'linear']
C = [1, 10, 20]
avg_scores = {}

for kval in kernels:
  for cval in C:
    try:
      cv_scores = cross_val_score(SVC(kernel = kval, C = cval, gamma = 'auto'), x, y, cv = 5)
    except Exception as error:
      # Report the failing combination and carry on with the remaining ones.
      print('svm_' + kval + '_' + str(cval), 'failed :', repr(error))
      continue
    avg_scores['svm_' + kval + '_' + str(cval)] = np.average(cv_scores)

for key, values in avg_scores.items():
  print(f"{key} : {str(round(values * 100, 2))}%")
print('=' * 100) #'''

##### 08.01.01.09 Grid Search
Below is an example using one ML model.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
def gsc_svm(x, y, params):
  """Grid-search an SVC over `params` and print the ranked results.

  Parameters:
    x, y : features and labels handed to GridSearchCV.fit.
    params : parameter grid accepted by GridSearchCV.

  Prints one row per parameter combination (its `param_*` columns and the
  mean test score), best first. Returns None; errors are reported on stdout.
  """
  try:
    model = GridSearchCV(SVC(gamma = 'auto'), params, cv = 5, return_train_score = False)
    model.fit(x, y)

    grid_df = pd.DataFrame(model.cv_results_)
    # Keep whichever parameters were searched instead of assuming C / kernel.
    param_columns = [name for name in grid_df.columns if name.startswith('param_')]
    grid_df = grid_df[param_columns + ['mean_test_score']]
    grid_df = grid_df.sort_values('mean_test_score', ascending = False)
    grid_df = grid_df.reset_index(drop = True)
    print(grid_df)
  except Exception as error:
    # Best effort: keep the notebook running, but say what went wrong.
    print('SVC grid search failed :', repr(error))

In [None]:
params = {
  'C' : [1, 10, 20],
  'kernel' : ['rbf', 'linear']
}

gsc_svm(x, y, params)

##### 08.01.01.10 Random Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def rsc_svm(x, y, params):
  """Random-search an SVC over `params` and display the ranked results.

  Parameters:
    x, y : features and labels handed to RandomizedSearchCV.fit.
    params : parameter distributions / lists accepted by RandomizedSearchCV.

  Displays one row per sampled combination (its `param_*` columns and the
  mean test score), best first. Returns None; errors are reported on stdout.
  """
  try:
    model = RandomizedSearchCV(SVC(gamma = 'auto'), params, cv = 5, return_train_score = False)
    model.fit(x, y)

    grid_df = pd.DataFrame(model.cv_results_)
    # Keep whichever parameters were searched instead of assuming C / kernel.
    param_columns = [name for name in grid_df.columns if name.startswith('param_')]
    grid_df = grid_df[param_columns + ['mean_test_score']]
    grid_df = grid_df.sort_values('mean_test_score', ascending = False)
    grid_df = grid_df.reset_index(drop = True)

    display(grid_df)
  except Exception as error:
    # Best effort: keep the notebook running, but say what went wrong.
    print('SVC random search failed :', repr(error))

In [None]:
params = {
  'C' : [1, 10, 20],
  'kernel' : ['rbf', 'linear']
}

rsc_svm(x, y, params)

Other Method

In [None]:
# Search space per model: the estimator to tune and the grid to try.
# An empty grid means the model is evaluated once with its defaults.
model_params = {
  'logistic_regression' : {
    # `multi_class` is left at its default ('auto'); passing it explicitly is
    # deprecated in recent scikit-learn releases.
    'model' : LogisticRegression(solver = 'liblinear'),
    'params' : {
      'C' : [1, 5, 10]
    }
  },
  'support_vector_machine' : {
    'model' : SVC(gamma = 'auto'),
    'params' : {
      'C' : [1, 5, 10],
      'kernel' : ['rbf', 'linear']
    }
  },
  'random_forest' : {
    'model' : RandomForestClassifier(),
    'params' : {
      'n_estimators' : [1, 5, 10]
    }
  },
  'decision_tree_classifier' : {
    'model' : DecisionTreeClassifier(),
    'params' : {
      # add hyper-parameters to tune here
    }
  },
  'k_nearest_neighbors' : {
    'model' : KNeighborsClassifier(),
    'params' : {
      # add hyper-parameters to tune here
    }
  },
  'naive_bayes' : {
    'model' : GaussianNB(),
    'params' : {
      # add hyper-parameters to tune here
    }
  },
}

In [None]:
def gsc(x, y, model_params):
  """Grid-search every model in `model_params` and display a ranking.

  Parameters:
    x, y : features and labels handed to GridSearchCV.fit.
    model_params : mapping of model name to a dict with the keys 'model'
      (the estimator) and 'params' (its parameter grid).

  Displays one row per model with its best score and best parameters, best
  first. A model whose search fails is reported and skipped, so the other
  results are still shown. Returns None.
  """
  scores = []

  for model_name, mp in model_params.items():
    try:
      model = GridSearchCV(mp['model'], mp['params'], cv = 5, return_train_score = False)
      model.fit(x, y)
    except Exception as error:
      # One failing model must not discard the results of the others.
      print(model_name, 'grid search failed :', repr(error))
      continue
    scores.append({
        'model': model_name,
        'best_score': model.best_score_,
        'best_params': model.best_params_
    })

  tune_df = pd.DataFrame(scores, columns = ['model', 'best_score', 'best_params'])
  tune_df = tune_df.sort_values('best_score', ascending = False)
  tune_df = tune_df.reset_index(drop = True)

  display(tune_df)

In [None]:
gsc(x, y, model_params)

##### 08.01.01.11 Xgboost

In [None]:
def xgb_classier(
    x_train,
    y_train,
    x_test,
    y_test,
    save = False,
    plot = False,
    tree = False,
    matrix = False,
    unmatch = False,
    num_round = 20,
    num_class = None
  ):
  """Train a multi-class XGBoost booster and report its test-set quality.

  Args:
    x_train, y_train: training features and integer class labels.
    x_test, y_test: held-out features and labels used for every metric.
    save: dump the trained trees as text to 'XGB Classification Model.txt'.
    plot: draw the gain-based feature importance chart.
    tree: draw the first tree and a later tree (index 9, or the last
      boosting round when fewer than 10 rounds were trained).
    matrix: draw the confusion matrix as a heatmap.
    unmatch: print every test row whose prediction differs from its label.
    num_round: number of boosting rounds.
    num_class: number of classes. When None it is inferred from the largest
      label seen in y_train / y_test (labels are expected to be 0 .. k - 1).

  Returns:
    None. Results are printed / plotted; a failure is printed instead of
    being silently discarded.
  """
  title = 'XGB Classification Model'
  try:
    dtrain = xgb.DMatrix(x_train, label = y_train)
    dtest = xgb.DMatrix(x_test, label = y_test)
    y_true = np.asarray(y_test).ravel()

    if num_class is None:
      # multi:softprob requires labels in [0, num_class), so the largest label fixes the count
      largest_label = max(np.max(np.asarray(y_train)), np.max(y_true))
      num_class = int(largest_label) + 1

    param = {
      'max_depth' : 3,  # the maximum depth of each tree
      'eta' : 0.3,  # training step
      'verbosity' : 0,  # logging mode - quiet (replaces the removed 'silent' flag)
      'objective' : 'multi:softprob',  # error evaluation for multiclass training
      'num_class' : num_class  # the number of classes that exist in this dataset
    }

    model = xgb.train(param, dtrain, num_round)
    preds = model.predict(dtest)
    # preds holds one probability per class for each row; keep the most likely class
    y_predicted = np.argmax(preds, axis = 1)

    precision_s = precision_score(y_true, y_predicted, average = 'macro')
    accuracy_s = accuracy_score(y_true, y_predicted)

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('Precision Score : ', str(round(precision_s * 100, 2)), '%')
    print('Accuracy Score : ', str(round(accuracy_s * 100, 2)), '%')
    print('=' * 100)
    print('Classification Report\n', classification_report(y_true, y_predicted))
    print('=' * 100)

    if save:
      model.dump_model(title + '.txt')

    if plot:
      xgb.plot_importance(model, importance_type = 'gain')

    if tree:
      # at least num_round trees exist, so clamp the second index to stay in range
      later_tree = max(0, min(9, num_round - 1))
      xgb.plot_tree(model, num_trees = 0)
      xgb.plot_tree(model, num_trees = later_tree, rankdir = 'LR')

    if matrix:
      cm = confusion_matrix(y_true, y_predicted)

      print('Confusion Matrix')
      plt.figure(figsize = (5, 3))
      sns.heatmap(cm, annot = True, fmt = 'd')
      plt.xlabel('Predicted')
      plt.ylabel('Real')
      plt.show()

    if unmatch:
      print('=' * 100)
      print('Unmatched Prediction Result')
      print('-' * 100)
      # compare against the test labels the predictions were made for
      for predicted, actual in zip(y_predicted, y_true):
        if predicted != actual:
          print ('Predicted : {0}\nActual: {1}\n'.format(predicted, actual))
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Train for 50 boosting rounds and show the importance chart, tree plots and confusion matrix.
xgb_classier(
  x_train,
  y_train,
  x_test,
  y_test,
  save = False,
  plot = True,
  tree = True,
  matrix = True,
  unmatch = False,
  num_round = 50
)

In [None]:
def xgb_cv(x, y, nfold = 3, num_round = 10, seed = 123):
  """Cross-validate a binary XGBoost classifier and report its accuracy.

  Args:
    x: features.
    y: binary target (the objective is 'binary:logistic').
    nfold: number of cross-validation folds.
    num_round: number of boosting rounds.
    seed: random seed used to build the folds.

  Returns:
    None. The per-round logloss / error table is shown with display(); a
    failure is printed instead of being silently discarded.
  """
  title = 'XGB Classifier Cross Validation'
  try:
    dmatrix = xgb.DMatrix(data = x, label = y)
    params = {
      'objective' : 'binary:logistic',
      'max_depth' : 3,
      'colsample_bytree': 0.5,
      'subsample' : 0.75,
      'gamma' : 0.25,
      'learning_rate' : 0.3,
      'reg_alpha' : 0.01
    }
    # 'error' is the misclassification rate, which is what accuracy is derived
    # from; logloss is kept in the table but is not an error rate.
    cv_results = xgb.cv(
      dtrain = dmatrix,
      params = params,
      nfold = nfold,
      num_boost_round = num_round,
      metrics = ['logloss', 'error'],
      seed = seed
    )

    accuracy = 1 - cv_results['test-error-mean'].iloc[-1]
    print(title)
    print('=' * 100)
    print('Baseline Cross Validation Accuracy :', str(round(accuracy * 100, 2)), '%')
    print('=' * 100)

    display(cv_results)
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# 3-fold cross validation with 10 boosting rounds.
xgb_cv(x, y, nfold = 3, num_round = 10, seed = 123)

In [None]:
def xgb_rscv(x, y, params, n_iter = 5):
  """Randomized hyper-parameter search for an XGBClassifier (3-fold CV).

  Args:
    x: features.
    y: target.
    params: parameter distributions / candidate lists to sample from.
    n_iter: number of parameter settings sampled.

  Returns:
    None. The best parameters and score are printed; a failure is printed
    instead of being silently discarded.
  """
  title = 'XGB Classifier Random Search'
  try:
    search = RandomizedSearchCV(
      estimator = xgb.XGBClassifier(random_state = 123),
      param_distributions = params,
      cv = 3,
      n_iter = n_iter,
      verbose = 2,
      random_state = 123
    )
    search.fit(x, y)

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('Best Parameters Found :', search.best_params_)
    print('Best Accuracy Found :', str(round(search.best_score_ * 100, 2)), '%')
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Candidate values sampled by the randomized XGBClassifier search below.
params = dict(
  max_depth = list(range(3, 12)),
  alpha = [0, 0.001, 0.01, 0.1, 1],
  subsample = [0.5, 0.75, 1],
  learning_rate = np.linspace(0.01, 0.5, 10),
  n_estimators = [10, 25, 40],
)

xgb_rscv(x, y, params, n_iter = 10)

#### 08.01.02 Regression
Regression predicts continuous target variables, i.e. numerical values.

##### 08.01.02.01 Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
def single_lir(x_var, y_var, save = False, graph = False):
  """Fit a one-feature linear regression for every (x, y) column pair.

  Columns are read from the module-level DataFrame ``df``; every model is
  scored on the same rows it was fitted on.

  Args:
    x_var: iterable of feature column names.
    y_var: iterable of target column names.
    save: pickle each fitted model to '<title>.sav'.
    graph: scatter the data and draw the fitted line.

  Returns:
    None. Metrics are printed; a pair that fails is reported and skipped.
  """
  for i in x_var:
    for j in y_var:
      title = 'Linear Regression Model {} & {}'.format(i, j)
      try:
        x = np.array(df[i]).reshape(-1, 1)
        y = np.array(df[j])
        model = LinearRegression()
        model.fit(x, y)
        y_predicted = model.predict(x)
        training_score = model.score(x, y)

        print('=' * 100)
        print(title)
        print('-' * 100)
        print('Training Score : ', str(round(training_score * 100, 2)), '%')
        print('Coefficients : ', model.coef_)
        print('Mean Squared Error: %.2f' % mean_squared_error(y, y_predicted))
        print('Coefficient of Determination: %.2f' % r2_score(y, y_predicted))
        print('-' * 100)

        if save:
          with open(title + '.sav', 'wb') as f:
            pickle.dump(model, f)
          print(title, 'has been saved')
          print('=' * 100)

        if graph:
          plt.scatter(x, y, color = 'red')
          plt.plot(x, y_predicted, color = 'k')
          plt.xlabel(i)
          plt.ylabel(j)
          plt.show()
      except Exception as err:
        # Keep going with the remaining pairs, but say why this one was skipped.
        print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# One linear model per (feature, target) pair, each with its scatter plot.
single_lir(x_var, y_var, save = False, graph = True)

In [None]:
def multi_lir(x_var, y_var, save = False):
  """Fit one linear regression on all x_var columns of ``df`` at once.

  Args:
    x_var: list of feature column names.
    y_var: target column name(s).
    save: pickle the fitted model to 'Multi Linear Regression Model.sav'.

  Returns:
    None. Training-set metrics are printed; a failure is printed instead of
    being silently discarded.
  """
  title = 'Multi Linear Regression Model'
  try:
    x = np.array(df[x_var])
    y = np.array(df[y_var])
    model = LinearRegression()
    model.fit(x, y)
    y_predicted = model.predict(x)
    training_score = model.score(x, y)

    print('=' * 100)
    print(title)
    print('-' * 100)
    print('Training Score : ', str(round(training_score * 100, 2)), '%')
    print('Coefficients : ', model.coef_)
    print('Mean Squared Error: %.2f' % mean_squared_error(y, y_predicted))
    print('Coefficient of Determination: %.2f' % r2_score(y, y_predicted))
    print('-' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Single linear model using every feature column together.
multi_lir(x_var, y_var, save = False)

##### 08.01.02.02 Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
def single_pr(x_var, y_var, degree = 5, save = False, graph = False):
  """Fit a one-feature polynomial regression for every (x, y) column pair.

  Columns are read from the module-level DataFrame ``df``; every model is
  scored on the same rows it was fitted on.

  Args:
    x_var: iterable of feature column names.
    y_var: iterable of target column names.
    degree: degree of the polynomial feature expansion.
    save: pickle each fitted LinearRegression to '<title>.sav'. Only the
      regression is stored, not the PolynomialFeatures transformer.
    graph: scatter the data and draw the fitted curve.

  Returns:
    None. Metrics are printed; a pair that fails is reported and skipped.
  """
  for i in x_var:
    for j in y_var:
      title = 'Polynomial Regression Model {} & {}'.format(i, j)
      try:
        model = LinearRegression()
        poly = PolynomialFeatures(degree = degree, include_bias = True)
        x = np.array(df[i])
        y = np.array(df[j])
        x_poly = poly.fit_transform(x.reshape(-1, 1))
        model.fit(x_poly, y)
        y_predicted = model.predict(x_poly)
        training_score = model.score(x_poly, y)

        print('=' * 100)
        print(title)
        print('-' * 100)
        print('Training Score : ', str(round(training_score * 100, 2)), '%')
        print('Coefficients : ', model.coef_)
        print('Mean Squared Error: %.2f' % mean_squared_error(y, y_predicted))
        print('Coefficient of Determination: %.2f' % r2_score(y, y_predicted))
        print('-' * 100)

        if save:
          with open(title + '.sav', 'wb') as f:
            pickle.dump(model, f)
          print(title, 'has been saved')
          print('=' * 100)

        if graph:
          # draw the curve left to right; in row order the line zig-zags across the plot
          order = np.argsort(x)
          plt.scatter(x, y, color = 'red')
          plt.plot(x[order], y_predicted[order], color = 'k')
          plt.xlabel(i)
          plt.ylabel(j)
          plt.show()
      except Exception as err:
        # Keep going with the remaining pairs, but say why this one was skipped.
        print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# One degree-5 polynomial model per (feature, target) pair, each with its plot.
single_pr(x_var, y_var, degree = 5, save = False, graph = True)

In [None]:
def multi_pr(x_var, y_var, degree = 5, save = False):
  """Fit one polynomial regression on all x_var columns of ``df`` at once.

  Args:
    x_var: list of feature column names.
    y_var: target column name(s).
    degree: degree of the polynomial feature expansion.
    save: pickle the fitted LinearRegression to
      'Multi Polynomial Regression Model.sav'. Only the regression is stored,
      not the PolynomialFeatures transformer.

  Returns:
    None. Training-set metrics are printed; a failure is printed instead of
    being silently discarded.
  """
  title = 'Multi Polynomial Regression Model'
  try:
    model = LinearRegression()
    poly = PolynomialFeatures(degree = degree, include_bias = True)
    x = np.array(df[x_var])
    y = np.array(df[y_var])
    x_poly = poly.fit_transform(x)
    model.fit(x_poly, y)
    y_predicted = model.predict(x_poly)
    training_score = model.score(x_poly, y)

    print('=' * 100)
    print(title)
    print('-' * 100)
    print('Training Score : ', str(round(training_score * 100, 2)), '%')
    print('Coefficients : ', model.coef_)
    print('Mean Squared Error: %.2f' % mean_squared_error(y, y_predicted))
    print('Coefficient of Determination: %.2f' % r2_score(y, y_predicted))
    print('-' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Single degree-5 polynomial model using every feature column together.
multi_pr(x_var, y_var, degree = 5, save = False)

##### 08.01.02.03 Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

In [None]:
def single_lar(x_var, y_var, alpha = 0.01, save = False, graph = False):
  """Fit a one-feature Lasso regression for every (x, y) column pair.

  Columns are read from the module-level DataFrame ``df``; every model is
  scored on the same rows it was fitted on.

  Args:
    x_var: iterable of feature column names.
    y_var: iterable of target column names.
    alpha: Lasso regularisation strength.
    save: pickle each fitted model to '<title>.sav'.
    graph: scatter the data and draw the fitted line.

  Returns:
    None. Metrics are printed; a pair that fails is reported and skipped.
  """
  for i in x_var:
    for j in y_var:
      title = 'Lasso Regression Model {} & {}'.format(i, j)
      try:
        model = Lasso(alpha = alpha, max_iter = 200, tol = 0.1)
        x = np.array(df[i]).reshape(-1, 1)
        y = np.array(df[j])
        model.fit(x, y)
        y_predicted = model.predict(x)
        training_score = model.score(x, y)

        print('=' * 100)
        print(title)
        print('-' * 100)
        print('Training Score : ', str(round(training_score * 100, 2)), '%')
        print('Coefficients : ', model.coef_)
        print('Mean Squared Error: %.2f' % mean_squared_error(y, y_predicted))
        print('Coefficient of Determination: %.2f' % r2_score(y, y_predicted))
        print('-' * 100)

        if save:
          with open(title + '.sav', 'wb') as f:
            pickle.dump(model, f)
          print(title, 'has been saved')
          print('=' * 100)

        if graph:
          plt.scatter(x, y, color = 'red')
          plt.plot(x, y_predicted, color = 'k')
          plt.xlabel(i)
          plt.ylabel(j)
          plt.show()
      except Exception as err:
        # Keep going with the remaining pairs, but say why this one was skipped.
        print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# One Lasso model per (feature, target) pair with a very small penalty (alpha = 0.0001).
single_lar(x_var, y_var, alpha = 0.0001, save = False, graph = True)

In [None]:
def multi_lar(x_var, y_var, alpha = 0.01, save = False):
  """Fit one Lasso regression on all x_var columns of ``df`` at once.

  Args:
    x_var: list of feature column names.
    y_var: target column name(s).
    alpha: Lasso regularisation strength.
    save: pickle the fitted model to 'Multi Lasso Regression Model.sav'.

  Returns:
    None. Training-set metrics are printed; a failure is printed instead of
    being silently discarded.
  """
  title = 'Multi Lasso Regression Model'
  try:
    model = Lasso(alpha = alpha, max_iter = 200, tol = 0.1)
    x = np.array(df[x_var])
    y = np.array(df[y_var])
    model.fit(x, y)
    y_predicted = model.predict(x)
    training_score = model.score(x, y)

    print('=' * 100)
    print(title)
    print('-' * 100)
    print('Training Score : ', str(round(training_score * 100, 2)), '%')
    print('Coefficients : ', model.coef_)
    print('Mean Squared Error: %.2f' % mean_squared_error(y, y_predicted))
    print('Coefficient of Determination: %.2f' % r2_score(y, y_predicted))
    print('-' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Single Lasso model using every feature column together.
multi_lar(x_var, y_var, alpha = 0.01, save = False)

##### 08.01.02.04 Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

In [None]:
def single_rr(x_var, y_var, alpha = 0.01, save = False, graph = False):
  """Fit a one-feature Ridge regression for every (x, y) column pair.

  Columns are read from the module-level DataFrame ``df``; every model is
  scored on the same rows it was fitted on.

  Args:
    x_var: iterable of feature column names.
    y_var: iterable of target column names.
    alpha: Ridge regularisation strength.
    save: pickle each fitted model to '<title>.sav'.
    graph: scatter the data and draw the fitted line.

  Returns:
    None. Metrics are printed; a pair that fails is reported and skipped.
  """
  for i in x_var:
    for j in y_var:
      title = 'Ridge Regression Model {} & {}'.format(i, j)
      try:
        model = Ridge(alpha = alpha, max_iter = 100, tol = 0.1)
        x = np.array(df[i]).reshape(-1, 1)
        y = np.array(df[j])
        model.fit(x, y)
        y_predicted = model.predict(x)
        training_score = model.score(x, y)

        print('=' * 100)
        print(title)
        print('-' * 100)
        print('Training Score : ', str(round(training_score * 100, 2)), '%')
        print('Coefficients : ', model.coef_)
        print('Mean Squared Error: %.2f' % mean_squared_error(y, y_predicted))
        print('Coefficient of Determination: %.2f' % r2_score(y, y_predicted))
        print('-' * 100)

        if save:
          with open(title + '.sav', 'wb') as f:
            pickle.dump(model, f)
          print(title, 'has been saved')
          print('=' * 100)

        if graph:
          plt.scatter(x, y, color = 'red')
          plt.plot(x, y_predicted, color = 'k')
          plt.xlabel(i)
          plt.ylabel(j)
          plt.show()
      except Exception as err:
        # Keep going with the remaining pairs, but say why this one was skipped.
        print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# One Ridge model per (feature, target) pair with a very small penalty (alpha = 0.0001).
single_rr(x_var, y_var, alpha = 0.0001, save = False, graph = True)

In [None]:
def multi_rr(x_var, y_var, alpha = 0.01, save = False):
  """Fit one Ridge regression on all x_var columns of ``df`` at once.

  Args:
    x_var: list of feature column names.
    y_var: target column name(s).
    alpha: Ridge regularisation strength.
    save: pickle the fitted model to 'Multi Ridge Regression Model.sav'.

  Returns:
    None. Training-set metrics are printed; a failure is printed instead of
    being silently discarded.
  """
  title = 'Multi Ridge Regression Model'
  try:
    model = Ridge(alpha = alpha, max_iter = 100, tol = 0.1)
    x = np.array(df[x_var])
    y = np.array(df[y_var])
    model.fit(x, y)
    y_predicted = model.predict(x)
    training_score = model.score(x, y)

    print('=' * 100)
    print(title)
    print('-' * 100)
    print('Training Score : ', str(round(training_score * 100, 2)), '%')
    print('Coefficients : ', model.coef_)
    print('Mean Squared Error: %.2f' % mean_squared_error(y, y_predicted))
    print('Coefficient of Determination: %.2f' % r2_score(y, y_predicted))
    print('-' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Single Ridge model using every feature column together.
multi_rr(x_var, y_var, alpha = 0.01, save = False)

##### 08.01.02.05 Decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
def single_dtr(x_var, y_var, save = False, graph = False):
  """Fit a one-feature decision tree regressor for every (x, y) column pair.

  Columns are read from the module-level DataFrame ``df``; every model is
  scored on the same rows it was fitted on.

  Args:
    x_var: iterable of feature column names.
    y_var: iterable of target column names.
    save: pickle each fitted model to '<title>.sav'.
    graph: scatter the data and draw the fitted step curve.

  Returns:
    None. Metrics are printed; a pair that fails is reported and skipped.
  """
  for i in x_var:
    for j in y_var:
      title = 'Decision Tree Regressor Model {} & {}'.format(i, j)
      try:
        model = DecisionTreeRegressor(random_state = 0)
        x = np.array(df[i]).reshape(-1, 1)
        y = np.array(df[j])
        model.fit(x, y)
        y_predicted = model.predict(x)
        training_score = model.score(x, y)

        print('=' * 100)
        print(title)
        print('-' * 100)
        print('Training Score : ', str(round(training_score * 100, 2)), '%')
        print('Mean Squared Error: %.2f' % mean_squared_error(y, y_predicted))
        print('Coefficient of Determination: %.2f' % r2_score(y, y_predicted))
        print('-' * 100)

        if save:
          with open(title + '.sav', 'wb') as f:
            pickle.dump(model, f)
          print(title, 'has been saved')
          print('=' * 100)

        if graph:
          # draw the curve left to right; in row order the line zig-zags across the plot
          order = np.argsort(x[:, 0])
          plt.scatter(x, y, color = 'red')
          plt.plot(x[order, 0], y_predicted[order], color = 'k')
          plt.xlabel(i)
          plt.ylabel(j)
          plt.show()
      except Exception as err:
        # Keep going with the remaining pairs, but say why this one was skipped.
        print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# One decision tree per (feature, target) pair, each with its plot.
single_dtr(x_var, y_var, save = False, graph = True)

In [None]:
def multi_dtr(x_var, y_var, alpha = 0.01, save = False):
  """Fit one decision tree regressor on all x_var columns of ``df`` at once.

  Args:
    x_var: list of feature column names.
    y_var: target column name(s).
    alpha: unused by the tree; kept so existing calls keep working.
    save: pickle the fitted model to 'Multi Decision Tree Regressor Model.sav'.

  Returns:
    None. Training-set metrics are printed; a failure is printed instead of
    being silently discarded.
  """
  title = 'Multi Decision Tree Regressor Model'
  try:
    model = DecisionTreeRegressor(random_state = 0)
    x = np.array(df[x_var])
    y = np.array(df[y_var])
    model.fit(x, y)
    y_predicted = model.predict(x)
    training_score = model.score(x, y)

    print('=' * 100)
    print(title)
    print('-' * 100)
    print('Training Score : ', str(round(training_score * 100, 2)), '%')
    print('Mean Squared Error: %.2f' % mean_squared_error(y, y_predicted))
    print('Coefficient of Determination: %.2f' % r2_score(y, y_predicted))
    print('-' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Single decision tree using every feature column together.
# NOTE(review): multi_dtr accepts alpha but never passes it to the tree.
multi_dtr(x_var, y_var, alpha = 0.01, save = False)

##### 08.01.02.06 Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
def single_rfr(x_var, y_var, n_estimators = 10, save = False, graph = False):
  """Fit a one-feature random forest regressor for every (x, y) column pair.

  Columns are read from the module-level DataFrame ``df``; every model is
  scored on the same rows it was fitted on.

  Args:
    x_var: iterable of feature column names.
    y_var: iterable of target column names.
    n_estimators: number of trees in each forest.
    save: pickle each fitted model to '<title>.sav'.
    graph: scatter the data and draw the fitted curve.

  Returns:
    None. Metrics are printed; a pair that fails is reported and skipped.
  """
  for i in x_var:
    for j in y_var:
      title = 'Random Forest Regressor Model {} & {}'.format(i, j)
      try:
        model = RandomForestRegressor(n_estimators = n_estimators, random_state = 0)
        x = np.array(df[i]).reshape(-1, 1)
        y = np.array(df[j])
        model.fit(x, y)
        y_predicted = model.predict(x)
        training_score = model.score(x, y)

        print('=' * 100)
        print(title)
        print('-' * 100)
        print('Training Score : ', str(round(training_score * 100, 2)), '%')
        print('Mean Squared Error: %.2f' % mean_squared_error(y, y_predicted))
        print('Coefficient of Determination: %.2f' % r2_score(y, y_predicted))
        print('-' * 100)

        if save:
          with open(title + '.sav', 'wb') as f:
            pickle.dump(model, f)
          print(title, 'has been saved')
          print('=' * 100)

        if graph:
          # draw the curve left to right; in row order the line zig-zags across the plot
          order = np.argsort(x[:, 0])
          plt.scatter(x, y, color = 'red')
          plt.plot(x[order, 0], y_predicted[order], color = 'k')
          plt.xlabel(i)
          plt.ylabel(j)
          plt.show()
      except Exception as err:
        # Keep going with the remaining pairs, but say why this one was skipped.
        print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# One 10-tree forest per (feature, target) pair, each with its plot.
single_rfr(x_var, y_var, n_estimators = 10, save = False, graph = True)

In [None]:
def multi_rfr(x_var, y_var, n_estimators = 10, save = False):
  """Fit one random forest regressor on all x_var columns of ``df`` at once.

  Args:
    x_var: list of feature column names.
    y_var: target column name(s).
    n_estimators: number of trees in the forest.
    save: pickle the fitted model to 'Multi Random Forest Regressor Model.sav'.

  Returns:
    None. Training-set metrics are printed; a failure is printed instead of
    being silently discarded.
  """
  title = 'Multi Random Forest Regressor Model'
  try:
    model = RandomForestRegressor(n_estimators = n_estimators, random_state = 0)
    x = np.array(df[x_var])
    y = np.array(df[y_var])
    model.fit(x, y)
    y_predicted = model.predict(x)
    training_score = model.score(x, y)

    print('=' * 100)
    print(title)
    print('-' * 100)
    print('Training Score : ', str(round(training_score * 100, 2)), '%')
    print('Mean Squared Error: %.2f' % mean_squared_error(y, y_predicted))
    print('Coefficient of Determination: %.2f' % r2_score(y, y_predicted))
    print('-' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Single 10-tree forest using every feature column together.
multi_rfr(x_var, y_var, n_estimators = 10, save = False)

##### 08.01.02.07 Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate regressors for the grid searches below: each entry pairs an
# estimator ('model') with the grid to search ('params'). An empty grid scores
# the estimator once with its defaults.
model_params = dict(
  linear_regression = dict(
    model = LinearRegression(),
    params = {},
  ),
  lasso_regression = dict(
    model = Lasso(),
    params = dict(
      alpha = [1, 10, 100],
      max_iter = [1, 10, 100],
      tol = [0.1, 0.01],
    ),
  ),
  ridge_regression = dict(
    model = Ridge(),
    params = dict(
      alpha = [1, 10, 100],
      max_iter = [1, 10, 100],
      tol = [0.1, 0.01],
    ),
  ),
  decision_tree_regressor = dict(
    model = DecisionTreeRegressor(),
    params = dict(
      random_state = [0, 42],
    ),
  ),
  random_forest_regressor = dict(
    model = RandomForestRegressor(),
    params = dict(
      n_estimators = [1, 5, 10],
      random_state = [0, 42],
    ),
  ),
)

In [None]:
def single_gsc(x_var, y_var, model_params, cv = 5):
  """Grid-search every candidate model on every single (x, y) column pair.

  Columns are read from the module-level DataFrame ``df``.

  Args:
    x_var: iterable of feature column names.
    y_var: iterable of target column names.
    model_params: mapping of model name to a dict with the keys 'model'
      (the estimator) and 'params' (its parameter grid).
    cv: number of cross-validation folds.

  Returns:
    None. The ranked table (model, best_score, best_params) is shown with
    display(); a failure is printed instead of being silently discarded.
  """
  title = 'Single Feature Grid Search'
  try:
    scores = []

    for i in x_var:
      for j in y_var:
        x = np.array(df[i]).reshape(-1, 1)
        y = np.array(df[j])

        for model_name, mp in model_params.items():
          search = GridSearchCV(mp['model'], mp['params'], cv = cv, return_train_score = False)
          search.fit(x, y)
          scores.append({
              'model': model_name + ' ' + i + ' & ' + j,
              'best_score': search.best_score_,
              'best_params': search.best_params_
          })

    tune_df = pd.DataFrame(scores, columns = ['model', 'best_score', 'best_params'])
    tune_df = tune_df.sort_values('best_score', ascending = False).reset_index(drop = True)

    display(tune_df)
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Grid-search each candidate regressor on each single (feature, target) pair.
single_gsc(x_var, y_var, model_params, cv = 2)

In [None]:
def multi_gsc(x_var, y_var, model_params, cv = 5):
  """Grid-search every candidate model on all x_var columns of ``df`` at once.

  Args:
    x_var: list of feature column names.
    y_var: target column name(s).
    model_params: mapping of model name to a dict with the keys 'model'
      (the estimator) and 'params' (its parameter grid).
    cv: number of cross-validation folds.

  Returns:
    None. The ranked table (model, best_score, best_params) is shown with
    display(); a failure is printed instead of being silently discarded.
  """
  title = 'Multi Feature Grid Search'
  try:
    scores = []

    x = np.array(df[x_var])
    y = np.array(df[y_var])

    for model_name, mp in model_params.items():
      # honour the caller's fold count instead of a fixed 5
      search = GridSearchCV(mp['model'], mp['params'], cv = cv, return_train_score = False)
      search.fit(x, y)
      scores.append({
          # all features are used together, so the model name alone labels the row
          'model': model_name,
          'best_score': search.best_score_,
          'best_params': search.best_params_
      })

    tune_df = pd.DataFrame(scores, columns = ['model', 'best_score', 'best_params'])
    tune_df = tune_df.sort_values('best_score', ascending = False).reset_index(drop = True)

    display(tune_df)
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Grid-search each candidate regressor using every feature column together.
multi_gsc(x_var, y_var, model_params, cv = 2)

##### 08.01.02.08 Xgboost Regression

In [None]:
def xgb_single_lir(x_var, y_var, save = False, n_estimators = 10, tree = False):
  """Fit a one-feature XGBRegressor for every (x, y) column pair.

  Columns are read from the module-level DataFrame ``df``; every model is
  scored on the same rows it was fitted on.

  Args:
    x_var: iterable of feature column names.
    y_var: iterable of target column names.
    save: pickle each fitted model to '<title>.sav'.
    n_estimators: number of boosted trees.
    tree: plot the first tree, a later tree (index 9, or the last tree when
      fewer than 10 were trained) and the feature importance.

  Returns:
    None. Metrics are printed; a pair that fails is reported and skipped.
  """
  for i in x_var:
    for j in y_var:
      title = 'XGB Regression Model {} & {}'.format(i, j)
      try:
        x = np.array(df[i]).reshape(-1, 1)
        y = np.array(df[j])
        # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective
        model = xgb.XGBRegressor(objective = 'reg:squarederror', n_estimators = n_estimators, random_state = 123)
        model.fit(x, y)
        y_predicted = model.predict(x)

        print('=' * 100)
        print(title)
        print('-' * 100)
        print('Mean Squared Error: %.2f' % mean_squared_error(y, y_predicted))
        print('Coefficient of Determination: %.2f' % r2_score(y, y_predicted))
        print('-' * 100)

        if save:
          with open(title + '.sav', 'wb') as f:
            pickle.dump(model, f)
          print(title, 'has been saved')
          print('=' * 100)

        if tree:
          # clamp the second index so small models do not request a missing tree
          later_tree = max(0, min(9, n_estimators - 1))
          xgb.plot_tree(model, num_trees = 0)
          xgb.plot_tree(model, num_trees = later_tree, rankdir = 'LR')
          xgb.plot_importance(model)
          plt.show()
      except Exception as err:
        # Keep going with the remaining pairs, but say why this one was skipped.
        print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# One 10-tree XGBRegressor per (feature, target) pair, without tree plots.
xgb_single_lir(x_var, y_var, save = False, n_estimators = 10, tree = False)

In [None]:
def xgb_regressor(x_var, y_var, save = False, num_round = 5):
  """Fit a one-feature linear XGBoost booster for every (x, y) column pair.

  Columns are read from the module-level DataFrame ``df``; every model is
  scored on the same rows it was fitted on.

  Args:
    x_var: iterable of feature column names.
    y_var: iterable of target column names.
    save: pickle each trained booster to '<title>.sav'.
    num_round: number of boosting rounds.

  Returns:
    None. Metrics are printed; a pair that fails is reported and skipped.
  """
  for i in x_var:
    for j in y_var:
      title = 'XGB Regressor Model {} & {}'.format(i, j)
      try:
        x = np.array(df[i]).reshape(-1, 1)
        y = np.array(df[j])
        dmatrix = xgb.DMatrix(data = x, label = y)
        params = {
          'booster' : 'gblinear',
          # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective
          'objective' : 'reg:squarederror'
        }
        model = xgb.train(params = params, dtrain = dmatrix, num_boost_round = num_round)
        # Regression output is already one value per row; taking argmax of each
        # scalar (as a classifier would) turns every prediction into 0.
        y_predicted = model.predict(dmatrix)

        print('=' * 100)
        print(title)
        print('-' * 100)
        print('Mean Squared Error: %.2f' % mean_squared_error(y, y_predicted))
        print('Coefficient of Determination: %.2f' % r2_score(y, y_predicted))
        print('-' * 100)

        if save:
          with open(title + '.sav', 'wb') as f:
            pickle.dump(model, f)
          print(title, 'has been saved')
          print('=' * 100)
      except Exception as err:
        # Keep going with the remaining pairs, but say why this one was skipped.
        print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# One linear booster per (feature, target) pair, trained for 5 rounds.
xgb_regressor(x_var, y_var, save = False, num_round = 5)

In [None]:
def xgb_regcv_v1(x, y, nfold = 3, num_round = 10, seed = 123):
  """Cross-validate an XGBoost regressor and display the per-round RMSE.

  Args:
    x: features.
    y: numeric target.
    nfold: number of cross-validation folds.
    num_round: number of boosting rounds.
    seed: random seed used to build the folds.

  Returns:
    None. The result table is shown with display(); a failure is printed
    instead of being silently discarded.
  """
  title = 'XGB Regressor Cross Validation v1'
  try:
    dmatrix = xgb.DMatrix(data = x, label = y)
    params = {
      # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective
      'objective' : 'reg:squarederror',
      'max_depth' : 3,
      'colsample_bytree': 0.5,
      'subsample' : 0.75,
      'gamma' : 0.25,
      'learning_rate' : 0.3,
      'reg_alpha' : 0.01,
    }
    # seed comes from the seed argument (it used to be fed num_round by mistake)
    cv_results = xgb.cv(
      dtrain = dmatrix,
      params = params,
      nfold = nfold,
      num_boost_round = num_round,
      seed = seed,
      metrics = 'rmse'
    )

    print(title)
    print('=' * 100)

    display(cv_results)
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Baseline regression cross validation scored with RMSE.
xgb_regcv_v1(x, y, nfold = 3, num_round = 10, seed = 123)

In [None]:
def xgb_regcv_v2(x, y, nfold = 3, num_round = 10, seed = 123):
  """Cross-validate an XGBoost regressor and display the per-round MAE.

  Args:
    x: features.
    y: numeric target.
    nfold: number of cross-validation folds.
    num_round: number of boosting rounds.
    seed: random seed used to build the folds.

  Returns:
    None. The result table is shown with display(); a failure is printed
    instead of being silently discarded.
  """
  title = 'XGB Regressor Cross Validation v2'
  try:
    dmatrix = xgb.DMatrix(data = x, label = y)
    params = {
      # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective
      'objective' : 'reg:squarederror',
      'max_depth' : 3,
      'colsample_bytree': 0.5,
      'subsample' : 0.75,
      'gamma' : 0.25,
      'learning_rate' : 0.3,
      'reg_alpha' : 0.01,
      'verbosity' : 0  # quiet logging (replaces the removed 'silent' flag)
    }
    # seed comes from the seed argument (it used to be fed num_round by mistake)
    cv_results = xgb.cv(
      dtrain = dmatrix,
      params = params,
      nfold = nfold,
      num_boost_round = num_round,
      seed = seed,
      metrics = 'mae'
    )

    print(title)
    print('=' * 100)

    display(cv_results)
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Same cross validation as v1, scored with MAE.
xgb_regcv_v2(x, y, nfold = 3, num_round = 10, seed = 123)

In [None]:
def xgb_regcv_v3(x, y, nfold = 3, num_round = 10, seed = 123):
  """Compare cross-validated RMSE across L2 regularisation strengths.

  Args:
    x: features.
    y: numeric target.
    nfold: number of cross-validation folds.
    num_round: number of boosting rounds.
    seed: random seed used to build the folds.

  Returns:
    None. A table of final-round test RMSE per 'lambda' value (1, 10, 100)
    is printed; a failure is printed instead of being silently discarded.
  """
  title = 'XGB Regressor Cross Validation v3'
  try:
    dmatrix = xgb.DMatrix(data = x, label = y)
    reg_params = [1, 10, 100]
    base_params = {
      # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective
      'objective' : 'reg:squarederror',
      'max_depth' : 3,
      'colsample_bytree': 0.5,
      'subsample' : 0.75,
      'gamma' : 0.25,
      'learning_rate' : 0.3,
      'reg_alpha' : 0.01,
      'verbosity' : 0  # quiet logging (replaces the removed 'silent' flag)
    }

    rmses = []
    for reg in reg_params:
      cv_results = xgb.cv(
        dtrain = dmatrix,
        params = {**base_params, 'lambda' : reg},
        nfold = nfold,  # the caller's fold count is honoured
        num_boost_round = num_round,
        metrics = 'rmse',
        as_pandas = True,
        seed = seed
      )
      # keep the test RMSE reached after the last boosting round
      rmses.append(cv_results['test-rmse-mean'].iloc[-1])

    print(title)
    print('=' * 100)
    print('Best RMSE as a Function of l2 :')
    print(pd.DataFrame(list(zip(reg_params, rmses)), columns = ['l2', 'rmse']))
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Compare RMSE across the L2 regularisation values tried by xgb_regcv_v3.
xgb_regcv_v3(x, y, nfold = 3, num_round = 10, seed = 123)

In [None]:
def xgb_regcv_v4(x, y, nfold = 3, seed = 123):
  """Compare cross-validated RMSE across numbers of boosting rounds.

  Args:
    x: features.
    y: numeric target.
    nfold: number of cross-validation folds.
    seed: random seed used to build the folds.

  Returns:
    None. A table of final-round test RMSE for 5, 10 and 15 boosting rounds
    is printed; a failure is printed instead of being silently discarded.
  """
  title = 'XGB Regressor Cross Validation v4'
  try:
    dmatrix = xgb.DMatrix(data = x, label = y)
    num_rounds = [5, 10, 15]
    params = {
      # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective
      'objective' : 'reg:squarederror',
      'max_depth' : 3,
      'colsample_bytree': 0.5,
      'subsample' : 0.75,
      'gamma' : 0.25,
      'learning_rate' : 0.3,
      'reg_alpha' : 0.01,
      'verbosity' : 0  # quiet logging (replaces the removed 'silent' flag)
    }

    rmses = []
    for boost_rounds in num_rounds:
      cv_results = xgb.cv(
        dtrain = dmatrix,
        params = params,
        nfold = nfold,
        num_boost_round = boost_rounds,
        metrics = 'rmse',
        as_pandas = True,
        seed = seed
      )
      # keep the test RMSE reached after the last boosting round
      rmses.append(cv_results['test-rmse-mean'].iloc[-1])

    print(title)
    print('=' * 100)
    print('Best RMSE on Rounds :')
    print(pd.DataFrame(list(zip(num_rounds, rmses)), columns = ['num_boosting_rounds', 'rmse']))
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Compare RMSE across the boosting-round counts tried by xgb_regcv_v4.
xgb_regcv_v4(x, y, nfold = 3, seed = 123)

In [None]:
def xgb_regcv_v5(x, y, nfold = 3, num_round = 10, seed = 123):
  """Compare cross-validated RMSE across learning rates (eta).

  Args:
    x: features.
    y: numeric target.
    nfold: number of cross-validation folds.
    num_round: number of boosting rounds.
    seed: random seed used to build the folds.

  Returns:
    None. A table of final-round test RMSE per eta value (0.001, 0.01, 0.1)
    is printed; a failure is printed instead of being silently discarded.
  """
  title = 'XGB Regressor Cross Validation v5'
  try:
    dmatrix = xgb.DMatrix(data = x, label = y)
    eta_vals = [0.001, 0.01, 0.1]
    # No fixed 'learning_rate' here: it is an alias of 'eta', and supplying both
    # would leave it unclear which rate a run actually used.
    base_params = {
      # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective
      'objective' : 'reg:squarederror',
      'max_depth' : 3,
      'colsample_bytree': 0.5,
      'subsample' : 0.75,
      'gamma' : 0.25,
      'reg_alpha' : 0.01,
      'verbosity' : 0  # quiet logging (replaces the removed 'silent' flag)
    }

    rmses = []
    for curr_val in eta_vals:
      cv_results = xgb.cv(
        dtrain = dmatrix,
        params = {**base_params, 'eta' : curr_val},
        nfold = nfold,
        num_boost_round = num_round,
        metrics = 'rmse',
        as_pandas = True,
        seed = seed
      )
      # keep the test RMSE reached after the last boosting round
      rmses.append(cv_results['test-rmse-mean'].iloc[-1])

    print(title)
    print('=' * 100)
    print('Best RMSE as a Function of eta :')
    print(pd.DataFrame(list(zip(eta_vals, rmses)), columns = ['eta', 'rmse']))
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Compare RMSE across the eta values tried by xgb_regcv_v5.
xgb_regcv_v5(x, y, nfold = 3, num_round = 10, seed = 123)

In [None]:
def xgb_regcv_v6(x, y, nfold = 3, num_round = 10, seed = 123):
  """Compare cross-validated RMSE across maximum tree depths.

  Args:
    x: features.
    y: numeric target.
    nfold: number of cross-validation folds.
    num_round: number of boosting rounds.
    seed: random seed used to build the folds.

  Returns:
    None. A table of final-round test RMSE per max_depth (2, 5, 10, 20) is
    printed; a failure is printed instead of being silently discarded.
  """
  title = 'XGB Regressor Cross Validation v6'
  try:
    dmatrix = xgb.DMatrix(data = x, label = y)
    max_depths = [2, 5, 10, 20]
    base_params = {
      # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective
      'objective' : 'reg:squarederror',
      'colsample_bytree': 0.5,
      'subsample' : 0.75,
      'gamma' : 0.25,
      'learning_rate' : 0.3,
      'reg_alpha' : 0.01,
      'verbosity' : 0  # quiet logging (replaces the removed 'silent' flag)
    }

    rmses = []
    for curr_val in max_depths:
      cv_results = xgb.cv(
        dtrain = dmatrix,
        params = {**base_params, 'max_depth' : curr_val},
        nfold = nfold,
        num_boost_round = num_round,
        metrics = 'rmse',
        as_pandas = True,
        seed = seed
      )
      # keep the test RMSE reached after the last boosting round
      rmses.append(cv_results['test-rmse-mean'].iloc[-1])

    print(title)
    print('=' * 100)
    print('Best RMSE as a Function of max_depth :')
    print(pd.DataFrame(list(zip(max_depths, rmses)), columns = ['max_depth', 'rmse']))
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Compare RMSE across the max_depth values tried by xgb_regcv_v6.
xgb_regcv_v6(x, y, nfold = 3, num_round = 10, seed = 123)

In [None]:
def xgb_regcv_v7(x, y, nfold = 3, num_round = 10, seed = 123):
  """Compare cross-validated RMSE across colsample_bytree values.

  Each run uses early stopping with a patience of 5 rounds.

  Args:
    x: features.
    y: numeric target.
    nfold: number of cross-validation folds.
    num_round: maximum number of boosting rounds.
    seed: random seed used to build the folds.

  Returns:
    None. A table of the last reported test RMSE per colsample_bytree value
    (0.1, 0.5, 0.8, 1) is printed; a failure is printed instead of being
    silently discarded.
  """
  title = 'XGB Regressor Cross Validation v7'
  try:
    dmatrix = xgb.DMatrix(data = x, label = y)
    colsample_bytree_vals = [0.1, 0.5, 0.8, 1]
    base_params = {
      # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective
      'objective' : 'reg:squarederror',
      'max_depth' : 3,
      'subsample' : 0.75,
      'gamma' : 0.25,
      'learning_rate' : 0.3,
      'reg_alpha' : 0.01,
      'verbosity' : 0  # quiet logging (replaces the removed 'silent' flag)
    }

    rmses = []
    for curr_val in colsample_bytree_vals:
      cv_results = xgb.cv(
        dtrain = dmatrix,
        params = {**base_params, 'colsample_bytree' : curr_val},
        nfold = nfold,
        num_boost_round = num_round,
        early_stopping_rounds = 5,
        metrics = 'rmse',
        as_pandas = True,
        seed = seed
      )
      # keep the test RMSE from the last row of the result table
      rmses.append(cv_results['test-rmse-mean'].iloc[-1])

    print(title)
    print('=' * 100)
    print('Best RMSE as a Function of colsample_bytree :')
    print(pd.DataFrame(list(zip(colsample_bytree_vals, rmses)), columns = ['colsample_bytree', 'rmse']))
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Compare RMSE across the colsample_bytree values tried by xgb_regcv_v7.
xgb_regcv_v7(x, y, nfold = 3, num_round = 10, seed = 123)

In [None]:
def xgb_reg_gscv(x, y, params, cv = 3):
  """Exhaustive grid search over XGBRegressor hyper-parameters.

  Candidates are scored with negative mean squared error.

  Args:
    x: features.
    y: numeric target.
    params: parameter grid (name -> list of candidate values).
    cv: number of cross-validation folds.

  Returns:
    None. The best parameters and the matching RMSE are printed; a failure
    is printed instead of being silently discarded.
  """
  title = 'XGB Regressor Grid Search'
  try:
    search = GridSearchCV(
      estimator = xgb.XGBRegressor(),
      param_grid = params,
      scoring = 'neg_mean_squared_error',
      cv = cv,
      verbose = 1
    )

    search.fit(x, y)

    print(title)
    print('=' * 100)
    print('Best Parameters Found : ', search.best_params_)
    # best_score_ is a negated MSE, so take its magnitude before the square root
    print('Lowest RMSE found : ', np.sqrt(np.abs(search.best_score_)))
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Grid of XGBRegressor settings explored exhaustively by xgb_reg_gscv.
params = dict(
  colsample_bytree = [0.3, 0.7],
  n_estimators = [10, 50],
  max_depth = [1, 5, 10],
  subsample = [0.5, 0.75],
  gamma = [0.5, 0.75],
  learning_rate = [0.1, 0.3],
  reg_alpha = [0.01, 0.05],
)

xgb_reg_gscv(x, y, params, cv = 3)

In [None]:
def xgb_reg_rscv(x, y, params, cv = 3, n_iter = 5, random_state = None):
  """Randomized search over XGBRegressor hyper-parameters.

  Candidates are scored with negative mean squared error.

  Args:
    x: features.
    y: numeric target.
    params: parameter distributions / candidate lists to sample from.
    cv: number of cross-validation folds.
    n_iter: number of parameter settings sampled.
    random_state: seed for the sampling; None (the default) keeps the
      previous unseeded behaviour.

  Returns:
    None. The best parameters and the matching RMSE are printed; a failure
    is printed instead of being silently discarded.
  """
  title = 'XGB Regressor Random Search'
  try:
    search = RandomizedSearchCV(
      estimator = xgb.XGBRegressor(),
      param_distributions = params,
      scoring = 'neg_mean_squared_error',
      cv = cv,
      n_iter = n_iter,
      verbose = 1,
      random_state = random_state
    )

    search.fit(x, y)

    print(title)
    print('=' * 100)
    print('Best Parameters Found : ', search.best_params_)
    # best_score_ is a negated MSE, so take its magnitude before the square root
    print('Lowest RMSE found : ', np.sqrt(np.abs(search.best_score_)))
  except Exception as err:
    # Stay best-effort so the notebook keeps running, but say what went wrong.
    print(title, 'failed :', type(err).__name__, '-', err)

In [None]:
# Candidate XGBRegressor settings sampled by xgb_reg_rscv.
params = dict(
  colsample_bytree = [0.3, 0.7],
  n_estimators = [10, 50],
  max_depth = [1, 5, 10],
  subsample = [0.5, 0.75],
  gamma = [0.5, 0.75],
  learning_rate = [0.1, 0.3],
  reg_alpha = [0.01, 0.05],
)

xgb_reg_rscv(x, y, params, cv = 3, n_iter = 50)

### 08.02 Unsupervised Machine Learning
The algorithm discovers patterns and relationships in unlabeled data.

#### 08.02.01 Clustering
Clustering groups data points into clusters based on their similarity.

##### 08.02.01.01 K-Means Clustering algorithm

Create clusters from the selected variables and specify how many clusters are needed.

In [None]:
from sklearn.cluster import KMeans

2 Variables

In [None]:
def kmeans_2(var_a, var_b, n_clusters = 3, save = False, graph = False):
  """Cluster two columns of the global ``df`` with K-Means and print the inertia.

  Args:
    var_a: Name of the first column of ``df`` (x axis of the plot).
    var_b: Name of the second column of ``df`` (y axis of the plot).
    n_clusters: How many clusters K-Means should create.
    save: When truthy, pickle the fitted model to '<title>.sav' in the
      current working directory.
    graph: When truthy, show a Plotly scatter plot coloured by cluster label.

  Returns:
    None. Results are printed; a failure is reported on stdout instead of
    being raised.
  """
  title = f'K-Means Clustering Model {var_a} & {var_b}'
  try:
    model = KMeans(n_clusters = n_clusters, max_iter = 100, random_state = 42)
    # Work on a copy so adding the label column never writes to a slice of df
    final_df = df[[var_a, var_b]].copy()
    model.fit(final_df)
    final_df['cluster'] = model.labels_

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('K-Means Inertia :', model.inertia_)
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if graph:
      fig = px.scatter(
        final_df,
        x = var_a,
        y = var_b,
        color = 'cluster'
      )
      fig.show()
  except Exception as err:
    # Stay best-effort (no crash in the notebook) but say what went wrong
    print(title, 'failed :', repr(err))

In [None]:
# The two df columns to cluster on; they also become the plot axes
var_a = 'sepal length (cm)'
var_b = 'sepal width (cm)'

kmeans_2(var_a, var_b, n_clusters = 3, save = False, graph = True)

More than 2 Variables

In [None]:
def kmeans(x_var, n_clusters = 3, save = False, graph = False):
  """Cluster several columns of the global ``df`` with K-Means and print the inertia.

  Args:
    x_var: List of column names of ``df`` to cluster on.
    n_clusters: How many clusters K-Means should create.
    save: When truthy, pickle the fitted model to '<title>.sav' in the
      current working directory.
    graph: When truthy, show a Plotly 3D scatter of the first three columns,
      coloured by cluster label.

  Returns:
    None. Results are printed; a failure is reported on stdout instead of
    being raised.
  """
  title = 'K-Means Clustering Model'
  try:
    model = KMeans(n_clusters = n_clusters, max_iter = 100, random_state = 42)
    # Work on a copy so adding the label column never writes to a slice of df
    final_df = df[x_var].copy()
    model.fit(final_df)
    final_df['cluster'] = model.labels_

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('K-Means Inertia :', model.inertia_)
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if graph:
      # Only the first three columns fit on a 3D plot; any others are not drawn
      fig = px.scatter_3d(
        final_df,
        x = final_df.columns[0],
        y = final_df.columns[1],
        z = final_df.columns[2],
        color = 'cluster'
      )
      fig.show()
  except Exception as err:
    # Stay best-effort (no crash in the notebook) but say what went wrong
    print(title, 'failed :', repr(err))

In [None]:
# df columns to cluster on; the 3D plot shows the first three of them
x_var = [
  'sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'
]

kmeans(x_var, n_clusters = 3, save = False, graph = True)

##### 08.02.01.02 Mean-shift algorithm

In [None]:
from sklearn.cluster import MeanShift

In [None]:
def ms_2(var_a, var_b, save = False, graph = False):
  """Cluster two columns of the global ``df`` with Mean Shift and print the clusters found.

  Args:
    var_a: Name of the first column of ``df`` (x axis of the plot).
    var_b: Name of the second column of ``df`` (y axis of the plot).
    save: When truthy, pickle the fitted model to '<title>.sav' in the
      current working directory.
    graph: When truthy, show a Plotly scatter plot coloured by cluster label.

  Returns:
    None. Results are printed; a failure is reported on stdout instead of
    being raised.
  """
  title = f'Mean Shift Model {var_a} & {var_b}'
  try:
    model = MeanShift()
    # Work on a copy so adding the label column never writes to a slice of df
    final_df = df[[var_a, var_b]].copy()
    model.fit(final_df)
    final_df['cluster'] = model.labels_
    clusters = final_df['cluster'].unique()

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('Number of Cluster :', len(clusters))
    print('=' * 100)
    print('Clusters :', clusters)
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if graph:
      fig = px.scatter(
        final_df,
        x = var_a,
        y = var_b,
        color = 'cluster'
      )
      fig.show()
  except Exception as err:
    # Stay best-effort (no crash in the notebook) but say what went wrong
    print(title, 'failed :', repr(err))

In [None]:
# The two df columns to cluster on; they also become the plot axes
var_a = 'sepal length (cm)'
var_b = 'sepal width (cm)'

ms_2(var_a, var_b, save = False, graph = True)

In [None]:
def ms(x_var, save = False, graph = False):
  """Cluster several columns of the global ``df`` with Mean Shift and print the clusters found.

  Args:
    x_var: List of column names of ``df`` to cluster on.
    save: When truthy, pickle the fitted model to '<title>.sav' in the
      current working directory.
    graph: When truthy, show a Plotly 3D scatter of the first three columns,
      coloured by cluster label.

  Returns:
    None. Results are printed; a failure is reported on stdout instead of
    being raised.
  """
  title = 'Mean Shift Model'
  try:
    model = MeanShift()
    # Work on a copy so adding the label column never writes to a slice of df
    final_df = df[x_var].copy()
    model.fit(final_df)
    final_df['cluster'] = model.labels_
    clusters = final_df['cluster'].unique()

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('Number of Cluster :', len(clusters))
    print('=' * 100)
    print('Clusters :', clusters)
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if graph:
      # Only the first three columns fit on a 3D plot; any others are not drawn
      fig = px.scatter_3d(
        final_df,
        x = final_df.columns[0],
        y = final_df.columns[1],
        z = final_df.columns[2],
        color = 'cluster'
      )
      fig.show()
  except Exception as err:
    # Stay best-effort (no crash in the notebook) but say what went wrong
    print(title, 'failed :', repr(err))

In [None]:
# df columns to cluster on; the 3D plot shows the first three of them
x_var = [
  'sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'
]

ms(x_var, save = False, graph = True)

##### 08.02.01.03 DBSCAN Algorithm

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
def dbscan_2(var_a, var_b, eps = 3, min_samples = 5, save = False, graph = False):
  """Cluster two columns of the global ``df`` with DBSCAN and print the clusters found.

  Args:
    var_a: Name of the first column of ``df`` (x axis of the plot).
    var_b: Name of the second column of ``df`` (y axis of the plot).
    eps: Forwarded to ``DBSCAN(eps=...)``.
    min_samples: Forwarded to ``DBSCAN(min_samples=...)``.
    save: When truthy, pickle the fitted model to '<title>.sav' in the
      current working directory.
    graph: When truthy, show a Plotly scatter plot coloured by cluster label.

  Returns:
    None. Results are printed; a failure is reported on stdout instead of
    being raised.
  """
  title = f'DBSCAN Model {var_a} & {var_b}'
  try:
    model = DBSCAN(eps = eps, min_samples = min_samples)
    # Work on a copy so adding the label column never writes to a slice of df
    final_df = df[[var_a, var_b]].copy()
    model.fit(final_df)
    labels = model.labels_
    final_df['cluster'] = labels
    clusters = final_df['cluster'].unique()
    # DBSCAN labels noise points -1; noise is not a cluster, so count it separately
    n_found = int((clusters != -1).sum())
    n_noise = int((labels == -1).sum())

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('Number of Cluster :', n_found)
    print('Noise Points :', n_noise)
    print('=' * 100)
    print('Clusters :', clusters)
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if graph:
      fig = px.scatter(
        final_df,
        x = var_a,
        y = var_b,
        color = 'cluster'
      )
      fig.show()
  except Exception as err:
    # Stay best-effort (no crash in the notebook) but say what went wrong
    print(title, 'failed :', repr(err))

In [None]:
# The two df columns to cluster on; they also become the plot axes
var_a = 'sepal length (cm)'
var_b = 'sepal width (cm)'

dbscan_2(var_a, var_b, eps = 3, min_samples = 20, save = False, graph = True)

In [None]:
def dbscan(var_a, var_b, eps = 3, min_samples = 5, save = False, graph = False, columns = None):
  """Cluster several columns of the global ``df`` with DBSCAN and print the clusters found.

  Args:
    var_a: Used only to build the title / file name.
    var_b: Used only to build the title / file name.
    eps: Forwarded to ``DBSCAN(eps=...)``.
    min_samples: Forwarded to ``DBSCAN(min_samples=...)``.
    save: When truthy, pickle the fitted model to '<title>.sav' in the
      current working directory.
    graph: When truthy, show a Plotly 3D scatter of the first three feature
      columns, coloured by cluster label.
    columns: List of ``df`` column names to cluster on. When None (the
      default) the module-level ``x_var`` list is used, which is what this
      function has always done.

  Returns:
    None. Results are printed; a failure is reported on stdout instead of
    being raised.
  """
  title = f'DBSCAN Model {var_a} & {var_b}'
  try:
    # Historical behaviour: the features come from the global x_var, not var_a / var_b
    feature_cols = x_var if columns is None else columns
    model = DBSCAN(eps = eps, min_samples = min_samples)
    # Work on a copy so adding the label column never writes to a slice of df
    final_df = df[feature_cols].copy()
    model.fit(final_df)
    labels = model.labels_
    final_df['cluster'] = labels
    clusters = final_df['cluster'].unique()
    # DBSCAN labels noise points -1; noise is not a cluster, so count it separately
    n_found = int((clusters != -1).sum())
    n_noise = int((labels == -1).sum())

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('Number of Cluster :', n_found)
    print('Noise Points :', n_noise)
    print('=' * 100)
    print('Clusters :', clusters)
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if graph:
      # Only the first three columns fit on a 3D plot; any others are not drawn
      fig = px.scatter_3d(
        final_df,
        x = final_df.columns[0],
        y = final_df.columns[1],
        z = final_df.columns[2],
        color = 'cluster'
      )
      fig.show()
  except Exception as err:
    # Stay best-effort (no crash in the notebook) but say what went wrong
    print(title, 'failed :', repr(err))

In [None]:
# NOTE: dbscan() reads this module-level x_var for the feature columns;
# var_a / var_b (set in the dbscan_2 cell) only end up in the printed title.
x_var = [
  'sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'
]

dbscan(var_a, var_b, eps = 0.01, min_samples = 20, save = False, graph = True)

##### 08.02.01.04 Principal Component Analysis
Reduces high-dimensional data to fewer components (useful when there are more than 2 x variables).

In [None]:
from sklearn.decomposition import PCA

Reduce Variables to 2 / 2 Dimensional

In [None]:
def kmeans_pca(x_var, n_components = 2, save = False, graph = False, n_clusters = None):
  """Project columns of the global ``df`` with PCA, cluster the projection with K-Means, plot in 2D.

  Args:
    x_var: List of column names of ``df`` to project.
    n_components: Number of PCA components to keep.
    save: When truthy, pickle the fitted K-Means model to '<title>.sav' in
      the current working directory.
    graph: When truthy, show a Plotly scatter of the first two components,
      coloured by cluster label.
    n_clusters: Number of K-Means clusters. When None (the default) it
      equals ``n_components``, which is what this function has always done.

  Returns:
    None. Results are printed; a failure is reported on stdout instead of
    being raised.
  """
  title = 'K-Means & PCA Clustering Model'
  try:
    k = n_components if n_clusters is None else n_clusters
    model = KMeans(n_clusters = k, max_iter = 100, random_state = 42)
    pca = PCA(n_components = n_components)
    components = pca.fit_transform(df[x_var])
    # NOTE(review): `scaler` is a module-level object defined outside this function
    components = scaler.fit_transform(components)
    model.fit(components)
    y = model.labels_

    # Plot the projected data itself. KMeans.transform() returns distances to the
    # cluster centres, which are not principal components.
    df_pca = pd.DataFrame(
      components,
      columns = ['PC{}'.format(i + 1) for i in range(n_components)]
    )

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('K-Means & PCA Inertia :', model.inertia_)
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if graph:
      fig = px.scatter(
        df_pca,
        x = df_pca.columns[0],
        y = df_pca.columns[1],
        color = y
      )
      fig.show()
  except Exception as err:
    # Stay best-effort (no crash in the notebook) but say what went wrong
    print(title, 'failed :', repr(err))

In [None]:
# df columns fed into PCA
x_var = [
  'sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'
]

# n_components = 3 sets both the number of PCA components and the K-Means
# cluster count; the 2D plot shows only the first two output columns.
kmeans_pca(x_var, n_components = 3, save = False, graph = True)

Reduce Variables to 3 / 3 Dimensional

In [None]:
def kmeans_pca_3d(x_var, n_components = 3, save = False, graph = False, n_clusters = None):
  """Project columns of the global ``df`` with PCA, cluster the projection with K-Means, plot in 3D.

  Args:
    x_var: List of column names of ``df`` to project.
    n_components: Number of PCA components to keep (the plot needs at least 3).
    save: When truthy, pickle the fitted K-Means model to '<title>.sav' in
      the current working directory.
    graph: When truthy, show a Plotly 3D scatter of the first three
      components, coloured by cluster label.
    n_clusters: Number of K-Means clusters. When None (the default) it
      equals ``n_components``, which is what this function has always done.

  Returns:
    None. Results are printed; a failure is reported on stdout instead of
    being raised.
  """
  title = 'K-Means & PCA Clustering Model'
  try:
    k = n_components if n_clusters is None else n_clusters
    model = KMeans(n_clusters = k, max_iter = 100, random_state = 42)
    pca = PCA(n_components = n_components)
    components = pca.fit_transform(df[x_var])
    # NOTE(review): `scaler` is a module-level object defined outside this function
    components = scaler.fit_transform(components)
    model.fit(components)
    y = model.labels_

    # Plot the projected data itself. KMeans.transform() returns distances to the
    # cluster centres, which are not principal components.
    df_pca = pd.DataFrame(
      components,
      columns = ['PC{}'.format(i + 1) for i in range(n_components)]
    )

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('K-Means & PCA Inertia :', model.inertia_)
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if graph:
      fig = px.scatter_3d(
        df_pca,
        x = df_pca.columns[0],
        y = df_pca.columns[1],
        z = df_pca.columns[2],
        color = y
      )
      fig.show()
  except Exception as err:
    # Stay best-effort (no crash in the notebook) but say what went wrong
    print(title, 'failed :', repr(err))

In [None]:
# df columns fed into PCA
x_var = [
  'sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'
]

# n_components = 4 equals the number of input columns, so PCA drops no dimension;
# it also sets the K-Means cluster count. The 3D plot shows the first three columns.
kmeans_pca_3d(x_var, n_components = 4, save = False, graph = True)

##### 08.02.01.05 Independent Component Analysis

In [None]:
from sklearn.decomposition import FastICA

In [None]:
def kmeans_ica(x_var, n_components = 2, save = False, graph = False, n_clusters = None):
  """Project columns of the global ``df`` with FastICA, cluster the projection with K-Means, plot in 2D.

  Args:
    x_var: List of column names of ``df`` to project.
    n_components: Number of independent components to extract.
    save: When truthy, pickle the fitted K-Means model to '<title>.sav' in
      the current working directory.
    graph: When truthy, show a Plotly scatter of the first two components,
      coloured by cluster label.
    n_clusters: Number of K-Means clusters. When None (the default) it
      equals ``n_components``, which is what this function has always done.

  Returns:
    None. Results are printed; a failure is reported on stdout instead of
    being raised.
  """
  title = 'K-Means & ICA Clustering Model'
  try:
    k = n_components if n_clusters is None else n_clusters
    model = KMeans(n_clusters = k, max_iter = 100, random_state = 42)
    # NOTE: FastICA gets no random_state here, so components may vary between runs
    ica = FastICA(n_components = n_components)
    components = ica.fit_transform(df[x_var])
    # NOTE(review): `scaler` is a module-level object defined outside this function
    components = scaler.fit_transform(components)
    model.fit(components)
    y = model.labels_

    # Plot the projected data itself. KMeans.transform() returns distances to the
    # cluster centres, which are not independent components.
    df_ica = pd.DataFrame(
      components,
      columns = ['IC{}'.format(i + 1) for i in range(n_components)]
    )

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('K-Means & ICA Inertia :', model.inertia_)
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if graph:
      fig = px.scatter(
        df_ica,
        x = df_ica.columns[0],
        y = df_ica.columns[1],
        color = y
      )
      fig.show()
  except Exception as err:
    # Stay best-effort (no crash in the notebook) but say what went wrong
    print(title, 'failed :', repr(err))

In [None]:
# df columns fed into FastICA
x_var = [
  'sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'
]

# n_components = 3 sets both the number of ICA components and the K-Means
# cluster count; the 2D plot shows only the first two output columns.
kmeans_ica(x_var, n_components = 3, save = False, graph = True)

In [None]:
def kmeans_ica_3d(x_var, n_components = 3, save = False, graph = False, n_clusters = None):
  """Project columns of the global ``df`` with FastICA, cluster the projection with K-Means, plot in 3D.

  Args:
    x_var: List of column names of ``df`` to project.
    n_components: Number of independent components to extract (the plot
      needs at least 3).
    save: When truthy, pickle the fitted K-Means model to '<title>.sav' in
      the current working directory.
    graph: When truthy, show a Plotly 3D scatter of the first three
      components, coloured by cluster label.
    n_clusters: Number of K-Means clusters. When None (the default) it
      equals ``n_components``, which is what this function has always done.

  Returns:
    None. Results are printed; a failure is reported on stdout instead of
    being raised.
  """
  title = 'K-Means & ICA Clustering Model'
  try:
    k = n_components if n_clusters is None else n_clusters
    model = KMeans(n_clusters = k, max_iter = 100, random_state = 42)
    # NOTE: FastICA gets no random_state here, so components may vary between runs
    ica = FastICA(n_components = n_components)
    components = ica.fit_transform(df[x_var])
    # NOTE(review): `scaler` is a module-level object defined outside this function
    components = scaler.fit_transform(components)
    model.fit(components)
    y = model.labels_

    # Plot the projected data itself. KMeans.transform() returns distances to the
    # cluster centres, which are not independent components.
    df_ica = pd.DataFrame(
      components,
      columns = ['IC{}'.format(i + 1) for i in range(n_components)]
    )

    print('=' * 100)
    print(title)
    print('=' * 100)
    print('K-Means & ICA Inertia :', model.inertia_)
    print('=' * 100)

    if save:
      with open(title + '.sav', 'wb') as f:
        pickle.dump(model, f)
      print(title, 'has been saved')
      print('=' * 100)

    if graph:
      fig = px.scatter_3d(
        df_ica,
        x = df_ica.columns[0],
        y = df_ica.columns[1],
        z = df_ica.columns[2],
        color = y
      )
      fig.show()
  except Exception as err:
    # Stay best-effort (no crash in the notebook) but say what went wrong
    print(title, 'failed :', repr(err))

In [None]:
# df columns fed into FastICA
x_var = [
  'sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'
]

# n_components = 4 equals the number of input columns and also sets the
# K-Means cluster count. The 3D plot shows the first three output columns.
kmeans_ica_3d(x_var, n_components = 4, save = False, graph = True)

#### 08.02.02 Association
discovering relationships between items in a dataset

##### 08.02.02.01 Apriori Algorithm

##### 08.02.02.02 Eclat

##### 08.02.02.03 FP-growth Algorithm