# Modeling

## Preparation

In [2]:
# Loading Libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import Normalizer, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import os
import sys
from IPython.display import Image
import copy
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import recall_score, f1_score, accuracy_score, precision_score
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix as con_mat
from sklearn import metrics
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


import causalnex
from causalnex.structure.notears import from_pandas
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
from causalnex.discretiser import Discretiser
from causalnex.structure import DAGRegressor
from causalnex.inference import InferenceEngine
from causalnex.network import BayesianNetwork
from causalnex.network.sklearn import BayesianNetworkClassifier
from causalnex.discretiser.discretiser_strategy import (
    DecisionTreeSupervisedDiscretiserMethod,
)
from causalnex.network import BayesianNetwork
from causalnex.inference import InferenceEngine
import mlflow
import time
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Put the sibling scripts directory on the import path and build the shared
# helper object used by the rest of this notebook. The import has to come after
# the sys.path change (presumably `utils` lives in ../scripts — confirm).
scripts_dir = os.path.abspath('../scripts')
sys.path.append(scripts_dir)
from utils import Utils

# The argument looks like the log file path handed to the helper.
Util = Utils("../logs/modeling_notebook.logs")

In [None]:
# Load the dataset, keeping every column except the first and the last one.
# NOTE(review): presumably these are an identifier column and an empty trailing
# column — confirm against the CSV.
raw_df = pd.read_csv("../data/data.csv").iloc[:, 1:-1]
raw_df.head()

### Feature Extraction

In [None]:
# Split the frame into the label column and the remaining predictor columns.
target_df = raw_df['diagnosis']
features_df = raw_df.drop(columns='diagnosis')

In [None]:
# Preview the first rows of the feature frame (raw_df without 'diagnosis').
features_df.head()

In [None]:
# Display the target series (the 'diagnosis' column of raw_df).
target_df

In [None]:
# Let's eliminate highly correlated features first.
# Start by showing the correlation matrix of the features.
Util.show_corr(features_df, "correlation matrix of features")

In [None]:
# Remove correlated features with the project helper.
# NOTE(review): 0.9 is presumably the correlation threshold above which a
# feature is dropped — confirm in Utils.remove_correlated.
features_df2 = Util.remove_correlated(features_df, 0.9)
features_df2.head()

In [None]:
# Run the helper's random-forest check on the de-correlated features; this is
# the baseline for the comparison after feature selection below.
# NOTE(review): the exact model and metric are defined in Utils.forest_test.
Util.forest_test(features_df2, target_df)

In [None]:
# Select a subset of features with the helper's RFE routine.
# NOTE(review): 10 is presumably the number of features to keep — confirm in
# Utils.select_features_RFE.
selected_feat = Util.select_features_RFE(features_df2, target_df, 10)
selected_feat.head()

In [None]:
# Repeat the same random-forest check on the selected features so the result
# can be compared with the run on features_df2.
Util.forest_test(selected_feat, target_df)

### Observation
- As can be seen from the two random forest regression tests above, reducing the feature set to the 10 features selected by RFE decreases the accuracy by only 1%.

### Scaling and Normalization

In [None]:
# Scale and normalize the selected features with the project helper.
# NOTE(review): `scaled` is only previewed here; the later cells visible in this
# notebook build on `selected_feat`, not on `scaled` — confirm this is intended.
scaled = Util.scale_and_normalize(selected_feat)
scaled.head()

## Causal Graph Discovery

In [None]:
# Re-attach the label to the selected features as an integer "target" column.
# LabelEncoder assigns codes in sorted label order, so with 'B'/'M' labels
# 1 means Malignant and 0 means Benign.
pure_df = selected_feat.assign(target=LabelEncoder().fit_transform(target_df))
pure_df.head()

In [None]:
# Draw 20/40/60/80% subsamples of pure_df (same seed for every draw) to check
# how stable the learned structure is as the amount of data grows.
factor = len(pure_df)/100  # number of rows corresponding to one percent
pure_20, pure_40, pure_60, pure_80 = (
    pure_df.sample(int(percent*factor), random_state=11)
    for percent in (20, 40, 60, 80)
)

# Report the size of every subset, followed by the full dataset.
for subset_name, subset in (
    ("pure_20", pure_20),
    ("pure_40", pure_40),
    ("pure_60", pure_60),
    ("pure_80", pure_80),
    ("pure_100", pure_df),
):
    print(f"size of {subset_name} : {len(subset)}")

In [None]:
# Generate the ground-truth structure by running NOTEARS (causalnex
# from_pandas) on the full dataset.
# tabu_parent_nodes=["target"] bans "target" from being a parent of any node.
ground_truth = from_pandas(pure_df, tabu_parent_nodes=["target"])

In [None]:
# Learn one structure per fractional dataset, using the same constraint as the
# ground truth ("target" may not be a parent of any node).
sm20, sm40, sm60, sm80 = (
    from_pandas(subset, tabu_parent_nodes=["target"])
    for subset in (pure_20, pure_40, pure_60, pure_80)
)

### Check Stability of Graph

In [None]:
# Plot the ground-truth structure.
# NOTE(review): 0.8 is presumably the edge-weight threshold applied before
# plotting — confirm in Utils.plot_graph.
Util.plot_graph(ground_truth, 0.8)

In [None]:
# Compare the structure learned from the 20% sample with the ground truth, then plot it.
# NOTE(review): the two 0.8 arguments are presumably the thresholds applied to
# each graph before comparing — confirm in Utils.jacc_index.
print(Util.jacc_index(ground_truth,sm20, 0.8, 0.8))
Util.plot_graph(sm20, 0.8)

In [None]:
# Compare the structure learned from the 40% sample with the ground truth, then plot it.
# NOTE(review): the two 0.8 arguments are presumably the thresholds applied to
# each graph before comparing — confirm in Utils.jacc_index.
print(Util.jacc_index(ground_truth,sm40, 0.8, 0.8))
Util.plot_graph(sm40, 0.8)

In [None]:
# Compare the structure learned from the 60% sample with the ground truth, then plot it.
# NOTE(review): the two 0.8 arguments are presumably the thresholds applied to
# each graph before comparing — confirm in Utils.jacc_index.
print(Util.jacc_index(ground_truth,sm60, 0.8, 0.8))
Util.plot_graph(sm60, 0.8)

In [None]:
# Compare the structure learned from the 80% sample with the ground truth, then plot it.
# NOTE(review): the two 0.8 arguments are presumably the thresholds applied to
# each graph before comparing — confirm in Utils.jacc_index.
print(Util.jacc_index(ground_truth,sm80, 0.8, 0.8))
Util.plot_graph(sm80, 0.8)

## Modeling

### Preparing Data and Graph

In [None]:
# Preparing data for logistic regression modelling.
# "Original" variant: all selected features plus a binary target
# (0 for "B", 1 for any other label).
original_x = selected_feat
original_y = target_df.apply(lambda x: 0 if x=="B" else 1)
original_full = original_x.copy()
original_full["target"] = original_y

# "Filtered" variant: the helper is given the thresholded ground-truth graph,
# the features and the target node name; element 0 of its result is used as the
# feature frame and element 1 as the graph.
# NOTE(review): presumably it keeps only the nodes in the blanket of "target" —
# confirm in Utils.filter_by_blanket.
filtered = Util.filter_by_blanket(Util.apply_treshold(ground_truth, 0.8), original_x, "target")
filtered_x = filtered[0]
filtered_y = original_y  # same labels; only the feature set differs
filtered_full = filtered_x.copy()
filtered_full["target"] = filtered_y


# Preparing graphs for Bayesian network modelling.
# The threshold helper is called a second time here instead of reusing the
# graph passed to filter_by_blanket above.
# NOTE(review): keep this order unless the helpers are confirmed not to mutate
# their graph argument.
original_sm = Util.apply_treshold(ground_truth, 0.8)
filtered_sm = filtered[1]

### Bayesian Network Modeling

In [None]:

# Discretise both datasets with the project helper, passing the frame, the list
# of feature column names and the target column name.
# NOTE(review): the discretisation strategy is defined in Utils.data_descretiser.
desc_ori_df = Util.data_descretiser(original_full, original_x.columns.to_list(), "target")
desc_filt_df = Util.data_descretiser(filtered_full, filtered_x.columns.to_list(), "target")

In [None]:
# Hold out 20% of each discretised frame for testing. Both splits use the same
# proportions and the same seed.
split_options = {"train_size": 0.8, "test_size": 0.2, "random_state": 27}
train_o, test_o = train_test_split(desc_ori_df, **split_options)
train_f, test_f = train_test_split(desc_filt_df, **split_options)

In [None]:
# Create one Bayesian network per variant (bn1: original graph, bn2: filtered
# graph). The helper receives the full discretised frame, the training split
# and the structure.
# NOTE(review): presumably node states come from the full frame and the
# probabilities are fitted on the training split — confirm in
# Utils.get_bayesian_net.
bn1 = Util.get_bayesian_net(desc_ori_df, train_o, original_sm)
bn2 = Util.get_bayesian_net(desc_filt_df, train_f, filtered_sm)

In [None]:
# Predict the 'target' node on each held-out split and keep the matching true
# labels for scoring (v1: original graph, v2: filtered graph).
pred_v1 = bn1.predict(test_o, 'target')
true_v1 = test_o['target']

pred_v2 = bn2.predict(test_f, 'target')
true_v2 = test_f['target']

In [None]:
# Report recall, F1, accuracy and precision for the network built on the
# original graph.
for line_template, scorer in (
    ('Recall: {:.4f}', recall_score),
    ('F1: {:.4f} ', f1_score),
    ('Accuracy: {:.4f} ', accuracy_score),
    ('Precision: {:.4f} ', precision_score),
):
    print(line_template.format(scorer(y_true=true_v1, y_pred=pred_v1)))

In [None]:
# Report recall, F1, accuracy and precision for the network built on the
# filtered graph.
for line_template, scorer in (
    ('Recall: {:.4f}', recall_score),
    ('F1: {:.4f} ', f1_score),
    ('Accuracy: {:.4f} ', accuracy_score),
    ('Precision: {:.4f} ', precision_score),
):
    print(line_template.format(scorer(y_true=true_v2, y_pred=pred_v2)))

### Observation
- As can be seen from the results above, the predictive ability of the model is the same
  whether or not the nodes outside the blanket of the target are included.

### Logistic Regression Modeling

#### Model Training

In [None]:
# Training with the original dataset: 80/20 split with a fixed seed, then a
# logistic regression with default settings.
# NOTE(review): the inputs come from `selected_feat` (not the scaled frame) and
# warnings are globally suppressed in the import cell, so a convergence warning
# from the solver would not be visible here — confirm the fit converges.

X_train, X_test, y_train, y_test = train_test_split(original_x, original_y, test_size=0.2, random_state=0)
log_model_o = LogisticRegression()
log_model_o.fit(X_train, y_train)

In [None]:
# Training with the filtered dataset: same split proportions and seed as the
# original-dataset model, so the two models are evaluated the same way.
# NOTE(review): warnings are globally suppressed in the import cell, so a
# convergence warning from the solver would not be visible here.

X_train2, X_test2, y_train2, y_test2 = train_test_split(filtered_x, filtered_y, test_size=0.2, random_state=0)
log_model_f = LogisticRegression()
log_model_f.fit(X_train2, y_train2)

#### Prediction

In [None]:
# Predict on the held-out split of the original dataset and report the model's
# score on that split.
y_pred_o = log_model_o.predict(X_test)
accuracy_o = log_model_o.score(X_test, y_test)
print(f'Accuracy of logistic regression classifier on test set: {accuracy_o:.4f}')

In [None]:
# Predict on the held-out split of the filtered dataset and report the model's
# score on that split.
y_pred_f = log_model_f.predict(X_test2)
accuracy_f = log_model_f.score(X_test2, y_test2)
print(f'Accuracy of logistic regression classifier on test set: {accuracy_f:.4f}')

In [None]:
# Confusion matrix for the original dataset.
# The result is stored under its own name rather than `confusion_matrix`, so the
# sklearn function of that name imported at the top of the notebook is not
# rebound to an array.

conf_matrix_original = con_mat(y_test, y_pred_o)
print(conf_matrix_original)

In [None]:
# Confusion matrix for the filtered dataset.
# The result is stored under its own name rather than `confusion_matrix`, so the
# sklearn function of that name imported at the top of the notebook is not
# rebound to an array.

conf_matrix_filtered = con_mat(y_test2, y_pred_f)
print(conf_matrix_filtered)

In [None]:
# Summarise the original-dataset model's predictions with the project helper.
# NOTE(review): the reported metrics are defined in Utils.get_metrics.
Util.get_metrics(y_true=y_test, y_pred=y_pred_o)

In [None]:
# Summarise the filtered-dataset model's predictions with the project helper.
# NOTE(review): the reported metrics are defined in Utils.get_metrics.
Util.get_metrics(y_true=y_test2, y_pred=y_pred_f)