# Modeling

## Preparation

In [27]:
# Loading Libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import Normalizer, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import causalnex
from causalnex.structure.notears import from_pandas
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
from causalnex.discretiser import Discretiser
from causalnex.structure import DAGRegressor
from causalnex.inference import InferenceEngine
from causalnex.network import BayesianNetwork
from causalnex.network.sklearn import BayesianNetworkClassifier
import os
import sys
from IPython.display import Image
import copy
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

import time
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Loading Dataset

raw_df = pd.read_csv("../data/data.csv")
raw_df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


### Feature Extraction

In [28]:
# Separating features and target
 
features_df = raw_df.drop('diagnosis',axis=1).iloc[:,:-1]
target_df = raw_df['diagnosis']

In [44]:
# random forest checker
def forest_test(features_r, target_r):
    features = StandardScaler().fit_transform(features_r)
    target = LabelEncoder().fit_transform(target_r)

    X_Train, X_Test, Y_Train, Y_Test = train_test_split(features, target, 
                                                        test_size = 0.30, 
                                                        random_state = 11)
    forest = RandomForestClassifier(n_estimators=700)
    _ = forest.fit(X_Train, Y_Train)
    print(f"accuracy score: {forest.score(X_Test, Y_Test)}")

In [45]:
forest_test(features_df, target_df)

accuracy score: 0.9649122807017544


In [33]:
def select_features_RFE(features_r, target_r, num):
    features = StandardScaler().fit_transform(features_r)
    target = LabelEncoder().fit_transform(target_r)
    # Init the transformer
    rfe = RFE(estimator=RandomForestRegressor(), n_features_to_select=num)

    # Fit to the training data
    _ = rfe.fit(features, target)

    # extract features
    new_features = features_r.loc[:, rfe.support_]

    return new_features
        

In [46]:
selected_feat = select_features_RFE(features_df, target_df, 7)
selected_feat.head()

Unnamed: 0,texture_mean,concave points_mean,radius_worst,texture_worst,perimeter_worst,area_worst,concave points_worst
0,10.38,0.1471,25.38,17.33,184.6,2019.0,0.2654
1,17.77,0.07017,24.99,23.41,158.8,1956.0,0.186
2,21.25,0.1279,23.57,25.53,152.5,1709.0,0.243
3,20.38,0.1052,14.91,26.5,98.87,567.7,0.2575
4,14.34,0.1043,22.54,16.67,152.2,1575.0,0.1625


In [47]:
forest_test(selected_feat, target_df)

accuracy score: 0.9532163742690059


### Observation
- As can be seen from the random forest regression test, the accuracy has decreased only by 1%

### Scaling and Normalization

In [57]:
def normalize(df):
  normald = Normalizer()
  normal = pd.DataFrame(normald.fit_transform(df))
  return normal

def scale(df):
  scaler = MinMaxScaler()
  scaled = pd.DataFrame(scaler.fit_transform(df))
  return scaled

def scale_and_normalize(df):
  columns = df.columns.to_list()
  normScaled = normalize(scale(df))
  normScaled.columns = columns
  return normScaled


In [49]:
# features before scaling
selected_feat.head()

Unnamed: 0,texture_mean,concave points_mean,radius_worst,texture_worst,perimeter_worst,area_worst,concave points_worst
0,10.38,0.1471,25.38,17.33,184.6,2019.0,0.2654
1,17.77,0.07017,24.99,23.41,158.8,1956.0,0.186
2,21.25,0.1279,23.57,25.53,152.5,1709.0,0.243
3,20.38,0.1052,14.91,26.5,98.87,567.7,0.2575
4,14.34,0.1043,22.54,16.67,152.2,1575.0,0.1625


In [58]:
# scaling and normalizing
scaled = scale_and_normalize(selected_feat)
scaled.head()

Unnamed: 0,texture_mean,concave points_mean,radius_worst,texture_worst,perimeter_worst,area_worst,concave points_worst
0,0.014559,0.469784,0.398885,0.090938,0.429429,0.2896,0.586032
1,0.219236,0.280512,0.488141,0.244168,0.434184,0.35005,0.514099
2,0.269839,0.439535,0.384704,0.248968,0.351554,0.258948,0.577383
3,0.298195,0.432091,0.205202,0.318928,0.199447,0.077688,0.731259
4,0.139256,0.461043,0.462248,0.110224,0.450867,0.303789,0.496645


['texture_mean',
 'concave points_mean',
 'radius_worst',
 'texture_worst',
 'perimeter_worst',
 'area_worst',
 'concave points_worst']

## Causal Graph Discovery

## Bayesian Network Modeling

## Logistic Regression Modeling