# DIS 2016 pipeline a Pancreas

# The ICD-10-CM code **C25** refers to:

### **C25 – Malignant neoplasm of pancreas**

This is a general code for **pancreatic cancer**. It includes more specific sub-codes based on the exact part of the pancreas affected:




| **Code**   | **Description**                                                                 |
|------------|----------------------------------------------------------------------------------|
| **C25.0**  | Head of pancreas                                                               |
| **C25.1**  | Body of pancreas                                                               |
| **C25.2**  | Tail of pancreas                                                               |
| **C25.3**  | Pancreatic duct                                                               |
| **C25.4**  | Islets of Langerhans                                                          |
| **C25.7**  | Other parts of pancreas                                                        |
| **C25.8**  | Overlapping lesion of pancreas                                                 |
| **C25.9**  | Pancreas, unspecified                                                          |


In [None]:
# !pip install -U "ray[data,train,tune,serve]"
# !conda remove --name phd --all -y && conda create --name phd python=3.9 -y && conda install pandas numpy openpyxl scikit-learn ipykernel notebook -y && conda -m ipykernel install --user --name indox --display-name "indox" -y 

In [6]:
# pip install modin swifter

In [9]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

import modin.pandas as mpd
import multiprocessing
import swifter
import ray

import warnings
warnings.filterwarnings('ignore')

# check GPU

In [1]:
# import torch

# print("CUDA Available:", torch.cuda.is_available())
# if torch.cuda.is_available():
#     print("Device Name:", torch.cuda.get_device_name(0))
#     print("CUDA Version:", torch.version.cuda)


# Config Ray to use multiple Cpu

In [6]:
!ray stop --force

2025-05-05 18:38:29,753	VINFO scripts.py:1314 -- Killed `C:\Users\llmserver\.conda\envs\indox\Lib\site-packages\ray\core\src\ray\gcs\gcs_server.exe --log_dir=C:\Users\LLMSER~1\AppData\Local\Temp\ray\session_2025-05-05_17-38-57_712273_38512\logs --config_list=eyJvYmplY3Rfc3BpbGxpbmdfY29uZmlnIjogIntcInR5cGVcIjogXCJmaWxlc3lzdGVtXCIsIFwicGFyYW1zXCI6IHtcImRpcmVjdG9yeV9wYXRoXCI6IFwiQzpcXFxcVXNlcnNcXFxcTExNU0VSfjFcXFxcQXBwRGF0YVxcXFxMb2NhbFxcXFxUZW1wXFxcXHJheVxcXFxzZXNzaW9uXzIwMjUtMDUtMDVfMTctMzgtNTdfNzEyMjczXzM4NTEyXCJ9fSIsICJpc19leHRlcm5hbF9zdG9yYWdlX3R5cGVfZnMiOiB0cnVlfQ== --gcs_server_port=6379 --metrics-agent-port=59527 --node-ip-address=192.168.1.100 --session-name=session_2025-05-05_17-38-57_712273_38512 --ray-commit=4883bd5f66086771574a2f4f990effc505f569bc --stdout_filepath=C:\Users\LLMSER~1\AppData\Local\Temp\ray\session_2025-05-05_17-38-57_712273_38512\logs\gcs_server.out --stderr_filepath=C:\Users\LLMSER~1\AppData\Local\Temp\ray\session_2025-05-05_17-38-57_712273_38512\logs\gcs_ser

In [11]:
!ray start --head --port=6380 --num-gpus=1 --node-ip-address=192.168.1.100 --include-dashboard=false --disable-usage-stats

^C


In [1]:
# import ray
# ray.init(address="auto")  # Will connect to 127.0.0.1:6379

In [2]:
# import ray
# import torch



# @ray.remote(num_gpus=1)
# def gpu_task():
#     return {
#         "cuda_available": torch.cuda.is_available(),
#         "device": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None"
#     }

# result = ray.get(gpu_task.remote())
# print("Ray GPU Test Result:", result)


# Connecting to Database and extracting data

In [3]:
# conn_params_dic = {
#     "host"      : "compute-101",
#     "database"  : "nis_database",
#     "user"      : "",
#     "password"  : "",
# }

# conn = database.connect(conn_params_dic)

In [12]:
ls ../data/data_lung_cancer/

combined_died_only_2016_2019.csv


### Change the SQL Query to collect the data that you need.

In [13]:
import pandas as pd

# Read the CSV file into a DataFrame
Lung_cancer = pd.read_csv('../data/data_lung_cancer/combined_died_only_2016_2019.csv')

Lung_cancer.shape

(46995, 399)

In [14]:
# list of all of columns
print(Lung_cancer.columns.tolist())
len(Lung_cancer.columns.tolist())

['age', 'ageday', 'agemonth', 'atype', 'aweekend', 'bwt', 'died', 'dispub04', 'dispuniform', 'disp_x', 'dqtr', 'drg', 'drgver', 'drg_nopoa', 'dshospid', 'dxpoa1', 'dxpoa2', 'dxpoa3', 'dxpoa4', 'dxpoa5', 'dxpoa6', 'dxpoa7', 'dxpoa8', 'dxpoa9', 'dxpoa10', 'dxpoa11', 'dxpoa12', 'dxpoa13', 'dxpoa14', 'dxpoa15', 'dxpoa16', 'dxpoa17', 'dxpoa18', 'dxpoa19', 'dxpoa20', 'dxpoa21', 'dxpoa22', 'dxpoa23', 'dxpoa24', 'dxpoa25', 'dxpoa26', 'dxpoa27', 'dxpoa28', 'dxpoa29', 'dxpoa30', 'dxpoa31', 'dxpoa32', 'dxpoa33', 'dxpoa34', 'dxpoa35', 'dxpoa36', 'dxpoa37', 'dxpoa38', 'dxpoa39', 'dxpoa40', 'dxpoa41', 'dxpoa42', 'dxpoa43', 'dxpoa44', 'dxpoa45', 'dxpoa46', 'dxpoa47', 'dxpoa48', 'dxpoa49', 'dxpoa50', 'dxpoa51', 'dxpoa52', 'dxpoa53', 'dxpoa54', 'dxpoa55', 'dxpoa56', 'dxpoa57', 'dxpoa58', 'dxpoa59', 'dxpoa60', 'dxpoa61', 'dxpoa62', 'dxpoa63', 'dxpoa64', 'dxpoa65', 'dxpoa66', 'dxpoa67', 'dxpoa68', 'dxpoa69', 'dxpoa70', 'dxpoa71', 'dxpoa72', 'dxpoa73', 'dxpoa74', 'dxpoa75', 'dxpoa76', 'dxpoa77', 'dxpoa78'

399

In [19]:
del Lung_cancer["year"]

In [20]:
Lung_cancer["ayear"].value_counts()

ayear
2018.0    11929
2016.0    11881
2017.0    11776
2019.0    11130
2015.0      278
2014.0        1
Name: count, dtype: int64

In [21]:

# Save a CSV file of all Lung_cancer columns
columns_df = pd.DataFrame(Lung_cancer.columns.tolist(), columns=['Column Names'])
columns_df.to_csv('../data/data_lung_cancer/lung_cancer_columns.csv', index=False)
print(f" Column names saved to ../data/lung_cancer_columns.csv \n ({len(Lung_cancer.columns)} columns)")


 Column names saved to ../data/lung_cancer_columns.csv 
 (398 columns)


In [5]:

# Create a copy of the lung cancer data for further processing
Lung_cancer = Lung_cancer.copy()
print(f"Created Cancer_pancreas DataFrame with shape: {Lung_cancer.shape}")
Lung_cancer.shape

Created Cancer_pancreas DataFrame with shape: (28753, 397)


(28753, 397)

# Died or Alive

In [6]:
Lung_cancer["died"].value_counts().rename(index={0: "Alive", 1: "Died"}).rename_axis("Status").reset_index(name="Count")


Unnamed: 0,Status,Count
0,Alive,26345
1,Died,2397
2,-9,11


In [7]:
# Sort by visit date or 'daystoevent' or 'ayear' or 'los' (length of stay)
Lung_cancer = Lung_cancer.sort_values(by=['visitlink', 'daystoevent'], ascending=True)

# Keep first visit only
df_unique = Lung_cancer.drop_duplicates(subset='visitlink', keep='first')

Lung_cancer.shape

(28753, 397)

In [8]:
Lung_cancer["ageday"].value_counts()

ageday
-99    28751
-66        2
Name: count, dtype: int64

In [11]:
len(Lung_cancer["ageday"])

28753

In [9]:
Lung_cancer["ageday"].value_counts()

ageday
-99    28751
-66        2
Name: count, dtype: int64

# Preprocessing
## Cleaning Data

In [15]:
# Define column prefixes
diagnosis_column_prefix = 'i10_dx'
ecause_column_prefix = 'i10_ecause'
prosedures_column_prefix = 'i10_pr'
prday = 'prday'

# Get columns starting with each prefix
diagnosis_columns = list(Lung_cancer.columns[np.where(Lung_cancer.columns.str.startswith(diagnosis_column_prefix))])
ecause_columns = list(Lung_cancer.columns[np.where(Lung_cancer.columns.str.startswith(ecause_column_prefix))])
prosedures_columns = list(Lung_cancer.columns[np.where(Lung_cancer.columns.str.startswith(prosedures_column_prefix))])
prday_columns = list(Lung_cancer.columns[np.where(Lung_cancer.columns.str.startswith(prday))])

# Define excluded columns
excluded_columns = ['hosp_nis','key_nis','hosp_division','year','hosp_nis','dxver','prver']

# Safely drop columns (skip if any list doesn't exist)
columns_to_drop = []
for col_list in [excluded_columns, ecause_columns, prday_columns]:
    try:
        if col_list:  # Check if list exists and is not empty
            columns_to_drop.extend(col_list)
    except NameError:
        pass  # Skip if variable doesn't exist

# Only drop columns if there are any to drop and they exist in the DataFrame
if columns_to_drop:
    existing_columns = [col for col in columns_to_drop if col in Lung_cancer.columns]
    if existing_columns:
        Lung_cancer = Lung_cancer.drop(existing_columns, axis=1)

# Initialize other variables
other_features = []
disease_code = ""

In [16]:
Lung_cancer

Unnamed: 0,age,ageday,agemonth,asource,asource_x,atype,aweekend,bwt,daysburnunit,daysccu,...,payer3_x,pointoforigin_x,pointoforiginub04,zip,i10_birth,i10_delivery,i10_orproc,i10_serviceline,pstate_geo,pstco_geo
181,54,-99,-99,5.0,60.0,1,0,-999,-99,-99,...,,,,,,,,,,
227,57,-99,-99,5.0,60.0,1,0,-999,-99,-99,...,,,,,,,,,,
575,57,-99,-99,5.0,60.0,1,0,-999,-99,6,...,,,,,,,,,,
1353,58,-99,-99,5.0,60.0,1,0,-999,-99,-99,...,,,,,,,,,,
1357,75,-99,-99,5.0,29.0,-9,0,-999,-99,-99,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28717,67,-99,-99,,,3,0,-999,-99,-99,...,,01,1,20747,0.0,0.0,1.0,4.0,MD,24033.0
28718,76,-99,-99,,,3,0,-999,-99,-99,...,,01,1,20902,0.0,0.0,1.0,4.0,MD,24031.0
28750,56,-99,-99,,,1,0,-999,-99,-99,...,,04,4,21207,0.0,0.0,0.0,3.0,MD,24005.0
28751,79,-99,-99,,,1,1,-999,-99,-99,...,,04,4,21653,0.0,0.0,1.0,4.0,MD,24041.0


In [27]:
import numpy as np

print("Shape of dataframe before cleanup: {}".format(Lung_cancer.shape))

# Clean up the extracted data. Also remove the disease code that we are about to predict
for col in Lung_cancer.columns:
    if not (str(col).startswith(diagnosis_column_prefix[:4]) or str(col).startswith('prday') or str(col) in excluded_columns):
        other_features.append(str(col))
    Lung_cancer[col]=Lung_cancer[col].replace(['','invl','-99','-9','invl','incn' , np.nan],['0','0','0','0','0','0' , '0'])
    Lung_cancer[col]=Lung_cancer[col].replace([-9,-99],[0,0])
print("Shape of dataframe after cleanup: {}".format(Lung_cancer.shape))

Shape of dataframe before cleanup: (28753, 322)
Shape of dataframe after cleanup: (28753, 322)


In [28]:
X = Lung_cancer.copy()
X.shape

(28753, 322)

In [29]:

X = X[X.replace([np.inf, -np.inf], np.nan).notnull().all(axis=1)]
print("shape of X" , X.shape)

shape of X (28753, 322)


## Combining columns

In [30]:
X = mpd.DataFrame(X)
X.shape

(28753, 322)

In [31]:
len(diagnosis_columns)

78

In [32]:
X[prosedures_columns]

Unnamed: 0,i10_pr1,i10_pr2,i10_pr3,i10_pr4,i10_pr5,i10_pr6,i10_pr7,i10_pr8,i10_pr9,i10_pr10,...,i10_pr55,i10_pr56,i10_pr57,i10_pr58,i10_pr59,i10_pr60,i10_pr61,i10_pr62,i10_pr63,i10_pr64
181,0BBF3ZX,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
227,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
575,0W9D40Z,0BNP4ZZ,5A1935Z,0W9B40Z,8E0W4CZ,0W9D30Z,0W9930Z,0W9930Z,BW24YZZ,B030YZZ,...,0,0,0,0,0,0,0,0,0,0
1353,0BB38ZX,0BB78ZX,0BJ08ZZ,05HH33Z,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1357,0JR007Z,0JBL0ZZ,04HL33Z,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28717,0BBJ8ZZ,0BNJ4ZZ,07B74ZX,0W9B30Z,0W9B30Z,0W9B30Z,3E02340,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28718,0BTG4ZZ,07B74ZZ,0W9B40Z,8E0W4CZ,0WPBX0Z,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28750,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28751,0QS836Z,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
del prosedures_columns[30]

In [51]:
pwd

'/Users/parsanemati/Library/CloudStorage/OneDrive-FUTUREHOPES/PHD/PHD-Proposal/preprocessing'

In [53]:
from utils import database, pipeline, ICD_Mapper,database

In [54]:
X = pipeline.combine_disease_codes(X,diagnosis_columns,"disease_codes")

In [55]:
X = pipeline.combine_disease_codes(X,prosedures_columns,"prosedures_codes")

In [56]:
del X["bwt"]

In [57]:
X.head()

Unnamed: 0,age,ageday,agemonth,asource,asource_x,atype,aweekend,daysburnunit,daysccu,daysicu,...,pointoforiginub04,zip,i10_birth,i10_delivery,i10_orproc,i10_serviceline,pstate_geo,pstco_geo,disease_codes,prosedures_codes
181,54,0,0,5.0,60.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,"[C3431, J9601, J441, Z6841, J440, J9811, F1721...","[0BBF3ZX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
227,57,0,0,5.0,60.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,"[C3490, A419, J440, J189, C7951, C771, E871, J...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
575,57,0,0,5.0,60.0,1,0,0,6,1,...,0,0,0,0,0,0,0,0,"[C7989, J9601, I314, J189, J910, I82621, J449,...","[0W9D40Z, 0BNP4ZZ, 5A1935Z, 0W9B40Z, 8E0W4CZ, ..."
1353,58,0,0,5.0,60.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,"[C3490, F1120, K7460, N189, B182, E1140, E1151...","[0BB38ZX, 0BB78ZX, 0BJ08ZZ, 05HH33Z, 0, 0, 0, ..."
1357,75,0,0,5.0,29.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,"[T8131XA, C3490, B399, M349, I272, I4891, I350...","[0JR007Z, 0JBL0ZZ, 04HL33Z, 0, 0, 0, 0, 0, 0, ..."


## Map the combined Column to new categories
In this section, we convert the pandas dataframe to modin dataframe which is really similar to pandas, however, modin uses multiple cpu which make the process faster

In [58]:
%%time
X['disease_codes'] = X['disease_codes'].swifter.set_npartitions(4*multiprocessing.cpu_count()).apply(ICD_Mapper.icd10cm_to_ccsr)
X['prosedures_codes'] = X['prosedures_codes'].swifter.set_npartitions(4*multiprocessing.cpu_count()).apply(ICD_Mapper.icd10prc_to_ccsr)

CPU times: user 1.96 s, sys: 335 ms, total: 2.3 s
Wall time: 2min 46s


In [59]:
mlb = MultiLabelBinarizer(sparse_output=True)
print("Shape of dataset before one hot encoding of disease CM : ",X.shape)
X = X.join(
            mpd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(X.pop('disease_codes')),
                index=X.index,
                columns=mlb.classes_))

print("Shape of dataset after on hot encoding of disease CM : ",X.shape)

X = X.join(
            mpd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(X.pop('prosedures_codes')),
                index=X.index,
                columns=mlb.classes_))

X = X[X.replace([np.inf, -np.inf], np.nan).notnull().all(axis=1)]
print("Shape of dataset after one-hot-encoding disease PR : ",X.shape)

Shape of dataset before one hot encoding of disease CM :  (28753, 182)
Shape of dataset after on hot encoding of disease CM :  (28753, 593)
Shape of dataset after one-hot-encoding disease PR :  (28753, 864)


In [60]:
X

Unnamed: 0,age,ageday,agemonth,asource,asource_x,atype,aweekend,daysburnunit,daysccu,daysicu,...,URN002,URN003,URN004,URN005,URN006,URN007,URN008,URN009,URN010,URN012
181,54,0,0,5.0,60.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
227,57,0,0,5.0,60.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
575,57,0,0,5.0,60.0,1,0,0,6,1,...,0,0,0,0,0,0,0,0,0,0
1353,58,0,0,5.0,60.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1357,75,0,0,5.0,29.0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28717,67,0,0,0,0,3,0,0,0,13,...,0,0,0,0,0,0,0,0,0,0
28718,76,0,0,0,0,3,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
28750,56,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28751,79,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
ls ../data/

[34mLung_Cancer_one_hot[m[m/     [31mlung_cancer_columns.csv[m[m*
[31melement of data.xlsx[m[m*    [31mmd_sid_core_C34.zip[m[m*


In [71]:
import zipfile

# Save DataFrame to a ZIP-compressed CSV
output_zip_path = '../data/Lung_Cancer_one_hot/md_sid_one_hot_encoding_core_Lung_Cancer.zip'
csv_filename_inside_zip = 'md_sid_one_hot_encoding_core_Lung_Cancer.csv'

X.to_csv(output_zip_path, index=False, compression=dict(method='zip', archive_name=csv_filename_inside_zip))
print(f"Compressed CSV saved to: {output_zip_path}")


Compressed CSV saved to: ../data/Lung_Cancer_one_hot/md_sid_one_hot_encoding_core_Lung_Cancer.zip


In [67]:
# Convert column names list to DataFrame and save to Excel file
import pandas as pd

# Get column names as list
columns_list = X.columns.tolist()

# Convert to DataFrame (as a single column)
columns_df = pd.DataFrame(columns_list, columns=['Column Names'])

# Save to Excel file
columns_df.to_excel("../data/column_names.xlsx", index=False)

print(f"Column names saved to ../data/column_names.xlsx ({len(columns_list)} columns)")

Column names saved to ../data/column_names.xlsx (864 columns)


In [68]:
X.columns.tolist()

['age',
 'ageday',
 'agemonth',
 'asource',
 'asource_x',
 'atype',
 'aweekend',
 'daysburnunit',
 'daysccu',
 'daysicu',
 'daysnicu',
 'dayspicu',
 'daysshockunit',
 'daystoevent',
 'died',
 'disp_x',
 'dispuniform',
 'dqtr',
 'drg',
 'drg_nopoa',
 'drgver',
 'dshospid',
 'dxpoa1',
 'dxpoa2',
 'dxpoa3',
 'dxpoa4',
 'dxpoa5',
 'dxpoa6',
 'dxpoa7',
 'dxpoa8',
 'dxpoa9',
 'dxpoa10',
 'dxpoa11',
 'dxpoa12',
 'dxpoa13',
 'dxpoa14',
 'dxpoa15',
 'dxpoa16',
 'dxpoa17',
 'dxpoa18',
 'dxpoa19',
 'dxpoa20',
 'dxpoa21',
 'dxpoa22',
 'dxpoa23',
 'dxpoa24',
 'dxpoa25',
 'dxpoa26',
 'dxpoa27',
 'dxpoa28',
 'dxpoa29',
 'dxpoa30',
 'e_poa1',
 'e_poa2',
 'e_poa3',
 'e_poa4',
 'e_poa5',
 'e_poa6',
 'e_poa7',
 'e_poa8',
 'female',
 'hcup_ed',
 'hcup_os',
 'hispanic',
 'hispanic_x',
 'homeless',
 'hospitalunit',
 'hospst',
 'i10_ndx',
 'i10_necause',
 'i10_npr',
 'i10_proctype',
 'key',
 'los',
 'los_x',
 'maritalstatus_x',
 'maritalstatusub04',
 'mdc',
 'mdc_nopoa',
 'mdnum1_r',
 'mdnum2_r',
 'medincstq

# Remove Redundant Features

In [69]:
for i in X.columns:
    if np.count_nonzero(X[i])<5:
        X = X.drop(i, axis=1)
print("Shape of the DataFrame after the feature reduction:",X.shape)

Shape of the DataFrame after the feature reduction: (28753, 747)


In [70]:
np.count_nonzero(X["BLD001"])

1979

# We are not consider below code

# We are not consider below code

# Machine Learning
This is just a simple Machine learning Model. If you want, you could save the dataframe into csv file and load it for machine learning purpose. Check the todo list for making a better machine learning model.

In [38]:
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score,precision_score,recall_score
from sklearn.preprocessing import MinMaxScaler

In [39]:
target = "BLD001"#ICD_Mapper.icd10cm_to_ccsr("J45909")
target

'BLD001'

In [36]:
y = X[target].copy()
X = X.drop(target, axis=1)

In [43]:
scaler = MinMaxScaler()
scaler.fit(X)
X=scaler.transform(X)

In [4]:
models = {"Decision Tree": DecisionTreeClassifier(),
          "Logistic Regression":LogisticRegression(max_iter=1000),  
          "MLP":MLPClassifier(max_iter=1000),
          "Random Forest": RandomForestClassifier()}

NameError: name 'DecisionTreeClassifier' is not defined

In [3]:
def fit_and_score(models, x_train, x_test, y_train, y_test,result,n):
    
    np.random.seed(42)
    
    model_scores={}
    model_report={}
    model_f1={}
    model_precision={}
    model_recall={}
    
    for name,model in models.items():
        model.fit(x_train,y_train)
        y_predict = model.predict(x_test)
        model_scores[name] = model.score(x_test,y_test)
        model_f1[name] = f1_score(y_test,y_predict)
        model_precision[name] = precision_score(y_test,y_predict)
        model_recall[name] = recall_score(y_test,y_predict)

        result = result.append({"Model Name":name,"No features":n,"Accuracy":model_scores[name],"F1 Score":model_f1[name],"Precision":model_precision[name],"Recall":model_recall[name]},ignore_index=True)
        
    return result

In [44]:
start = 5
end = X.shape[1]
step = 100

result = pd.DataFrame(columns=["Model Name","No features","Accuracy","F1 Score","Precision","Recall"])

print("Start Training")
for n_feature in range(start,end,step):
    
    X_t = SelectKBest(chi2,k=n_feature).fit_transform(X,y)
    
    x_sparse = csr_matrix(X_t.astype(float))
    
    X_train, X_test, y_train, y_test = train_test_split(x_sparse, y, test_size=0.2, random_state=42)
    
    result = fit_and_score(models, X_train, X_test, y_train, y_test,result,n_feature)

print("Done.")

Start Training
Done.


In [45]:
result

Unnamed: 0,Model Name,No features,Accuracy,F1 Score,Precision,Recall
0,Decision Tree,5,0.9645,0.027397,1.0,0.013889
1,Logistic Regression,5,0.965,0.078947,0.75,0.041667
2,MLP,5,0.965,0.054054,1.0,0.027778
3,Random Forest,5,0.9645,0.027397,1.0,0.013889
4,Decision Tree,105,0.939,0.152778,0.152778,0.152778
5,Logistic Regression,105,0.965,0.102564,0.666667,0.055556
6,MLP,105,0.9575,0.158416,0.275862,0.111111
7,Random Forest,105,0.9635,0.026667,0.333333,0.013889
8,Decision Tree,205,0.931,0.158537,0.141304,0.180556
9,Logistic Regression,205,0.964,0.076923,0.5,0.041667


#### TO DO:
* Do Grid Search to find best parameter for models
* Modify the code to use 10 fold cross validation
* Create some Deep Neural Network model and add them into model 
    * note: You can use auto-Keras to do this
    * remember auto-keras may not be able to find the best NN, you can contact me for analyzing the model you created.
* After finding the best model Do not transfer it into sparse matrix or dense numpy. just train the dataframe again with the best model on the cluster and save the model. Use the model to analyze the performance of the model with SHAP (https://shap.readthedocs.io/en/latest/index.html).