<a href="https://colab.research.google.com/github/aymuos/masters-practise-repo/blob/main/TERM2/ML_Lab/Project/industrial-ai-project/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.impute import KNNImputer

warnings.filterwarnings('ignore')

In [3]:
raw_data = pd.read_csv('train.csv')

id: This column likely represents a unique identifier for each patient or observation in the dataset.

N_Days: This column represents the number of days for which the data was collected or observed for each patient.

Drug: This column likely represents the type of drug or treatment that the patient was given.

Age: This column represents the age of the patient.

Sex: This column represents the sex of the patient,
typically coded as 0 for female and 1 for male.

Ascites: This column represents the presence or absence of ascites, which is the accumulation of fluid in the abdominal cavity. It is likely coded as 0 for absent and 1 for present.

Hepatomegaly: This column represents the presence or absence of hepatomegaly, which is the enlargement of the liver. It is likely coded as 0 for absent and 1 for present.

Spiders: This column represents the presence or absence of spider angiomas, which are small blood vessels that appear on the skin. It is likely coded as 0 for absent and 1 for present.

Edema: This column represents the presence or absence of edema, which is the swelling of tissues due to the accumulation of fluid. It is likely coded as 0 for absent and 1 for present.

Bilirubin: This column represents the level of bilirubin in the patient's blood, which is a measure of liver function.

Cholesterol: This column represents the level of cholesterol in the patient's blood.

Albumin: This column represents the level of albumin in the patient's blood, which is a measure of liver function.

Copper: This column represents the level of copper in the patient's blood.

Alk_Phos: This column represents the level of alkaline phosphatase in the patient's blood, which is a measure of liver function.

SGOT: This column represents the level of serum glutamic-oxaloacetic transaminase in the patient's blood, which is a measure of liver function.

Tryglicerides: This column represents the level of triglycerides in the patient's blood.

Platelets: This column represents the number of platelets in the patient's blood.

Prothrombin: This column represents the prothrombin time, which is a measure of the time it takes for the blood to clot.

Stage: This column likely represents the stage or severity of the patient's liver disease.

Status: This column likely represents the outcome or status of the patient, such as alive or dead.


In [4]:
numerical_cols = raw_data.select_dtypes(include=['number']).columns
categorical_cols = raw_data.select_dtypes(exclude=['number']).columns

numerical_cols , categorical_cols

(Index(['id', 'N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
        'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
        'Stage'],
       dtype='object'),
 Index(['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Status'], dtype='object'))

In [5]:
raw_data.isnull().sum()

Unnamed: 0,0
id,0
N_Days,0
Drug,6551
Age,0
Sex,0
Ascites,6541
Hepatomegaly,6550
Spiders,6553
Edema,0
Bilirubin,0


In [6]:
raw_data.describe()

Unnamed: 0,id,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
count,15000.0,15000.0,15000.0,15000.0,6663.0,15000.0,8360.0,8445.0,8444.0,6615.0,14423.0,14982.0,15000.0
mean,7499.5,1960.824867,19293.9764,1.877947,327.944034,3.523881,75.660299,1682.848147,107.009773,109.772789,254.426465,10.620505,3.031533
std,4330.271354,1288.185417,3653.413132,2.744575,177.842672,0.372401,76.446063,1860.896133,50.983101,47.979168,104.762092,0.732634,0.878895
min,0.0,20.0,26.0,0.3,120.0,2.0,1.0,7.0,12.0,33.0,0.3,9.0,1.0
25%,3749.75,1094.25,16467.0,0.6,242.0,3.29,31.0,728.0,71.0,80.0,182.0,10.1,2.0
50%,7499.5,1776.0,19577.0,0.9,282.0,3.58,52.0,1083.0,97.65,99.0,249.0,10.6,3.0
75%,11249.25,2635.0,22388.0,1.8,361.0,3.77,88.0,1716.0,133.3,133.0,312.0,11.0,4.0
max,14999.0,25594.0,40392.0,28.0,2468.0,4.64,1588.0,24101.2,601.35,598.0,4214.0,18.0,4.0


In [17]:
# finding unique values in each columns
def unique_values(df):
    for cols in df.columns:
        print(cols,':',df[cols].nunique())

unique_values(raw_data)

id : 15000
N_Days : 552
Drug : 3
Age : 384
Sex : 2
Ascites : 2
Hepatomegaly : 2
Spiders : 2
Edema : 3
Bilirubin : 116
Cholesterol : 212
Albumin : 164
Copper : 175
Alk_Phos : 382
SGOT : 202
Tryglicerides : 153
Platelets : 262
Prothrombin : 50
Stage : 4
Status : 3


In [8]:
X = raw_data.drop('Status',axis=1)
y = raw_data['Status']

Pre-processing data

In [32]:
processable_data = raw_data.copy(deep=True)
processable_data.shape

(15000, 20)

In [33]:
columns_to_binary_encode = ['Ascites', 'Hepatomegaly', 'Spiders','Edema']

# Perform binary encoding
for column in columns_to_binary_encode:
    # Create a new column with binary values (0 or 1)
    processable_data[column] = processable_data[column].map({'N': 0, 'Y': 1, np.nan: -1 , 'S':2})

    # Optionally, drop the original categorical column
    # processable_data.drop(column, axis=1, inplace=True)

In [34]:
processable_data.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,1702.0,Placebo,18806.0,F,0,0,0,0,0.6,310.0,3.85,58.0,4332.0,43.4,112.0,273.0,10.6,3.0,C
1,1,2157.0,Placebo,22646.0,F,0,1,0,0,0.8,255.0,4.14,15.0,1758.0,106.95,172.0,514.0,10.0,3.0,C
2,2,1831.0,D-penicillamine,20483.0,F,0,0,0,0,0.8,256.0,3.85,20.0,646.0,57.0,84.0,204.0,9.9,3.0,C
3,3,971.0,,22646.0,F,-1,-1,-1,0,3.0,,3.03,,,,,258.0,11.8,4.0,D
4,4,1368.0,,21915.0,F,-1,-1,-1,0,0.8,,3.94,,,,,402.0,10.5,4.0,C


In [35]:
# CHECKING FOR UNIQUE VALUES
unique_values(processable_data)

id : 15000
N_Days : 552
Drug : 3
Age : 384
Sex : 2
Ascites : 3
Hepatomegaly : 3
Spiders : 3
Edema : 3
Bilirubin : 116
Cholesterol : 212
Albumin : 164
Copper : 175
Alk_Phos : 382
SGOT : 202
Tryglicerides : 153
Platelets : 262
Prothrombin : 50
Stage : 4
Status : 3


In [36]:
# performing imputations

# Columns to mean impute
columns_to_impute = ['Bilirubin', 'Cholesterol', 'Albumin', 'SGOT', 'Platelets']

# Calculate the mean for each column
for column in columns_to_impute:
    mean_value = processable_data[column].mode()

    # Impute missing values with the mean
    processable_data[column] = processable_data[column].fillna(mean_value)


In [37]:
processable_data.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,1702.0,Placebo,18806.0,F,0,0,0,0,0.6,310.0,3.85,58.0,4332.0,43.4,112.0,273.0,10.6,3.0,C
1,1,2157.0,Placebo,22646.0,F,0,1,0,0,0.8,255.0,4.14,15.0,1758.0,106.95,172.0,514.0,10.0,3.0,C
2,2,1831.0,D-penicillamine,20483.0,F,0,0,0,0,0.8,256.0,3.85,20.0,646.0,57.0,84.0,204.0,9.9,3.0,C
3,3,971.0,,22646.0,F,-1,-1,-1,0,3.0,,3.03,,,,,258.0,11.8,4.0,D
4,4,1368.0,,21915.0,F,-1,-1,-1,0,0.8,,3.94,,,,,402.0,10.5,4.0,C


In [29]:
# doing KNN imputation for numerical columns

# Create KNN imputer
imputer = KNNImputer(n_neighbors=5)  # You can adjust the number of neighbors

# Perform imputation on numerical columns
processable_data[numerical_cols] = imputer.fit_transform(processable_data[numerical_cols])

# Encode 'Sex' column
processable_data['Sex'] = processable_data['Sex'].map({'M': 0, 'F': 1})

# Encode 'Drug'

In [38]:
# prompt: Find unique values in Drug and encode them

# Encode 'Drug'
processable_data['Drug'] = processable_data['Drug'].astype('category').cat.codes   # this assigns unique numerical code to each distinct category


In [39]:
processable_data.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,1702.0,2,18806.0,F,0,0,0,0,0.6,310.0,3.85,58.0,4332.0,43.4,112.0,273.0,10.6,3.0,C
1,1,2157.0,2,22646.0,F,0,1,0,0,0.8,255.0,4.14,15.0,1758.0,106.95,172.0,514.0,10.0,3.0,C
2,2,1831.0,0,20483.0,F,0,0,0,0,0.8,256.0,3.85,20.0,646.0,57.0,84.0,204.0,9.9,3.0,C
3,3,971.0,-1,22646.0,F,-1,-1,-1,0,3.0,,3.03,,,,,258.0,11.8,4.0,D
4,4,1368.0,-1,21915.0,F,-1,-1,-1,0,0.8,,3.94,,,,,402.0,10.5,4.0,C


In [19]:
unique_values(processable_data)

id : 15000
N_Days : 552
Drug : 3
Age : 384
Sex : 2
Ascites : 3
Hepatomegaly : 3
Spiders : 3
Edema : 3
Bilirubin : 116
Cholesterol : 1697
Albumin : 164
Copper : 1003
Alk_Phos : 4790
SGOT : 4025
Tryglicerides : 664
Platelets : 627
Prothrombin : 63
Stage : 4
Status : 3


In [21]:
processable_data.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0.0,1702.0,Placebo,18806.0,1,0,0,0,0,0.6,310.0,3.85,58.0,4332.0,43.4,112.0,273.0,10.6,3.0,C
1,1.0,2157.0,Placebo,22646.0,1,0,1,0,0,0.8,255.0,4.14,15.0,1758.0,106.95,172.0,514.0,10.0,3.0,C
2,2.0,1831.0,D-penicillamine,20483.0,1,0,0,0,0,0.8,256.0,3.85,20.0,646.0,57.0,84.0,204.0,9.9,3.0,C
3,3.0,971.0,,22646.0,1,-1,-1,-1,0,3.0,293.8,3.03,144.0,1355.6,121.83,100.6,258.0,11.8,4.0,D
4,4.0,1368.0,,21915.0,1,-1,-1,-1,0,0.8,363.4,3.94,61.2,1042.0,116.87,94.4,402.0,10.5,4.0,C
