In [None]:
pip install pandas scipy


In [1]:
def reformat_arff_file(input_file, output_file):
    with open(input_file, 'r') as f:
        lines = f.readlines()

    # Find the index where @data starts
    data_start_index = None
    for i, line in enumerate(lines):
        if line.strip().lower() == '@data':
            data_start_index = i
            break

    if data_start_index is None:
        raise ValueError("No @data section found in the ARFF file.")

 
    data_lines = lines[data_start_index + 1:]  # Everything after @data

    #  adding line breaks after every row of data
    formatted_data = []
    for line in data_lines:
        line = line.strip()  # Remove extra spaces
        if line:  # Make sure it's not an empty line
            formatted_data.append(line)

    # Write into a new ARFF file
    with open(output_file, 'w') as f:
        # Write the header (everything before @data)
        for line in lines[:data_start_index + 1]:
            f.write(line)

        # Write the formatted data
        for row in formatted_data:
            f.write(row + '\n')

    print(f"Formatted ARFF file saved to: {output_file}")

# Example usage
input_file = r"C:\Users\gnand\Downloads\chronic+kidney+disease\Chronic_Kidney_Disease\Chronic_Kidney_Disease\chronic_kidney_disease_full_cleaned.arff"
output_file = r"C:\Users\gnand\Downloads\chronic+kidney+disease\Chronic_Kidney_Disease\Chronic_Kidney_Disease\chronic_kidney_disease_full_cleaned_formatted.arff"

reformat_arff_file(input_file, output_file)


Formatted ARFF file saved to: C:\Users\gnand\Downloads\chronic+kidney+disease\Chronic_Kidney_Disease\Chronic_Kidney_Disease\chronic_kidney_disease_full_cleaned_formatted.arff


In [39]:
import pandas as pd
from scipy.io import arff

arff_file = r"C:\Users\gnand\Downloads\chronic+kidney+disease\Chronic_Kidney_Disease\Chronic_Kidney_Disease\chronic_kidney_disease_full_cleaned_formatted.arff"

csv_file = r"C:\Users\gnand\Downloads\chronic_kidney_disease.csv"

data, meta = arff.loadarff(arff_file)

# Convert data to pandas DataFrame
df = pd.DataFrame(data)

# Decoding byte strings to regular strings for all object-type columns
for column in df.select_dtypes([object]).columns:
    df[column] = df[column].str.decode('utf-8')

# Save the DataFrame as a CSV file
df.to_csv(csv_file, index=False)

print(f"ARFF file converted to CSV and saved at: {csv_file}")

ARFF file converted to CSV and saved at: C:\Users\gnand\Downloads\chronic_kidney_disease.csv


In [6]:
#START
input_file = r"C:\Users\gnand\Downloads\chronic+kidney+disease\Chronic_Kidney_Disease\Chronic_Kidney_Disease\chronic_kidney_disease_full_cleaned_formatted.arff"
output_file = r"C:\Users\gnand\Downloads\chronic_kidney_disease.csv"

with open(input_file, 'r') as file:
    lines = file.readlines()

formatted_data = []
current_row = []

# Start processing after the @data section
is_data_section = False

for line in lines:
    line = line.strip()  
    if "@data" in line.lower():  # Identify @data section
        is_data_section = True
        formatted_data.append(line)
        continue

    if is_data_section:
        if line:  # Non-empty line
            current_row.append(line)
            if "ckd" in line.lower() or "notckd" in line.lower(): 
                formatted_data.append(",".join(current_row))
                current_row = []


with open(output_file, 'w') as file:
    file.write("\n".join(formatted_data))

print(f"Reformatted file saved to: {output_file}")


Reformatted file saved to: C:\Users\gnand\Downloads\chronic_kidney_disease.csv


In [7]:
import pandas as pd
import re


input_file = r"C:\Users\gnand\Downloads\chronic+kidney+disease\Chronic_Kidney_Disease\Chronic_Kidney_Disease\chronic_kidney_disease_full.arff"
output_file = r"D:\chronic_kidney_disease_formatted.csv"

# Open the ARFF file and read 
with open(input_file, 'r') as file:
    lines = file.readlines()

headers = []
data_lines = []
data_start = False

# Loop through the lines to extract headers and data
for line in lines:
    # Extract the headers from @attribute lines
    if line.strip().startswith("@attribute"):
        # header name and store in headers list
        header = re.sub(r"@attribute\s+'(.*?)'.*", r'\1', line.strip())
        headers.append(header)
    
    # after @data section
    if "@data" in line:
        data_start = True
        continue  # Skip the @data line
    
    if data_start:
        data_lines.append(line.strip()) 


data_split = [re.split(r',\s*', line) for line in data_lines]
data_split = [row[:25] for row in data_split]  # first 25 columns of each row

df = pd.DataFrame(data_split, columns=headers)


df.to_csv(output_file, index=False)

print(f"CSV file created successfully at {output_file}!")


CSV file created successfully at D:\chronic_kidney_disease_formatted.csv!


In [8]:
import pandas as pd

# Load the CSV file
df = pd.read_csv(r"D:\chronic_kidney_disease_formatted.csv")

# Display the first few rows of the data
print(df)

     age   bp     sg   al   su     rbc        pc         pcc          ba  bgr  \
0     48   80  1.020    1    0       ?    normal  notpresent  notpresent  121   
1      7   50  1.020    4    0       ?    normal  notpresent  notpresent    ?   
2     62   80  1.010    2    3  normal    normal  notpresent  notpresent  423   
3     48   70  1.005    4    0  normal  abnormal     present  notpresent  117   
4     51   80  1.010    2    0  normal    normal  notpresent  notpresent  106   
..   ...  ...    ...  ...  ...     ...       ...         ...         ...  ...   
397   12   80  1.020    0    0  normal    normal  notpresent  notpresent  100   
398   17   60  1.025    0    0  normal    normal  notpresent  notpresent  114   
399   58   80  1.025    0    0  normal    normal  notpresent  notpresent  131   
400  NaN  NaN    NaN  NaN  NaN     NaN       NaN         NaN         NaN  NaN   
401  NaN  NaN    NaN  NaN  NaN     NaN       NaN         NaN         NaN  NaN   

     ...  pcv  wbcc rbcc  h

In [9]:

#REPLACING ? WITH NAN


import pandas as pd

df = pd.read_csv(r"D:\chronic_kidney_disease_formatted.csv", dtype=str)

df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)  # Remove extra spaces
df.replace('?', pd.NA, inplace=True)

output_path = r"D:\chronic_kidney_disease_formatted.csv"
df.to_csv(output_path, index=False)

print(df.head())

  age  bp     sg al su     rbc        pc         pcc          ba   bgr  ...  \
0  48  80  1.020  1  0    <NA>    normal  notpresent  notpresent   121  ...   
1   7  50  1.020  4  0    <NA>    normal  notpresent  notpresent  <NA>  ...   
2  62  80  1.010  2  3  normal    normal  notpresent  notpresent   423  ...   
3  48  70  1.005  4  0  normal  abnormal     present  notpresent   117  ...   
4  51  80  1.010  2  0  normal    normal  notpresent  notpresent   106  ...   

  pcv  wbcc  rbcc  htn   dm cad appet   pe  ane class  
0  44  7800   5.2  yes  yes  no  good   no   no   ckd  
1  38  6000  <NA>   no   no  no  good   no   no   ckd  
2  31  7500  <NA>   no  yes  no  poor   no  yes   ckd  
3  32  6700   3.9  yes   no  no  poor  yes  yes   ckd  
4  35  7300   4.6   no   no  no  good   no   no   ckd  

[5 rows x 25 columns]


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)  # Remove extra spaces


In [10]:
#ENCODING CATEGORICAL VALUES WITH NUMERIC BINARY VALUES

import pandas as pd

df = pd.read_csv(r"D:\chronic_kidney_disease_formatted.csv")

# Mapping for categorical variables
mappings = {
    'rbc': {'normal': 1, 'abnormal': 0},
    'pc': {'normal': 1, 'abnormal': 0},
    'pcc': {'present': 1, 'notpresent': 0},
    'ba': {'present': 1, 'notpresent': 0},
    'htn': {'yes': 1, 'no': 0},
    'dm': {'yes': 1, 'no': 0},
    'cad': {'yes': 1, 'no': 0},
    'pe': {'yes': 1, 'no': 0},
    'ane': {'yes': 1, 'no': 0},
    'appet': {'good': 1, 'poor': 0}
}

for column, mapping in mappings.items():
    if column in df.columns:
        df[column] = df[column].map(mapping)

df.to_csv(r"D:\chronic_kidney_disease_formatted.csv", index=False)

print(df.head())


    age    bp     sg   al   su  rbc   pc  pcc   ba    bgr  ...   pcv    wbcc  \
0  48.0  80.0  1.020  1.0  0.0  NaN  1.0  0.0  0.0  121.0  ...  44.0  7800.0   
1   7.0  50.0  1.020  4.0  0.0  NaN  1.0  0.0  0.0    NaN  ...  38.0  6000.0   
2  62.0  80.0  1.010  2.0  3.0  1.0  1.0  0.0  0.0  423.0  ...  31.0  7500.0   
3  48.0  70.0  1.005  4.0  0.0  1.0  0.0  1.0  0.0  117.0  ...  32.0  6700.0   
4  51.0  80.0  1.010  2.0  0.0  1.0  1.0  0.0  0.0  106.0  ...  35.0  7300.0   

   rbcc  htn   dm  cad  appet   pe  ane  class  
0   5.2  1.0  1.0  0.0    1.0  0.0  0.0    ckd  
1   NaN  0.0  0.0  0.0    1.0  0.0  0.0    ckd  
2   NaN  0.0  1.0  0.0    0.0  0.0  1.0    ckd  
3   3.9  1.0  0.0  0.0    0.0  1.0  1.0    ckd  
4   4.6  0.0  0.0  0.0    1.0  0.0  0.0    ckd  

[5 rows x 25 columns]


In [11]:
import pandas as pd

df = pd.read_csv(r"D:\chronic_kidney_disease_formatted.csv")

mappings = {
    'class':{'ckd':1,'notckd':0}
}

for column, mapping in mappings.items():
    if column in df.columns:
        df[column] = df[column].map(mapping)

df.to_csv(r"D:\chronic_kidney_disease_formatted.csv", index=False)

print(df.head())

    age    bp     sg   al   su  rbc   pc  pcc   ba    bgr  ...   pcv    wbcc  \
0  48.0  80.0  1.020  1.0  0.0  NaN  1.0  0.0  0.0  121.0  ...  44.0  7800.0   
1   7.0  50.0  1.020  4.0  0.0  NaN  1.0  0.0  0.0    NaN  ...  38.0  6000.0   
2  62.0  80.0  1.010  2.0  3.0  1.0  1.0  0.0  0.0  423.0  ...  31.0  7500.0   
3  48.0  70.0  1.005  4.0  0.0  1.0  0.0  1.0  0.0  117.0  ...  32.0  6700.0   
4  51.0  80.0  1.010  2.0  0.0  1.0  1.0  0.0  0.0  106.0  ...  35.0  7300.0   

   rbcc  htn   dm  cad  appet   pe  ane  class  
0   5.2  1.0  1.0  0.0    1.0  0.0  0.0    1.0  
1   NaN  0.0  0.0  0.0    1.0  0.0  0.0    1.0  
2   NaN  0.0  1.0  0.0    0.0  0.0  1.0    1.0  
3   3.9  1.0  0.0  0.0    0.0  1.0  1.0    1.0  
4   4.6  0.0  0.0  0.0    1.0  0.0  0.0    1.0  

[5 rows x 25 columns]


In [12]:
import pandas as pd

df = pd.read_csv(r"D:\chronic_kidney_disease_formatted.csv")

columns_to_fill = [
    'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'pe', 'ane', 'appet'
]

# Replace NaN values with the(mode) in the specified columns
for column in columns_to_fill:
    mode_value = df[column].mode()[0]  
    df[column].fillna(mode_value, inplace=True)  # Replace NaN with mode

df.to_csv(r"D:\chronic_kidney_disease_formatted.csv", index=False)

print(df[columns_to_fill].head())


   rbc   pc  pcc   ba  htn   dm  cad   pe  ane  appet
0  1.0  1.0  0.0  0.0  1.0  1.0  0.0  0.0  0.0    1.0
1  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    1.0
2  1.0  1.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0    0.0
3  1.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0  1.0    0.0
4  1.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    1.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(mode_value, inplace=True)  # Replace NaN with mode


In [1]:
import pandas as pd

df = pd.read_csv(r"D:\chronic_kidney_disease_formatted.csv")

# Replace NaN in numeric columns with their median value
numeric_columns = df.select_dtypes(include=['number']).columns  # Get numeric columns
for column in numeric_columns:
    median_value = df[column].median()  # Calculate median for the column
    df[column].fillna(median_value, inplace=True)  

df.to_csv(r"D:\chronic_kidney_disease_formatted.csv", index=False)

print(df.head())

    age    bp     sg   al   su  rbc   pc  pcc   ba    bgr  ...   pcv    wbcc  \
0  48.0  80.0  1.020  1.0  0.0  1.0  1.0  0.0  0.0  121.0  ...  44.0  7800.0   
1   7.0  50.0  1.020  4.0  0.0  1.0  1.0  0.0  0.0  121.0  ...  38.0  6000.0   
2  62.0  80.0  1.010  2.0  3.0  1.0  1.0  0.0  0.0  423.0  ...  31.0  7500.0   
3  48.0  70.0  1.005  4.0  0.0  1.0  0.0  1.0  0.0  117.0  ...  32.0  6700.0   
4  51.0  80.0  1.010  2.0  0.0  1.0  1.0  0.0  0.0  106.0  ...  35.0  7300.0   

   rbcc  htn   dm  cad  appet   pe  ane  class  
0   5.2  1.0  1.0  0.0    1.0  0.0  0.0    1.0  
1   4.8  0.0  0.0  0.0    1.0  0.0  0.0    1.0  
2   4.8  0.0  1.0  0.0    0.0  0.0  1.0    1.0  
3   3.9  1.0  0.0  0.0    0.0  1.0  1.0    1.0  
4   4.6  0.0  0.0  0.0    1.0  0.0  0.0    1.0  

[5 rows x 25 columns]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values a

In [14]:
!pip install matplotlib




In [27]:
pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install numpy pandas matplotlib scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.1 MB 1.2 MB/s eta 0:00:09
   - -------------------------------------- 0.5/11.1 MB 1.2 MB/s eta 0:00:09
   - -------------------------------------- 0.5/11.1 MB 1.2 MB/s eta 0:00:09
   -- ------------------------------------- 0.8/11.1 MB 548.1 kB/s eta 0:00:19
   -- ------------------------------------- 0.8/11.1 MB 548.1 kB/s eta 0:00:19
   --- 

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [16]:

df = pd.read_csv(r"D:\chronic_kidney_disease_formatted.csv")  # Replace with the actual file name

# Drop non-numeric columns (if any)
df = df.select_dtypes(include=[np.number]).dropna()


In [17]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)


In [18]:

pca = PCA(n_components=24)
principal_components = pca.fit_transform(scaled_data)

explained_variance = pca.explained_variance_ratio_
print("Total Variance Captured:", sum(explained_variance))


Total Variance Captured: 0.9947429041554916


In [27]:
explained_variance = pca.explained_variance_ratio_

In [26]:
import pandas as pd

# Convert PCA-transformed data to DataFrame
pca_df = pd.DataFrame(principal_components, columns=[f'PC{i+1}' for i in range(24)])
y = df['class']

pca_df['class'] = y.values 




In [7]:
#with pca adaboost
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler

df = pd.read_csv(r"D:\chronic_kidney_disease_formatted.csv")

X = df.drop(columns=['class']).values  
y = df['class'].values  

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#  PCA 
pca = PCA(n_components=0.98)  
X_pca = pca.fit_transform(X_scaled)

#  Decision Tree as the weak learner
weak_classifier = DecisionTreeClassifier(max_depth=2, min_samples_split=5, random_state=42)

#(RFE)
rfe = RFE(estimator=weak_classifier, n_features_to_select=10)
X_rfe = rfe.fit_transform(X_pca, y)

#  (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.2, random_state=42, stratify=y)

adaboost_model = AdaBoostClassifier(
    estimator=weak_classifier,  
    n_estimators=100,  
    learning_rate=1.0,  
    random_state=42
)

adaboost_model.fit(X_train, y_train)

y_pred = adaboost_model.predict(X_test)

train_accuracy = accuracy_score(y_train, adaboost_model.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(adaboost_model, X_rfe, y, cv=cv, scoring='accuracy')

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)  
print("Confusion Matrix:\n", conf_matrix)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Training Accuracy: 1.0
Testing Accuracy: 0.9876543209876543
Confusion Matrix:
 [[30  0]
 [ 1 50]]
Precision: 0.98805256869773
Recall: 0.9876543209876543
F1-score: 0.987694397855086


In [8]:
#wihtout pca adaboost
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv(r"D:\chronic_kidney_disease_formatted.csv")

# Separate features and target
X = df.drop(columns=['class']).values  
y = df['class'].values  

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Decision Tree as the weak learner
weak_classifier = DecisionTreeClassifier(max_depth=2, min_samples_split=5, random_state=42)

# Apply RFE (Recursive Feature Elimination) to select 10 most important features
rfe = RFE(estimator=weak_classifier, n_features_to_select=10)
X_rfe = rfe.fit_transform(X_scaled, y)

# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.2, random_state=42, stratify=y)

# Define the AdaBoost model
adaboost_model = AdaBoostClassifier(
    estimator=weak_classifier,  
    n_estimators=100,  
    learning_rate=1.0,  
    random_state=42
)

# Train the AdaBoost model
adaboost_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = adaboost_model.predict(X_test)

# Evaluation metrics
train_accuracy = accuracy_score(y_train, adaboost_model.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Cross-validation with StratifiedKFold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(adaboost_model, X_rfe, y, cv=cv, scoring='accuracy')

# Output evaluation results
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)  
print("Confusion Matrix:\n", conf_matrix)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)



Training Accuracy: 0.9937694704049844
Testing Accuracy: 0.9753086419753086
Confusion Matrix:
 [[29  1]
 [ 1 50]]
Precision: 0.9753086419753086
Recall: 0.9753086419753086
F1-score: 0.9753086419753086


In [10]:
#without pca decision tree
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

df = pd.read_csv(r"D:\chronic_kidney_disease_formatted.csv")

X = df.drop(columns=['class']).values  
y = df['class'].values  

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# (RFE) with 6 features 
base_tree = DecisionTreeClassifier(random_state=42, min_samples_split=8, min_samples_leaf=5, max_depth=7, class_weight="balanced")
rfe = RFE(estimator=base_tree, n_features_to_select=6)  
X_rfe = rfe.fit_transform(X_scaled, y)  

X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.2, random_state=42, stratify=y)
# Train Decision Tree 
decision_tree = DecisionTreeClassifier(random_state=42, min_samples_split=8, min_samples_leaf=5, max_depth=7, class_weight="balanced")
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

# Metrics
train_accuracy = accuracy_score(y_train, decision_tree.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

cv_stratified = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)
stratified_scores = cross_val_score(decision_tree, X_rfe, y, cv=cv_stratified, scoring='accuracy')

print("Training Accuracy:", train_accuracy)  
print("Testing Accuracy:", test_accuracy)   
print("Confusion Matrix:\n", conf_matrix)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)



Training Accuracy: 0.9875389408099688
Testing Accuracy: 0.9753086419753086
Confusion Matrix:
 [[30  0]
 [ 2 49]]
Precision: 0.9768518518518519
Recall: 0.9753086419753086
F1-score: 0.9754599761051372


In [11]:
#with pca decision tree
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Load dataset
df = pd.read_csv(r"D:\chronic_kidney_disease_formatted.csv")

# Separate features and target
X = df.drop(columns=['class']).values  
y = df['class'].values  

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA to reduce dimensionality (e.g., preserve 95% variance)
pca = PCA(n_components=0.95, random_state=42)  # Preserve 95% of the variance
X_pca = pca.fit_transform(X_scaled)

# (RFE) with 6 features on PCA data
base_tree = DecisionTreeClassifier(random_state=42, min_samples_split=8, min_samples_leaf=5, max_depth=7, class_weight="balanced")
rfe = RFE(estimator=base_tree, n_features_to_select=6)  
X_rfe = rfe.fit_transform(X_pca, y)  

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.2, random_state=42, stratify=y)

# Train Decision Tree
decision_tree = DecisionTreeClassifier(random_state=42, min_samples_split=8, min_samples_leaf=5, max_depth=7, class_weight="balanced")
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

# Metrics
train_accuracy = accuracy_score(y_train, decision_tree.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Cross-validation
cv_stratified = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)
stratified_scores = cross_val_score(decision_tree, X_rfe, y, cv=cv_stratified, scoring='accuracy')

# Output
print("Training Accuracy:", train_accuracy)  
print("Testing Accuracy:", test_accuracy)   
print("Confusion Matrix:\n", conf_matrix)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)



Training Accuracy: 0.9937694704049844
Testing Accuracy: 0.9753086419753086
Confusion Matrix:
 [[30  0]
 [ 2 49]]
Precision: 0.9768518518518519
Recall: 0.9753086419753086
F1-score: 0.9754599761051372
