In [2]:
from src.logger import logging
from dataclasses import dataclass
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from src.exception import CustomException
import sys

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [45]:
@dataclass
class DataIngestionConfig:
    train_data_path:str=os.path.join('artifacts','train.csv')
    test_data_path:str=os.path.join('artifacts','test.csv')
    raw_data_path:str=os.path.join('artifacts','data.csv')

class DataIngestion:
    def __init__(self):
        self.ingestion_config=DataIngestionConfig()
    def initiate_data_ingestion(self):
        logging.info("started data ingestion method")
        try:
            df=pd.read_csv('diabetes.csv')
            logging.info("data read as df")
            os.makedirs(os.path.dirname(self.ingestion_config.train_data_path),exist_ok=True)
            df.to_csv(self.ingestion_config.raw_data_path,index=False,header=True)
            logging.info("train test split initiated")
            train_set,test_set=train_test_split(df,test_size=0.2,random_state=42)
            train_set.to_csv(self.ingestion_config.train_data_path,index=False,header=True)
            test_set.to_csv(self.ingestion_config.test_data_path,index=False,header=True)
            logging.info("ingestion of data is completed")

            return(self.ingestion_config.train_data_path,
            self.ingestion_config.test_data_path)
        except Exception as e:
            raise CustomException(e,sys)


In [46]:
obj=DataIngestion()
train_data,test_data=obj.initiate_data_ingestion()

In [59]:

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
import numpy as np
from sklearn.pipeline import Pipeline
from src.utils import save_object

@dataclass
class DataTransformationConfig:
    preprocessor_obj_file_path=os.path.join('artifacts',"preprocessor.pkl")

class DataTransformation:
    def __init__(self):
        self.data_transformation_config=DataTransformationConfig()
    def get_data_transformer_object(self):
        try:
            impute_scale_columns=['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI', 'DiabetesPedigreeFunction', 'Age']
            scale_only_columns=['Pregnancies',]
            impute_scale_pipe=Pipeline(
                steps=[("replace_zeros_with_nan", FunctionTransformer(lambda X: np.where(X == 0, np.nan, X), validate=False)),
                       ("imputer",SimpleImputer(strategy="median")),
                       ("scaler",StandardScaler())]
            )
            scale_pipe=Pipeline(
                steps=[("scaler",StandardScaler()),]
            )
            logging.info("imputation and scaling done")
            preprocessor=ColumnTransformer(
                [
                    ("impute_scale_pipeline",impute_scale_pipe,impute_scale_columns),
                    ("scale_only_pipeline",scale_pipe,scale_only_columns)

                ]
            )
            return preprocessor
     

        except Exception as e :
            raise CustomException(e,sys)
    def initiate_data_transformation(self,train_path,test_path):
        try:
            train_df=pd.read_csv(train_path)
            test_df=pd.read_csv(test_path)
            logging.info("read train and test data completed")
            logging.info("obtaining preprocessing object")
            preprocessing_obj=self.get_data_transformer_object()
            target_column="Outcome"
            impute_scale_columns=['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI', 'DiabetesPedigreeFunction', 'Age']
            scale_only_columns=['Pregnancies',]
            input_feature_train_df=train_df.drop(columns=[target_column],axis=1)
            target_feature_train_df=train_df[target_column]
            input_feature_test_df=test_df.drop(columns=[target_column],axis=1)
            target_feature_test_df=test_df[target_column]
            logging.info("applying preprocessing object on training dataframe and testing data frame")
            input_feature_train_arr=preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr=preprocessing_obj.fit_transform(input_feature_test_df)
            train_arr=np.c_[
                input_feature_train_arr,np.array(target_feature_train_df)
            ]
            test_arr=np.c_[
                input_feature_test_arr,np.array(target_feature_test_df)
            ]
            logging.info("saving preprocessed objects")
            save_object(file_path=self.data_transformation_config.preprocessor_obj_file_path,
            obj=preprocessing_obj)
            return(train_arr,test_arr,self.data_transformation_config.preprocessor_obj_file_path,)
        except:
            pass

In [60]:
data_transformation=DataTransformation()
train_arr,test_arr,_=data_transformation.initiate_data_transformation(train_data,test_data)

In [61]:
x_train,y_train,x_test,y_test=(train_arr[:,:-1],
                                           train_arr[:,-1],
                                           test_arr[:,:-1],
                                           test_arr[:,-1])

In [62]:

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score as r2 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier


In [63]:
models={"RandomForest":RandomForestClassifier(),
        "KNeighborsClassifier" :KNeighborsClassifier(),
        "DecisionTreeClassifier":DecisionTreeClassifier(),
        "XGBClassifier":XGBClassifier(),
        "GradientBoostingClassifier":GradientBoostingClassifier(),
        "LogisticRegression":LogisticRegression()}

In [64]:
from sklearn.metrics import r2_score ,accuracy_score 

In [67]:
for i in range(len(list(models))):
    model_name = list(models.keys())[i]
    model=list(models.values())[i]
    model.fit(x_train,y_train)
    y_train_pred=model.predict(x_train)
    y_test_pred=model.predict(x_test)
    train_model_score=accuracy_score(y_train,y_train_pred)
    test_model_score=accuracy_score(y_test,y_test_pred)
    print("----")
    print(train_model_score)
    print(test_model_score)
    print(model_name)

----
1.0
0.7597402597402597
RandomForest
----
0.8192182410423453
0.6818181818181818
KNeighborsClassifier
----
1.0
0.7012987012987013
DecisionTreeClassifier
----
1.0
0.7467532467532467
XGBClassifier
----
0.9315960912052117
0.7337662337662337
GradientBoostingClassifier
----
0.7719869706840391
0.7662337662337663
LogisticRegression


In [37]:
x_train

array([[-1.25832567e+00,  0.00000000e+00,  4.22787785e-16, ...,
        -4.90734790e-01, -1.03594038e+00, -5.26396861e-01],
       [-3.27397242e-01,  8.07407436e-01, -5.46930179e-01, ...,
         2.41502991e+00,  1.48710085e+00,  1.58804586e+00],
       [ 5.70283740e-01, -2.17042447e+00, -1.14195109e+00, ...,
         5.49160552e-01, -9.48938958e-01, -8.28460107e-01],
       ...,
       [-6.93119124e-01,  1.13827765e+00,  1.00012419e+00, ...,
         1.98124500e+00,  4.43083787e-01,  1.89010910e+00],
       [ 6.36778628e-01,  0.00000000e+00,  4.22787785e-16, ...,
        -7.84876615e-01, -3.39929007e-01, -1.13052335e+00],
       [ 1.04819527e-01,  1.96545318e+00,  4.22787785e-16, ...,
        -6.15522231e-01, -1.03594038e+00, -1.13052335e+00]])

In [54]:
x_train.shape


(614, 8)

In [56]:
type(y_train)

numpy.ndarray

In [58]:
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Assuming x_train, y_train, x_test, y_test are numpy arrays:
# x_train: numpy array of shape (614, 8)
# y_train: numpy array of shape (614,)
# x_test: numpy array for test set
# y_test: numpy array for test set

combination_sizes = [2,3,4, 5, 6, 7,8]  # Sizes of feature combinations
n_features = x_train.shape[1]  # Get the number of features (8 in this case)

combinations_list = []
for size in combination_sizes:
    combinations_list += list(combinations(range(n_features), size))  # Feature indices

# Loop through each feature combination
for combo in combinations_list:
    X_combo = x_train[:, list(combo)]  # Select columns in the combination using indices
    
    # Initialize the logistic regression model
    model = LogisticRegression(max_iter=1000)  # max_iter increased for convergence
    
    # Train the model
    model.fit(X_combo, y_train)
    
    # Select the same feature combination for the test set
    X_test_combo = x_test[:, list(combo)]  # Apply the same feature combination on x_test
    
    # Predict on the test set
    y_pred = model.predict(X_test_combo)
    
    # Evaluate the model (accuracy)
    accuracy = accuracy_score(y_test, y_pred)
    
    # Print the combination and corresponding accuracy
    print(f"Combination: {combo}, Accuracy: {accuracy:.4f}")


Combination: (0, 1), Accuracy: 0.7532
Combination: (0, 2), Accuracy: 0.7338
Combination: (0, 3), Accuracy: 0.7597
Combination: (0, 4), Accuracy: 0.7662
Combination: (0, 5), Accuracy: 0.7597
Combination: (0, 6), Accuracy: 0.7468
Combination: (0, 7), Accuracy: 0.7792
Combination: (1, 2), Accuracy: 0.6558
Combination: (1, 3), Accuracy: 0.6688
Combination: (1, 4), Accuracy: 0.6299
Combination: (1, 5), Accuracy: 0.6753
Combination: (1, 6), Accuracy: 0.6039
Combination: (1, 7), Accuracy: 0.6948
Combination: (2, 3), Accuracy: 0.6688
Combination: (2, 4), Accuracy: 0.6299
Combination: (2, 5), Accuracy: 0.6818
Combination: (2, 6), Accuracy: 0.5714
Combination: (2, 7), Accuracy: 0.6948
Combination: (3, 4), Accuracy: 0.6104
Combination: (3, 5), Accuracy: 0.6948
Combination: (3, 6), Accuracy: 0.6364
Combination: (3, 7), Accuracy: 0.7013
Combination: (4, 5), Accuracy: 0.6623
Combination: (4, 6), Accuracy: 0.5584
Combination: (4, 7), Accuracy: 0.6364
Combination: (5, 6), Accuracy: 0.5779
Combination: