## Part 1: Building up a basic predictive model

### Model Building 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the pre-cleaned dataset from the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably a
# row index written out by an earlier to_csv call -- confirm before using it.
data = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

# Preview the first five rows as a sanity check on the load.
data.head(n=5)

Unnamed: 0,patient_nbr,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,...,number_diagnoses,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,20377854,Female,[60-70),0.2,0.0,0.0,0.153846,MC,Nephrology,0.644444,...,0.333333,No,No,No,No,No,Steady,No,Yes,0
1,20408121,Female,[90-100),0.0,0.0,0.285714,0.230769,MC,Emergency/Trauma,0.611111,...,0.333333,No,No,No,No,No,No,No,Yes,0
2,20542797,Male,[70-80),0.0,0.071429,0.285714,0.692308,MC,InternalMedicine,0.744444,...,0.333333,Steady,No,No,No,No,Steady,Ch,Yes,0
3,7239654,Female,[70-80),0.0,0.142857,0.238095,0.846154,UN,InternalMedicine,0.844444,...,0.266667,No,No,No,No,No,Steady,No,Yes,0
4,15466212,Male,[70-80),0.0,0.142857,0.238095,0.846154,MC,InternalMedicine,0.655556,...,0.266667,No,No,No,No,No,No,No,No,0


### Logistic Regression Model

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # Import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [3]:
# Columns that label a row or a patient instead of describing the encounter.
# 'Unnamed: 0' looks like a saved row index and 'patient_nbr' like a patient
# identifier (see the data preview); left in, both would be picked up as
# integer features, scaled, and fed to the model as if they carried signal.
NON_FEATURE_COLUMNS = ['Unnamed: 0', 'patient_nbr']

# Target variable.
y = data['readmitted']

# Features: everything except the target and the identifier-like columns.
# The target drop still raises if 'readmitted' is missing; the identifier
# drop is tolerant so this cell also works on data that lacks those columns.
X = (
    data
    .drop(columns=['readmitted'])
    .drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
)

In [4]:
# Per-dtype transformers: standardise numeric columns, one-hot encode string
# columns. handle_unknown='ignore' makes categories unseen at fit time encode
# as all zeros instead of raising at predict time.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Partition the feature columns by dtype so each group gets its transformer.
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Route each column group through its transformer. Columns matching neither
# dtype filter are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
)

In [5]:
# Hold out 20% of the rows for final evaluation; the fixed seed makes the
# split repeatable. stratify=y keeps the class proportions of the target the
# same in the train and test partitions, so a skewed target cannot end up
# over- or under-represented in the held-out set by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [6]:
# End-to-end model: preprocessing followed by logistic regression, so the
# scaler/encoder are re-fitted on each training fold during cross-validation.
# 'saga' is used because it supports both l1 and l2 penalties. It shuffles the
# data while optimising, so random_state is pinned to make fitted coefficients
# and scores reproducible from run to run.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, solver='saga', random_state=42)),
])

In [7]:
# Hyperparameter search space for the 'classifier' pipeline step.
# C is the inverse of the regularization strength (smaller = stronger penalty).
c_values = [0.001, 0.01, 0.1, 1, 10, 100]
# Type of regularization applied to the coefficients.
penalties = ['l1', 'l2']

# The 'classifier__' prefix routes each parameter to the pipeline step of
# that name.
param_grid = {
    'classifier__C': c_values,
    'classifier__penalty': penalties,
}

In [8]:
# Exhaustive search over param_grid with 5-fold cross-validation, selecting
# the combination with the best mean accuracy. Every (combination, fold) fit
# is independent, so n_jobs=-1 runs them on all available cores; this only
# shortens wall-clock time and does not change which model is selected.
# NOTE(review): accuracy can look good on a skewed target by always predicting
# the majority class -- confirm the class balance of 'readmitted' and consider
# a metric such as 'f1' or 'roc_auc' if it is imbalanced.
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit on the training partition only; the fitted search object is displayed.
grid_search.fit(X_train, y_train)

