## Part 1: Building up a basic predictive model

### Model Building 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,patient_nbr,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,...,number_diagnoses,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,20377854,Female,[60-70),0.2,0.0,0.0,0.153846,MC,Nephrology,0.644444,...,0.333333,No,No,No,No,No,Steady,No,Yes,0
1,20408121,Female,[90-100),0.0,0.0,0.285714,0.230769,MC,Emergency/Trauma,0.611111,...,0.333333,No,No,No,No,No,No,No,Yes,0
2,20542797,Male,[70-80),0.0,0.071429,0.285714,0.692308,MC,InternalMedicine,0.744444,...,0.333333,Steady,No,No,No,No,Steady,Ch,Yes,0
3,7239654,Female,[70-80),0.0,0.142857,0.238095,0.846154,UN,InternalMedicine,0.844444,...,0.266667,No,No,No,No,No,Steady,No,Yes,0
4,15466212,Male,[70-80),0.0,0.142857,0.238095,0.846154,MC,InternalMedicine,0.655556,...,0.266667,No,No,No,No,No,No,No,No,0


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Encode categorical variables
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])
df['age'] = le.fit_transform(df['age'])
df['payer_code'] = le.fit_transform(df['payer_code'])
df['medical_specialty'] = le.fit_transform(df['medical_specialty'])
df['change'] = le.fit_transform(df['change'])
df['diabetesMed'] = le.fit_transform(df['diabetesMed'])
df['readmitted'] = le.fit_transform(df['readmitted'])

In [4]:
# Select predictors and target variable
X = df[['gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 
        'time_in_hospital', 'payer_code', 'medical_specialty', 'num_lab_procedures', 
        'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 
        'number_inpatient', 'change', 'diabetesMed']]
y = df['readmitted']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Check the shapes of the training and test sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Training set shape: (14977, 16) (14977,)
Test set shape: (9985, 16) (9985,)


In [7]:
from sklearn.preprocessing import PolynomialFeatures

# Create interaction terms for pairs of numerical features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Fit the Random Forest classifier on the training data with interaction terms
rf_classifier.fit(X_train_poly, y_train)

# Predict on the test data with interaction terms
y_pred_rf_poly = rf_classifier.predict(X_test_poly)

# Evaluate performance with interaction terms
print("Random Forest Classifier with Interaction Terms:")
print(classification_report(y_test, y_pred_rf_poly))

Random Forest Classifier with Interaction Terms:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      9055
           1       0.14      0.00      0.00       930

    accuracy                           0.91      9985
   macro avg       0.52      0.50      0.48      9985
weighted avg       0.84      0.91      0.86      9985

