In [None]:
# Adithya Sunilkumar - IMT2021068
# Kevin Adesara - IMT2021070
# Anant Ojha - IMT2021102

# All the imports and the input datasets

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Load the raw train/test splits from the Kaggle input directory.
train_data = pd.read_csv("/kaggle/input/hospital/train.csv")
test_data = pd.read_csv("/kaggle/input/hospital/test.csv")

# Keep independent snapshots of the raw frames. A plain assignment would only
# alias the same DataFrame object, so the columns that later cells add in place
# (e.g. train_data['frequency'] = ...) would silently appear in the "backup" too.
backup_train = train_data.copy()
backup_test = test_data.copy()

In [None]:
# Feature Engineering

# 'frequency' = number of rows sharing a given patient_id, counted over the
# train and test files together.
train_frequency = train_data['patient_id'].value_counts().to_dict()
test_frequency = test_data['patient_id'].value_counts().to_dict()

# Start from the train counts and add the test counts on top of them.
frequency = dict(train_frequency)
for patient_id, count in test_frequency.items():
    frequency[patient_id] = frequency.get(patient_id, 0) + count

train_data['frequency'] = train_data['patient_id'].map(frequency)
test_data['frequency'] = test_data['patient_id'].map(frequency)

# Columns that later cells handle as one group (counted per row in the optional
# cell, then dropped during preprocessing).
drugs = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
    'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone', 'tolazamide', 'examide',
    'citoglipton', 'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone',
    'metformin-pioglitazone', 'insulin',
]



In [None]:
# Optional cell: run it only if you want per-row counts of the drug columns.
# (Doesn't really help accuracy.)

# For each frame, add 'up' / 'down' / 'steady' columns holding how many of the
# drug columns equal 'Up' / 'Down' / 'Steady' in that row.
for frame in (train_data, test_data):
    for category in ['Up', 'Down', 'Steady']:
        frame[category.lower()] = frame[drugs].eq(category).sum(axis=1)

# Save the augmented training frame.
train_data.to_csv('modified.csv', index=False)

In [None]:
# Preprocessing: edit the two lists below to choose which columns are treated as
# categorical and which are discarded.

columns_to_drop = ['weight', 'medical_specialty', 'payer_code', 'max_glu_serum', 'A1Cresult'] + drugs

# A column that is being dropped cannot also be encoded, so filter it out here.
categorical_columns = [
    name
    for name in ['race', 'age', 'gender', 'diabetesMed', 'change', 'diag_1', 'diag_2', 'diag_3']
    if name not in columns_to_drop
]

train_data = train_data.drop(columns=columns_to_drop)
print(f"Initial row count: {train_data.shape[0]}")

# Keep only rows with at most 2 missing values, then fill the remaining gaps
# with each column's most frequent value (mode).
train_data = train_data.dropna(thresh=train_data.shape[1] - 2)
print(f"Row count after dropping: {len(train_data)}")
train_data = train_data.apply(lambda series: series.fillna(series.mode().iloc[0]))

# Replace every categorical column by integer codes. The single encoder object
# is re-fitted for each column, so afterwards it only remembers the last one.
label_encoder = LabelEncoder()
for column in categorical_columns:
    train_data[column] = label_encoder.fit_transform(train_data[column])

In [None]:
# Model: Random Forest tuned with a randomized hyper-parameter search.
# Raise n_iter to 10 for finer tuning at the cost of a longer run
# (usually increases accuracy by about 0.1%).

# Target column vs. everything else.
y = train_data['readmission_id']
X = train_data.drop(columns=['readmission_id'])

# Hold out 20% of the rows for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate values sampled by the search.
param_dist = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 4, 5, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

model = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
print(f"Best Parameters: {best_params}")

# Best estimator found by the search; used by the prediction cell.
best_model = random_search.best_estimator_

In [None]:
# Output predicted values of test data and print accuracy

test_x = test_data.drop(columns=columns_to_drop, axis=1)
test_x = test_x.apply(lambda x: x.fillna(x.mode().iloc[0]))

# Encode the test categoricals with the SAME label -> code mapping the model was
# trained on. Re-fitting the encoder on the test set assigns codes from the test
# set's own sorted unique values, so a category can receive a different integer
# than it had during training whenever the two files do not contain exactly the
# same set of values.
# train_data is already encoded at this point, so the training mapping is rebuilt
# from the raw values kept in backup_train: same surviving rows (dropna keeps the
# original index labels) and same per-column mode fill as the preprocessing cell.
UNSEEN_CATEGORY_CODE = -1  # code given to test values absent from the training rows
for column in categorical_columns:
    raw_train_values = backup_train.loc[train_data.index, column]
    raw_train_values = raw_train_values.fillna(raw_train_values.mode().iloc[0])
    # classes_ is the sorted list of labels; a label's position is its code.
    train_classes = LabelEncoder().fit(raw_train_values).classes_
    code_by_label = {label: code for code, label in enumerate(train_classes)}
    test_x[column] = (
        test_x[column]
        .map(code_by_label)
        .fillna(UNSEEN_CATEGORY_CODE)
        .astype(int)
    )

# Present the features in exactly the column order the model was fitted with.
test_x = test_x[X.columns]
test_predictions = best_model.predict(test_x)
result_df = pd.DataFrame({'enc_id': test_data['enc_id'], 'predicted_readmission_id': test_predictions})

result_df.to_csv('predicted_results.csv', index=False)

# Accuracy on the 20% validation split held out from the training data.
val_predictions = best_model.predict(X_test)
accuracy = accuracy_score(y_test, val_predictions)
print(f'Accuracy on the validation set: {accuracy}')

In [None]:
# Other tried models.
# Model: XGBoost
from xgboost import XGBClassifier

# Target (y) and feature matrix (X)
y = train_data['readmission_id']
X = train_data.drop(columns=['readmission_id'])

# 80/20 train/validation split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyper-parameter values sampled by RandomizedSearchCV
param_dist = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 4, 5, None],
    min_child_weight=[1, 2, 4, 6],
    learning_rate=[0.01, 0.1, 0.2],
)

xgb_classifier = XGBClassifier(random_state=42)
random_search = RandomizedSearchCV(
    estimator=xgb_classifier,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
print(f"Best Parameters: {best_params}")

# NOTE: this rebinds best_model, the name the prediction cell reads.
best_model = random_search.best_estimator_

# Score the tuned model on the held-out split
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision, recall and F1-score
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Other tried models.
# Model: AdaBoost
from sklearn.ensemble import AdaBoostClassifier

# Target (y) and feature matrix (X)
y = train_data['readmission_id']
X = train_data.drop(columns=['readmission_id'])

# 80/20 train/validation split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyper-parameter values sampled by RandomizedSearchCV
param_dist = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
)

adaboost_classifier = AdaBoostClassifier(random_state=42)
random_search = RandomizedSearchCV(
    estimator=adaboost_classifier,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
print(f"Best Parameters: {best_params}")

# NOTE: this rebinds best_model, the name the prediction cell reads.
best_model = random_search.best_estimator_

# Score the tuned model on the held-out split
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision, recall and F1-score
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Other tried models.
# Model: KNN
from sklearn.neighbors import KNeighborsClassifier

# Target (y) and feature matrix (X)
y = train_data['readmission_id']
X = train_data.drop(columns=['readmission_id'])

# 80/20 train/validation split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyper-parameter values sampled by RandomizedSearchCV
param_dist = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

knn_classifier = KNeighborsClassifier()
random_search = RandomizedSearchCV(
    estimator=knn_classifier,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
print(f"Best Parameters: {best_params}")

# NOTE: this rebinds best_model, the name the prediction cell reads.
best_model = random_search.best_estimator_

# Score the tuned model on the held-out split
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision, recall and F1-score
report = classification_report(y_test, y_pred)
print(report)