In [1]:
# Adithya Sunilkumar - IMT2021068
# Kevin Adesara - IMT2021070
# Anant Ojha - IMT2021102

# All the imports and the input datasets

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Mount Google Drive at /content/drive (the CSV files are read from there).
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the train/test CSV files from the mounted Google Drive.
# (Kaggle alternative kept for reference.)
# train_data = pd.read_csv("/kaggle/input/hospital/train.csv")
# test_data = pd.read_csv("/kaggle/input/hospital/test.csv")

train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/test.csv')

# Keep independent copies of the raw frames. A plain assignment would only
# alias the same object, so in-place column additions made on train_data /
# test_data afterwards would silently show up in the "backup" as well.
backup_train = train_data.copy()
backup_test = test_data.copy()

In [4]:
# Feature engineering: how often each patient_id occurs across train + test.

train_frequency = train_data['patient_id'].value_counts().to_dict()
test_frequency = test_data['patient_id'].value_counts().to_dict()

# Every patient id seen in either split (train ids first, then test-only ids),
# mapped to the sum of its occurrence counts in both splits.
frequency = {
    patient: train_frequency.get(patient, 0) + test_frequency.get(patient, 0)
    for patient in {**train_frequency, **test_frequency}
}

# Attach the combined count to each row of both frames.
for frame in (train_data, test_data):
    frame['frequency'] = frame['patient_id'].map(frequency)

# Names of the per-drug columns.
drugs = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
    'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone', 'tolazamide', 'examide',
    'citoglipton', 'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone',
    'metformin-pioglitazone', 'insulin',
]


In [None]:
# Optional cell: per-row counts of drug columns equal to 'Up', 'Down' and
# 'Steady', stored in the new columns 'up', 'down' and 'steady'.
# (Doesn't really help accuracy, so it can be skipped.)

drug_states = ('Up', 'Down', 'Steady')
for frame in (train_data, test_data):
    for state in drug_states:
        # Boolean mask over the drug columns, summed across each row.
        frame[state.lower()] = (frame[drugs] == state).sum(axis=1)

# Save the augmented training frame for inspection.
train_data.to_csv('modified.csv', index=False)

In [5]:
# Preprocessing: enter the list of categorical columns and the columns to be dropped here.

categorical_columns = ['race', 'age', 'gender', 'diabetesMed', 'change', 'diag_1', 'diag_2', 'diag_3']
columns_to_drop = ['weight', 'medical_specialty', 'payer_code', 'max_glu_serum', 'A1Cresult']
columns_to_drop.extend(drugs)

# A column that is dropped can no longer be encoded, so remove it from the categorical list.
categorical_columns = [column for column in categorical_columns if column not in columns_to_drop]

train_data = train_data.drop(columns=columns_to_drop)
print("Initial row count: " + str(train_data.shape[0]))

# Rows with more than 2 null values are dropped; remaining nulls are replaced with the column mode.
train_data = train_data.dropna(thresh=train_data.shape[1]-2)
print("Row count after dropping: " + str(train_data.shape[0]))


def _fill_with_mode(column):
    """Fill missing values with the column's most frequent value.

    A column with no non-null values has no mode; it is returned unchanged
    instead of raising IndexError on the empty mode() result.
    """
    modes = column.mode()
    if modes.empty:
        return column
    return column.fillna(modes.iloc[0])


train_data = train_data.apply(_fill_with_mode)

# Keep one fitted encoder per column. A single shared encoder is overwritten on
# every fit_transform call and would only remember the last column, which makes
# it impossible to re-apply the training-time mapping to other data.
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    train_data[column] = label_encoder.fit_transform(train_data[column])
    label_encoders[column] = label_encoder
# `label_encoder` stays bound (to the last column's encoder) for cells that still reference it.

Initial row count: 71236
Row count after dropping: 71225


In [None]:
# Model: Random Forest tuned with a randomized hyper-parameter search.
# Set n_iter to 10 for finer tuning at the cost of a longer execution time.
# (Usually increases accuracy by 0.1%)

# Features are every column except the target.
X = train_data.drop(columns='readmission_id')
y = train_data.loc[:, 'readmission_id']

# Hold out 20% of the rows for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate values sampled by the search.
param_dist = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 4, 5, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

model = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
best_model = random_search.best_estimator_
print(f"Best Parameters: {best_params}")

In [None]:
# Write predictions for the test data and print the validation accuracy.

# Same column drop and mode fill as applied to the training frame.
test_x = (
    test_data
    .drop(columns=columns_to_drop)
    .apply(lambda col: col.fillna(col.mode().iloc[0]))
)

# NOTE(review): fit_transform refits the encoder on the test column, so the
# integer codes may not match the ones the model was trained on - consider
# reusing encoders fitted on the training data.
for name in categorical_columns:
    test_x[name] = label_encoder.fit_transform(test_x[name])

# Put the columns in the order the model was fitted with.
test_x = test_x.loc[:, X.columns]

test_predictions = best_model.predict(test_x)
result_df = pd.DataFrame(
    {
        'enc_id': test_data['enc_id'],
        'predicted_readmission_id': test_predictions,
    }
)
result_df.to_csv('predicted_results.csv', index=False)

# Accuracy on the held-out validation split.
val_predictions = best_model.predict(X_test)
accuracy = accuracy_score(y_test, val_predictions)
print(f'Accuracy on the validation set: {accuracy}')

In [None]:
# Other tried models.
# Model: XGBoost
from xgboost import XGBClassifier

# Features (X) are all columns but the target; the target (y) is readmission_id.
X, y = train_data.drop(columns='readmission_id'), train_data['readmission_id']

# 80/20 train/validation split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Search space for the randomized search.
param_dist = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 4, 5, None],
    min_child_weight=[1, 2, 4, 6],
    learning_rate=[0.01, 0.1, 0.2],
)

xgb_classifier = XGBClassifier(random_state=42)
random_search = RandomizedSearchCV(
    xgb_classifier,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
best_model = random_search.best_estimator_
print(f"Best Parameters: {best_params}")

# Score the best estimator on the validation split.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision, recall and F1-score.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Other tried models.
# Model: AdaBoost
from sklearn.ensemble import AdaBoostClassifier

# Target column and the remaining feature columns.
y = train_data['readmission_id']
X = train_data.drop(columns='readmission_id')

# 80/20 train/validation split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Search space for the randomized search.
param_dist = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
)

adaboost_classifier = AdaBoostClassifier(random_state=42)
random_search = RandomizedSearchCV(
    adaboost_classifier,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
best_model = random_search.best_estimator_
print(f"Best Parameters: {best_params}")

# Score the best estimator on the validation split.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision, recall and F1-score.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Other tried models.
# Model: KNN
from sklearn.neighbors import KNeighborsClassifier

# Split the frame into features and target.
X = train_data.drop(columns='readmission_id')
y = train_data['readmission_id']

# 80/20 train/validation split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Search space for the randomized search.
param_dist = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

knn_classifier = KNeighborsClassifier()
random_search = RandomizedSearchCV(
    knn_classifier,
    param_distributions=param_dist,
    n_iter=5,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
best_model = random_search.best_estimator_
print(f"Best Parameters: {best_params}")

# Score the best estimator on the validation split.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision, recall and F1-score.
report = classification_report(y_test, y_pred)
print(report)

In [6]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [12]:
from catboost import CatBoostClassifier

# Split the frame into features and target.
X = train_data.drop(columns='readmission_id')
y = train_data['readmission_id']

# 80/20 train/validation split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Multi-class CatBoost model with fixed hyper-parameters.
catboost_classifier = CatBoostClassifier(
    iterations=100,
    depth=6,
    learning_rate=0.1,
    loss_function='MultiClass',
    random_seed=42,
)

# Fit on the training split, declaring which columns are categorical.
catboost_classifier.fit(X_train, y_train, cat_features=categorical_columns)

# Score the fitted model on the validation split.
y_pred = catboost_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision, recall and F1-score.
report = classification_report(y_test, y_pred)
print(report)

0:	learn: 1.0298302	total: 845ms	remaining: 1m 23s
1:	learn: 0.9738634	total: 1.51s	remaining: 1m 13s
2:	learn: 0.9298692	total: 2.03s	remaining: 1m 5s
3:	learn: 0.8940318	total: 2.45s	remaining: 58.7s
4:	learn: 0.8654168	total: 2.82s	remaining: 53.5s
5:	learn: 0.8410671	total: 3.05s	remaining: 47.8s
6:	learn: 0.8215152	total: 3.31s	remaining: 44s
7:	learn: 0.8032477	total: 3.56s	remaining: 41s
8:	learn: 0.7881926	total: 3.82s	remaining: 38.6s
9:	learn: 0.7749015	total: 4.04s	remaining: 36.4s
10:	learn: 0.7633676	total: 4.29s	remaining: 34.7s
11:	learn: 0.7538011	total: 4.55s	remaining: 33.4s
12:	learn: 0.7470725	total: 4.73s	remaining: 31.6s
13:	learn: 0.7399942	total: 4.98s	remaining: 30.6s
14:	learn: 0.7340450	total: 5.25s	remaining: 29.7s
15:	learn: 0.7278544	total: 5.44s	remaining: 28.5s
16:	learn: 0.7224710	total: 5.58s	remaining: 27.2s
17:	learn: 0.7183840	total: 5.71s	remaining: 26s
18:	learn: 0.7144555	total: 5.84s	remaining: 24.9s
19:	learn: 0.7114834	total: 5.97s	remaining: 

In [17]:
# Write the CatBoost predictions for the test data to a CSV file.

# Same column drop and mode fill as applied to the training frame.
test_x = (
    test_data
    .drop(columns=columns_to_drop)
    .apply(lambda col: col.fillna(col.mode().iloc[0]))
)

# NOTE(review): fit_transform refits the encoder on the test column, so the
# integer codes may not match the ones the model was trained on - consider
# reusing encoders fitted on the training data.
for name in categorical_columns:
    test_x[name] = label_encoder.fit_transform(test_x[name])

# Put the columns in the order the model was fitted with.
test_x = test_x.loc[:, X.columns]

# Encounter ids and predictions, both flattened to 1-D arrays.
enc_id = test_data['enc_id'].values.flatten()
catboost_predictions = catboost_classifier.predict(test_x).flatten()

# Guard against misaligned ids and predictions before building the frame.
if len(enc_id) != len(catboost_predictions):
    raise ValueError("Length mismatch between 'enc_id' and 'catboost_predictions'.")

# One row per encounter: id plus predicted class.
catboost_result_df = pd.DataFrame(
    {
        'enc_id': enc_id,
        'predicted_readmission_id': catboost_predictions,
    }
)
catboost_result_df.to_csv('catboost_predictions.csv', index=False)

In [8]:
!pip install lightgbm



In [9]:
from lightgbm import LGBMClassifier

# Define the features (X) and the target (y).
X = train_data.drop(columns=['readmission_id'])
y = train_data['readmission_id']

# Split the data into training and validation sets (80/20).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a LightGBM classifier. The scikit-learn API names `n_estimators` and
# `random_state` are used instead of the native aliases `num_iterations` and
# `random_seed`: they configure the same settings without triggering
# LightGBM's "Found `num_iterations` in params" warning.
lgb_classifier = LGBMClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)

# Train on the training split, declaring which columns are categorical.
lgb_classifier.fit(X_train, y_train, categorical_feature=categorical_columns)

# Predict on the validation split.
y_pred = lgb_classifier.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Classification report: precision, recall, F1-score per class.
report = classification_report(y_test, y_pred)
print(report)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041350 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2123
[LightGBM] [Info] Number of data points in the train set: 56980, number of used features: 22
[LightGBM] [Info] Start training from score -2.191101
[LightGBM] [Info] Start training from score -1.054194
[LightGBM] [Info] Start training from score -0.616680
Accuracy: 0.72
              precision    recall  f1-score   support

           0       0.40      0.08      0.13      1578
           1       0.64      0.66      0.65      5025
           2       0.78      0.90      0.84      7642

    accuracy                           0.72     14245
   macro avg       0.61      0.55      0.54     14245
weighted avg       0.69      0.72      0.69     14245



In [11]:
# Write the LightGBM predictions for the test data to a CSV file.

# Encounter ids for the submission frame.
enc_id = test_data['enc_id']

# Same column drop and mode fill as applied to the training frame.
test_x = (
    test_data
    .drop(columns=columns_to_drop)
    .apply(lambda col: col.fillna(col.mode().iloc[0]))
)

# NOTE(review): fit_transform refits the encoder on the test column, so the
# integer codes may not match the ones the model was trained on - consider
# reusing encoders fitted on the training data.
for name in categorical_columns:
    test_x[name] = label_encoder.fit_transform(test_x[name])

# Put the columns in the order the model was fitted with.
test_x = test_x.loc[:, X.columns]

lgb_predictions = lgb_classifier.predict(test_x)

# One row per encounter: id plus predicted class.
lgb_result_df = pd.DataFrame(
    {
        'enc_id': enc_id,
        'predicted_readmission_id': lgb_predictions,
    }
)
lgb_result_df.to_csv('lgb_predictions.csv', index=False)

################################################################################################################
# test_predictions = best_model.predict(test_x)
# result_df = pd.DataFrame({'enc_id': test_data['enc_id'], 'predicted_readmission_id': test_predictions})

# result_df.to_csv('predicted_results.csv', index=False)

# val_predictions = best_model.predict(X_test)
# accuracy = accuracy_score(y_test, val_predictions)
# print(f'Accuracy on the validation set: {accuracy}')

