In [1]:
# Adithya Sunilkumar - IMT2021068
# Kevin Adesara - IMT2021070
# Anant Ojha - IMT2021102

from google.colab import drive

# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook live under that mount point.
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from tensorflow.keras.utils import to_categorical
import numpy as np
import tensorflow as tf

# Report whether TensorFlow sees the first GPU device
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    print('GPU device not found')


# Read the training and test CSVs from the mounted Drive folder
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/data/train.csv',
        '/content/drive/MyDrive/data/test.csv',
    )
)

Found GPU at: /device:GPU:0


In [3]:
# Feature engineering, same as assignment 1.
# 'frequency' is the number of rows that share a patient_id, counted over the
# training and test sets together. Concatenating the two id columns and
# counting once gives the same totals as summing the per-set counts.
frequency = (
    pd.concat([train_data['patient_id'], test_data['patient_id']])
    .value_counts()
    .to_dict()
)

# Attach the combined count to every row of both sets.
train_data['frequency'] = train_data['patient_id'].map(frequency)
test_data['frequency'] = test_data['patient_id'].map(frequency)

# 'patient_id' and 'enc_id' are treated as identifiers and 'readmission_id' is
# the target, so none of them is used as a model input.
features = train_data.drop(['patient_id', 'enc_id', 'readmission_id'], axis=1)
labels = to_categorical(train_data['readmission_id'])  # One-hot encoding the labels

# Split the input columns by dtype: integer/float columns go through the
# numeric branch, object columns are one-hot encoded.
numeric_features = features.select_dtypes(include=['int64', 'float64']).columns
categorical_features = features.select_dtypes(include=['object']).columns

# Numeric branch: fill missing values with the column mean, then standardise.
# Without scaling, columns on very different ranges are fed straight into the
# Dense layers. NOTE: scaling changes the network inputs, so accuracy figures
# recorded before this change need to be re-measured.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Column transformer combining both branches; categories not seen during
# fitting are ignored at transform time (all-zero one-hot row).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

# Hold out 20% of the training rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=42)

# Fit the preprocessing on the training split only, then reuse the fitted
# transformer for the validation split and the test set.
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)
test_data_processed = preprocessor.transform(test_data.drop(['patient_id', 'enc_id'], axis=1))


In [None]:
# 0.2 dropout rate, ReLU, 0.001 learning rate, 0.722 accuracy
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='softmax'))  # one output unit per class (3 classes)

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# 0.5 dropout rate, ReLU, 0.001 learning rate, 0.719
n_inputs = X_train.shape[1]
layer_stack = [
    tf.keras.layers.Dense(128, activation='relu', input_shape=(n_inputs,)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax'),  # 3-class output layer
]
model = tf.keras.Sequential(layer_stack)

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# 0.2 dropout rate, sigmoid hidden activations, 0.001 learning rate, 0.720
hidden_activation = 'sigmoid'

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(128, activation=hidden_activation, input_shape=(X_train.shape[1],)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(64, activation=hidden_activation))
model.add(tf.keras.layers.Dense(3, activation='softmax'))  # softmax over the 3 classes

model.compile(metrics=['accuracy'], loss='categorical_crossentropy', optimizer='adam')


In [None]:
# 0.2 dropout rate, ReLU, 0.01 learning rate, 0.712
input_dim = X_train.shape[1]
hidden_1 = tf.keras.layers.Dense(128, activation='relu', input_shape=(input_dim,))
regulariser = tf.keras.layers.Dropout(0.2)
hidden_2 = tf.keras.layers.Dense(64, activation='relu')
classifier = tf.keras.layers.Dense(3, activation='softmax')  # output layer for 3 classes

model = tf.keras.Sequential([hidden_1, regulariser, hidden_2, classifier])

# Adam with an explicit learning rate of 0.01 instead of the 'adam' string shortcut
custom_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

model.compile(
    optimizer=custom_optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train the current model for 10 epochs; batch size is left at the Keras
# default (32) and the held-out split is evaluated after every epoch.
validation_pair = (X_val, y_val)
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=validation_pair,
    epochs=10,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [4]:
# Fine-tuned configuration: 6 layers, batch size 64, 20 epochs, 0.723 accuracy

# Training hyperparameters
batch_size = 64
epochs = 20

# Three ReLU hidden layers (256 -> 128 -> 64), with dropout after the first
# two, followed by a 3-way softmax output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='softmax'))

custom_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    loss='categorical_crossentropy',
    optimizer=custom_optimizer,
    metrics=['accuracy'],
)

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [5]:
# Predict class probabilities for the test set and keep the most likely class per row
predictions = model.predict(test_data_processed)
predicted_labels = predictions.argmax(axis=1)

# Pair each encounter id with its predicted class and write the result to Drive
output = test_data[['enc_id']].assign(readmission_id=predicted_labels)
output.to_csv('/content/drive/MyDrive/data/output.csv', index=False)


