In [541]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import MinMaxScaler
import torch.nn as nn
import torch.optim as optim



In [542]:
# Load the training data (includes the TravelInsurance target column);
# the path is relative to the notebook's working directory.
df = pd.read_csv('train.csv')

In [543]:
# Inspect column names, dtypes and non-null counts before preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1590 entries, 0 to 1589
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   CustomerId           1590 non-null   object
 1   Age                  1590 non-null   int64 
 2   EmploymentType       1590 non-null   object
 3   GraduateOrNot        1590 non-null   object
 4   AnnualIncome         1590 non-null   int64 
 5   FamilyMembers        1590 non-null   int64 
 6   ChronicDiseases      1590 non-null   int64 
 7   FrequentFlyer        1590 non-null   object
 8   EverTravelledAbroad  1590 non-null   object
 9   TravelInsurance      1590 non-null   object
dtypes: int64(4), object(6)
memory usage: 124.3+ KB


# Data Preprocessing

In [544]:
# Shared preprocessing configuration; these objects are the default arguments
# of data_preprocessing.
scaler_ob = MinMaxScaler()  # fitted in training mode, reused in test mode
numeric_features = [        # columns rescaled by the scaler
    'Age',
    'AnnualIncome',
    'FamilyMembers',
]
dummies_features = [        # columns expanded with pd.get_dummies
    'EmploymentType',
]
categorical_features = [    # 'Yes'/'No' columns mapped to 1/0
    'GraduateOrNot',
    'FrequentFlyer',
    'EverTravelledAbroad',
    'TravelInsurance',
]
def data_preprocessing(df,scaler=scaler_ob,numeric_features=numeric_features,dummies_features=dummies_features,
                       categorical_features=categorical_features, test=False ):
    """Scale, one-hot encode and binarise a raw frame, returning a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing 'CustomerId' plus the configured feature columns.
        It is copied first, so the caller's frame is not modified.
    scaler : object with ``fit_transform`` / ``transform``
        Fitted on ``numeric_features`` when ``test`` is False, only applied
        when ``test`` is True.
    numeric_features, dummies_features, categorical_features : list of str
        Columns to rescale, to expand with ``pd.get_dummies`` and to map from
        'Yes'/'No' to 1/0 respectively. None of these lists is modified.
    test : bool
        When True the frame is treated as having no 'TravelInsurance' column,
        so that name is left out of the Yes/No mapping.

    Returns
    -------
    pandas.DataFrame
        The transformed frame without the 'CustomerId' column.
    """
    global scaler_ob
    # Work on a copy so the caller's raw frame keeps its original values.
    df = df.copy()

    if not test:
        df[numeric_features] = scaler.fit_transform(df[numeric_features])
        scaler_ob = scaler
        binary_features = list(categorical_features)
    else:
        df[numeric_features] = scaler.transform(df[numeric_features])
        # Build a filtered copy instead of calling .remove() on the shared
        # list: mutating it broke every later call (ValueError on a second
        # test call, label left unencoded on a later training call).
        binary_features = [col for col in categorical_features if col != 'TravelInsurance']

    df = pd.get_dummies(df, columns=dummies_features)
    df[binary_features] = df[binary_features].replace({'Yes': 1, 'No': 0})
    df = df.drop(columns='CustomerId')
    return df
        
    

In [545]:
# Training-mode call (test=False): fits the shared scaler and also maps the
# TravelInsurance label from Yes/No to 1/0.
df_preprocessed = data_preprocessing(df)

# Make Dataset

In [546]:
# Split the preprocessed frame into model inputs and the target column.
# The label is dropped by name from df_preprocessed itself. A boolean mask built
# from the raw frame (df.columns) is applied by position, and preprocessing
# changes the column order (get_dummies appends its columns, CustomerId is
# dropped), so such a mask keeps 'TravelInsurance' among the features (label
# leakage) and throws away the last one-hot column instead.
features = df_preprocessed.drop(columns=['TravelInsurance'])

labels = df_preprocessed.loc[:,'TravelInsurance']

In [547]:
class CustomDataset(Dataset):
    """Map-style dataset over a feature frame and an optional label series.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per sample; every column must be convertible to float.
    labels : pandas.Series, optional
        One label per row of ``features``. When omitted, items are
        feature tensors only.
    """
    def __init__(self, features, labels=None ):
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of rows in the feature frame."""
        return len(self.features)

    def __getitem__(self, index):
        """Return ``(features, label)`` as float32 tensors, or just ``features``
        when the dataset was built without labels."""
        # Cast explicitly: a row that mixes float, int and bool columns comes
        # back from pandas as an object array, which torch.tensor() rejects.
        row = self.features.iloc[index].to_numpy(dtype='float32')
        features = torch.tensor(row)
        if self.labels is not None:
            labels = torch.tensor(self.labels.iloc[index])
            return features, labels.float()

        return features
    
# Wrap the preprocessed training features and labels so a DataLoader can index them.
dataset = CustomDataset(features, labels)


In [548]:
# 75 % train / 12.5 % dev / remainder test. The test size is computed by
# subtraction so the three lengths always add up to len(dataset).
train_size = int(0.75 * len(dataset))
dev_size = int(0.125 * len(dataset))
test_size = len(dataset) - train_size - dev_size
# A seeded generator keeps the split identical across kernel restarts; without
# it every run trains and evaluates on different rows.
split_generator = torch.Generator().manual_seed(42)
train_dataset, dev_dataset, test_dataset = random_split(
    dataset, [train_size, dev_size, test_size], generator=split_generator
)

In [549]:
# Only the training loader shuffles; the dev and test loaders keep a fixed order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32)
dev_dataloader, test_dataloader = (
    DataLoader(held_out, shuffle=False, batch_size=32)
    for held_out in (dev_dataset, test_dataset)
)

In [550]:
# Pull a single batch from the training loader to sanity-check tensor shapes
# before building the model.
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")


Feature batch shape: torch.Size([32, 9])
Labels batch shape: torch.Size([32])


# Build Model


In [551]:
class InsuranceModel(nn.Module):
    """Fully connected binary classifier: five ReLU hidden layers and a
    single sigmoid output unit."""
    def __init__(self,input_dim):
        super().__init__()
        # Layer widths from the input down to the last hidden layer; each
        # consecutive pair becomes a Linear followed by a ReLU.
        widths = [input_dim, 128, 64, 32, 16, 8]
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules.append(nn.Linear(fan_in, fan_out))
            modules.append(nn.ReLU())
        # One-unit head squashed into (0, 1) by the sigmoid.
        modules.append(nn.Linear(widths[-1], 1))
        modules.append(nn.Sigmoid())
        self.linear_stack = nn.Sequential(*modules)

    def forward(self, x):
        """Return the sigmoid output of the stack, shape ``(batch, 1)``."""
        return self.linear_stack(x)
# The input width is the number of columns in the training feature frame.
inpu_dim = features.shape[1]
model = InsuranceModel(inpu_dim)

In [552]:
# Display the layer stack to confirm the architecture.
model

InsuranceModel(
  (linear_stack): Sequential(
    (0): Linear(in_features=9, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=16, bias=True)
    (7): ReLU()
    (8): Linear(in_features=16, out_features=8, bias=True)
    (9): ReLU()
    (10): Linear(in_features=8, out_features=1, bias=True)
    (11): Sigmoid()
  )
)

In [553]:
# Optimisation setup: Adam on binary cross-entropy, applied to the model's
# sigmoid output.
lr = 1e-3
epochs = 150  # upper bound on passes; also read by train_loop's progress print
loss_fn = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=lr)

In [554]:
class EarlyStopper:
    """Signal when accuracy has failed to improve for ``patience`` checks in a row.

    Parameters
    ----------
    patience : int
        Number of consecutive non-improving calls to ``early_stop`` after
        which it returns True.
    """
    def __init__(self, patience=3):
        self.patience = patience
        self.counter = 0          # consecutive calls without a new best
        self.best_accuracy = 0    # highest accuracy seen so far

    def early_stop(self, accuracy):
        """Record ``accuracy`` and return True once patience is exhausted."""
        if accuracy > self.best_accuracy:
            self.best_accuracy = accuracy
            self.counter = 0
            return False
        # Any non-improving result (a tie or a drop) uses up patience. The
        # previous `elif accuracy >= best` matched exact ties only, so a
        # worsening accuracy never advanced the counter.
        self.counter += 1
        return self.counter >= self.patience
early_stopper = EarlyStopper(patience=15)

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, epoch):
    """Run one optimisation pass over ``dataloader`` and report the mean batch loss.

    Parameters
    ----------
    dataloader : iterable of ``(inputs, labels)`` batches
    model : torch.nn.Module
        Called on ``inputs``; its trailing size-1 output dimension is removed
        before the loss is computed.
    loss_fn : callable ``(outputs, labels) -> loss tensor``
    optimizer : torch.optim.Optimizer
    epoch : int
        Zero-based epoch index, used only in the progress message (which also
        reads the module-level ``epochs``).

    Returns
    -------
    float
        Mean of the per-batch losses for this pass.
    """
    model.train()

    running_loss = 0
    for inputs, labels in dataloader:

        optimizer.zero_grad()
        outputs = model(inputs)
        # Squeeze only the trailing dimension: a bare squeeze() also collapses
        # the batch dimension when the final batch holds a single sample, and
        # the loss then rejects the shape mismatch with the labels.
        loss = loss_fn(outputs.squeeze(-1), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    mean_loss = running_loss/len(dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.4f}")
    return mean_loss

def test_loop(dataloader, model, predict=False):
    model.eval()
    correct = 0
    total = 0
    predictions = []
    with torch.no_grad():
        for data in dataloader:
            if predict:
                inputs = data
                outputs = model(inputs)
                predicted = outputs.squeeze()
                predictions.extend(predicted.tolist())
            else:
                inputs, labels = data
                outputs = model(inputs)
                predicted = (outputs.squeeze() > 0.5).float()
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        if predict:
            return predictions
        else:
            accuracy = 100 * correct / total
            print(f'Accuracy: {accuracy:.2f}%')
            return accuracy


# Train for at most `epochs` passes. Dev accuracy is measured after every pass,
# and training stops as soon as the early stopper returns True.
for t in range (epochs):
    train_loop(train_dataloader, model, loss_fn, optimizer, t)
    accuracy = test_loop(dev_dataloader, model)
    if early_stopper.early_stop(accuracy):
        print('stopping training')             
        break


In [None]:
# Final fit: continue training the same model (no re-initialisation) for a fixed
# 150 passes, this time iterating over `dataset` rather than the training split.
# NOTE(review): assuming `dataset` is still the full training CustomDataset, this
# trains on the dev and test splits too, so accuracy measured on them afterwards
# is no longer a held-out estimate.
epochs = 150
final_train_dataloader = DataLoader(dataset, batch_size=32, shuffle = True)
for t in range (epochs):
    train_loop(final_train_dataloader, model, loss_fn, optimizer, t)
    

In [557]:
# Load the test file and preprocess it in test mode: the scaler is applied
# (transform only), not refitted.
df_test = pd.read_csv('test.csv')
df_test_preprocessed = data_preprocessing(df_test, test=True)
# NOTE(review): assumes test.csv yields the same one-hot columns, in the same
# order, as the training frame — confirm before trusting the predictions.
# NOTE(review): this rebinds `dataset`, the name earlier cells use for the training
# data; re-running those cells after this one would pick up the test set instead.
dataset = CustomDataset(df_test_preprocessed)


In [558]:
# Despite its name this is a DataLoader; shuffle is left at its default, so
# batches follow the dataset's row order.
final_dataset = DataLoader(dataset, batch_size=32)

In [559]:
# predict=True returns the model's sigmoid outputs as a flat list, without
# thresholding them to 0/1; they are stored as-is in the 'prediction' column.
answer = test_loop(final_dataset, model, predict=True)
df_test['prediction']= answer