<a href="https://www.kaggle.com/code/averma111/pytorch-icr?scriptVersionId=129360212" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import torch.nn.functional as F
import warnings
# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')
# Seed both the PyTorch and the NumPy random generators with the same value.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
# Plot appearance: the seaborn "darkgrid" style is applied after the
# matplotlib 'ggplot' style.
plt.style.use('ggplot')
sns.set_style("darkgrid")
# Turn off the pandas chained-assignment check.
pd.set_option('mode.chained_assignment',None)

## Defining the paths for the various datasets

In [None]:
def get_datasets(path):
    """Load the CSV file at ``path`` and return it as a pandas DataFrame."""
    return pd.read_csv(path)

# Locations of the competition CSV files under the Kaggle input directory.
train_path='/kaggle/input/icr-identify-age-related-conditions/train.csv'
test_path ='/kaggle/input/icr-identify-age-related-conditions/test.csv'
greeks_path = '/kaggle/input/icr-identify-age-related-conditions/greeks.csv'
submission_path ='/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv'

## Top 5 rows from train dataset

In [None]:
get_datasets(train_path).head()

## Summary function

In [None]:
def summary(text, df):
    """Print the shape of ``df`` and return a per-column summary table.

    Parameters
    ----------
    text : str
        Label printed in front of the shape (e.g. ``'train'``).
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``dtypes``, ``null``,
        ``unique``, ``min``, ``median``, ``max``, ``mean``, ``std`` and
        ``duplicate``.  ``median``, ``mean`` and ``std`` are NaN for
        non-numeric columns; ``duplicate`` repeats the number of fully
        duplicated rows on every line.
    """
    print(f'{text} shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['dtypes'])
    summ['null'] = df.isnull().sum()
    summ['unique'] = df.nunique()
    summ['min'] = df.min()
    # numeric_only=True: without it these reductions raise TypeError on
    # string columns with pandas >= 2.0.  Columns missing from the reduced
    # result are aligned to NaN by the assignment.
    summ['median'] = df.median(numeric_only=True)
    summ['max'] = df.max()
    summ['mean'] = df.mean(numeric_only=True)
    summ['std'] = df.std(numeric_only=True)
    summ['duplicate'] = df.duplicated().sum()
    return summ

## Summary of train dataset 

* There are no duplicates.
* The columns BQ, CB, CC, DU, EL, FC, FL, FS and GL contain null values.
* All the features are floats, except the categorical EJ column.

In [None]:
summary('train',get_datasets(train_path))

## Replace null with means of the same columns

In [None]:
def replace_null_with_mean(df, fields=None):
    """Fill the missing values of selected columns with that column's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.
    fields : list of str, optional
        Columns to impute.  Defaults to the nine columns
        ``BQ, CB, CC, DU, EL, FC, FL, FS, GL``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Raises
    ------
    KeyError
        If one of ``fields`` is not a column of ``df``.
    """
    if fields is None:
        fields = ['BQ','CB','CC','DU','EL','FC','FL','FS','GL']
    for column in fields:
        # Impute column by column: a frame-wide fillna with a scalar would
        # overwrite every NaN in the frame with the mean of a single column.
        df[column] = df[column].fillna(df[column].mean())
    return df

# Read the training CSV and pass it through replace_null_with_mean.
train_non_nan=replace_null_with_mean(get_datasets(train_path))


## Count plot for the label class 

In [None]:
sns.countplot(train_non_nan,x='Class',color='r')


## Encoding the categorical values of EJ column

In [None]:
def to_numerical_ecoding(df,column):
    """Encode a two-level categorical column as integers, row by row.

    ``'A'`` becomes 0 and every other value becomes 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``column`` replaced by its codes.
    """
    encoding = {
        'A':0,
        'B':1
    }
    # Element-wise selection: assigning a scalar to df[column] inside a loop
    # would overwrite the whole column on every iteration.
    df[column] = np.where(df[column] == 'A', encoding['A'], encoding['B'])
    return df

# Encode the 'EJ' column, then use the 'Id' column as the row index.
train = to_numerical_ecoding(train_non_nan,'EJ')
train = train.set_index('Id')

## Function to drop the columns

In [None]:
def drop_column(df,column):
    """Remove ``column`` (a label or list of labels) from ``df`` in place.

    The frame passed in is mutated and the very same object is returned.
    """
    df.drop(column, axis=1, inplace=True)
    return df

## Creating the features and label classes

In [None]:
# Take the label column first: drop_column removes 'Class' in place and
# returns the same frame, so afterwards `train` no longer contains 'Class'.
y = train['Class']
X = drop_column(train,['Class'])
# Cast both the labels and the features to float.
y = y.astype(float)
X = X.astype(float)

## Distribution plot for data in training set 
* Negative and positive skewness

In [None]:
def plot_kde_train(features):
    """Draw one KDE plot per column of ``features`` on a four-column grid.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame whose columns are plotted; each subplot is titled and
        labelled with its column name.
    """
    columns = list(features.columns)
    # Always request at least one row so an empty frame does not make
    # plt.subplots fail.
    n_rows = max(1, int(np.ceil(len(columns)/4)))
    fig,ax = plt.subplots(n_rows,4, figsize = (40,30))
    # Flatten once so the axes can be addressed with a single index.
    ax = np.ravel(ax)
    for i,col in enumerate(columns):
        sns.kdeplot(x= features[col] , label = col, ax = ax[i],color='b')
        ax[i].legend()
        ax[i].set_title(f"{col}")
    # Hide the grid cells that have no column to show.
    for unused in ax[len(columns):]:
        unused.set_visible(False)

    plt.suptitle("Distribution of Train Dataset",fontsize = 40)
    plt.tight_layout(pad =3)
    plt.show()
    
plot_kde_train(X)

## Plotting the correlation with respect to Label Class

In [None]:
def  plot_correlation_dataset(df):
    """Show a lower-triangle heatmap of the correlations in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to correlate.  Non-numeric columns are left out of the
        correlation matrix.
    """
    plt.figure(figsize = (25,12))
    # numeric_only=True: with pandas >= 2.0 a plain corr() raises on string
    # columns instead of skipping them.
    corr = df.corr(numeric_only=True)
    # Mask the upper triangle (diagonal included) so each pair shows once.
    upper_triangle = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(corr,vmin = -1, vmax = 1, cmap = "rocket", annot = True, mask = upper_triangle)
    plt.title("Correlation of all features and target", fontsize= 18)
    plt.show()
    
plot_correlation_dataset(get_datasets(train_path))

## Show outlier in the features

In [None]:
def show_outlier(features):
    """Draw one box plot per column of ``features`` on a four-column grid.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame whose columns are plotted.  The argument itself is used, not
        a module-level variable.
    """
    columns = list(features.columns)
    # Always request at least one row so an empty frame does not make
    # plt.subplots fail.
    n_rows = max(1, int(np.ceil(len(columns)/4)))
    fig,ax = plt.subplots(n_rows,4,figsize = (30,15))
    ax = np.ravel(ax)
    for i,col in enumerate(columns):
        sns.boxplot(ax = ax[i], x = features[col], color= "red")
    fig.suptitle("Box plots of all data ",fontsize = 20)
    plt.tight_layout(pad=3)
    plt.show()

show_outlier(X)

## Data preprocessing of the features

In [None]:
def preprocessing(features,label=None):
    """Standardise the float columns of ``features`` and optionally split.

    Parameters
    ----------
    features : pandas.DataFrame
        Input frame; only the columns selected by
        ``select_dtypes(include=['float'])`` are used.
    label : pandas.Series, optional
        Target values.  When given, the data is split 80/20 with
        ``random_state=42``.

    Returns
    -------
    numpy.ndarray or list
        Without ``label``: the scaled feature matrix, using a scaler fitted
        on ``features`` itself.  With ``label``: the list
        ``[X_train, X_val, y_train, y_val]``.
    """
    scaler = StandardScaler()
    num_cols = list(features.select_dtypes(include=['float']))
    values = features[num_cols].values
    if label is None:
        return scaler.fit_transform(values)
    X_tr, X_va, y_tr, y_va = train_test_split(
        values, label.to_numpy(), test_size=0.2, random_state=42
    )
    # Fit on the training rows only, so that the mean and variance of the
    # validation rows do not leak into the scaling.
    X_tr = scaler.fit_transform(X_tr)
    X_va = scaler.transform(X_va)
    return [X_tr, X_va, y_tr, y_va]

# Scale the features and split them, together with the labels, into
# training and validation parts; then print the four resulting shapes.
X_train,X_val,y_train,y_val = preprocessing(X,y)
print(X_train.shape,X_val.shape,y_train.shape,y_val.shape)



## Convert dataframes into torch tensors

In [None]:
def convert_to_torch(value, requires_grad=False):
    """Copy ``value`` into a new ``torch.float32`` tensor.

    Parameters
    ----------
    value : array-like
        Data accepted by ``torch.tensor`` (nested lists, NumPy arrays, ...).
    requires_grad : bool, optional
        Whether autograd should track the returned tensor.  Defaults to
        ``False``: data tensors are inputs, not trainable parameters, so
        tracking them only adds gradient bookkeeping.

    Returns
    -------
    torch.Tensor
        A float32 tensor holding a copy of ``value``.
    """
    return torch.tensor(data=value,dtype=torch.float32,requires_grad=requires_grad)

## Class dataset 

In [None]:
class Data(Dataset):
    """Dataset that pairs each feature row with the label at the same index."""

    def __init__(self, X_data, y_data):
        # Both containers are stored as given and indexed in parallel.
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        """Return the number of feature rows."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair found at ``index``."""
        sample = self.X_data[index]
        target = self.y_data[index]
        return sample, target

# Turn the split arrays into tensors; the labels are additionally cast to
# LongTensor. X_val and y_val are rebound to their tensor versions.
X_data = convert_to_torch(X_train)
y_data = convert_to_torch(y_train).type(torch.LongTensor)
X_val = convert_to_torch(X_val)
y_val = convert_to_torch(y_val).type(torch.LongTensor)
# Wrap each (features, labels) pair in a Data dataset.
train_data = Data(X_data,y_data)
val_data = Data(X_val,y_val)

## Baseline model for ICR classification 

In [None]:
class ICRClassificationBase(torch.nn.Module):
    """Training and validation helpers shared by the classification models.

    Subclasses provide ``forward``; every step calls the module on the
    features of a ``(features, labels)`` batch and scores the output with
    cross-entropy.  ``validation_step`` also calls an ``accuracy(outputs,
    labels)`` function that is looked up by name when the step runs.
    """

    def _to_model_device(self, batch):
        """Return the items of ``batch`` moved to the device of the parameters.

        Without this, a model moved to the GPU would receive CPU tensors
        from the data loader and fail with a device mismatch.  A module
        without parameters returns the batch unchanged.
        """
        first_param = next(self.parameters(), None)
        if first_param is None:
            return batch
        return tuple(item.to(first_param.device) for item in batch)

    def training_step(self,batch):
        """Return the cross-entropy loss of one ``(features, labels)`` batch."""
        features,labels = self._to_model_device(batch)
        out = self(features)
        loss = F.cross_entropy(out,labels)
        return loss

    def validation_step(self, batch):
        """Return ``{'val_loss', 'val_acc'}`` tensors for one batch."""
        features, labels = self._to_model_device(batch)
        out = self(features)
        loss = F.cross_entropy(out, labels)
        acc = accuracy(out, labels)
        # The loss is detached: it is only reported, never back-propagated.
        return {'val_loss': loss.detach(), 'val_acc': acc}

    def validation_epoch_end(self, outputs):
        """Average the per-batch results into plain Python floats.

        ``outputs`` is the list of dictionaries from ``validation_step``.
        The result is the unweighted mean over batches.
        """
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}

    def epoch_end(self, epoch, result):
        """Print the metrics in ``result`` on every tenth epoch (0, 10, ...)."""
        if epoch%10 ==0:
            print("Epoch [{}], train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
                epoch, result['train_loss'], result['val_loss'], result['val_acc']))

## Pytorch Classification Model

In [None]:
class ICRNNet(ICRClassificationBase):
    """Fully connected classifier: input -> 24 -> 12 -> ``num_classes``.

    Each of the two hidden layers is followed by ReLU and dropout (p=0.5).
    """

    def __init__(self,input_features,num_classes):
        super().__init__()
        # Layers are created in the order in which they are applied.
        layers = [
            torch.nn.Linear(input_features, 24),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.5),
            torch.nn.Linear(24, 12),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.5),
            torch.nn.Linear(12, num_classes),
        ]
        self.network = torch.nn.Sequential(*layers)

    def forward(self,inputs):
        """Return the output of the final linear layer for ``inputs``."""
        logits = self.network(inputs)
        return logits
    

## CPU or GPU 

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# The input width is the number of columns of the training matrix.
input_features = X_train.shape[1]
num_classes = 2
model = ICRNNet(input_features,num_classes)
model.to(device)

## The Dataloaders 

In [None]:
def get_dataloaders(dataset_type,batch,shuffle):
    """Build a DataLoader over ``dataset_type`` with batches of size ``batch``.

    Samples are reshuffled by the loader when ``shuffle`` is truthy and
    served in dataset order otherwise.
    """
    reshuffle = bool(shuffle)
    return DataLoader(dataset=dataset_type, batch_size=batch, shuffle=reshuffle)

## Defining the accuracy, evaluate and fit functions

In [None]:

def accuracy(outputs, labels):
    """Return, as a scalar tensor, the share of rows predicted correctly.

    The predicted class of a row is the index of its largest value in
    ``outputs`` along dimension 1.
    """
    preds = torch.max(outputs, dim=1)[1]
    n_correct = torch.sum(preds == labels).item()
    return torch.tensor(n_correct / len(preds))
  
@torch.no_grad()
def evaluate(model, test_loader):
    """Run ``model`` over ``test_loader`` in eval mode, with autograd off.

    Every batch goes through ``model.validation_step``; the collected
    results are reduced by ``model.validation_epoch_end``, whose return
    value is passed back to the caller.
    """
    model.eval()
    step_results = []
    for batch in test_loader:
        step_results.append(model.validation_step(batch))
    return model.validation_epoch_end(step_results)

  
def fit(epochs, lr, model, train_loader, test_loader, opt_func):
    """Train ``model`` and evaluate it after every epoch.

    Parameters
    ----------
    epochs : int
        Number of passes over ``train_loader``.
    lr : float
        Learning rate, passed as the second positional argument of
        ``opt_func``.
    model : torch.nn.Module
        Model exposing ``training_step`` and ``epoch_end`` plus the
        validation methods used by ``evaluate``.
    train_loader, test_loader : iterable
        Batches for training and for the per-epoch evaluation.
    opt_func : callable
        Optimizer factory called as ``opt_func(model.parameters(), lr)``.

    Returns
    -------
    list of dict
        One entry per epoch: the result of ``evaluate`` extended with the
        mean ``train_loss`` of that epoch.
    """
    history = []
    optimizer = opt_func(model.parameters(),lr)

    for epoch in tqdm(range(epochs)):

        model.train()
        train_losses = []
        for batch in train_loader:
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # Keep only the value: storing the graph-attached loss would
            # hold autograd references for every batch until the epoch ends.
            train_losses.append(loss.detach())
        result = evaluate(model, test_loader)
        # torch.stack cannot handle an empty list (empty loader), so the
        # mean is reported as NaN in that case.
        if train_losses:
            result['train_loss'] = torch.stack(train_losses).mean().item()
        else:
            result['train_loss'] = float('nan')
        model.epoch_end(epoch, result)
        history.append(result)
    print('Training Completed!!')

    return history

## Hyper parameters 

In [None]:
# Training settings, used when the data loaders are built and fit is called.
num_epochs = 50
opt_func =  torch.optim.Adam
lr = 0.001
batch_size = 8



## Train and Validation Loaders

In [None]:
# The training loader is shuffled; the validation loader keeps dataset order.
train_dataloader = get_dataloaders(train_data,batch_size,True)
val_dataloader = get_dataloaders(val_data,batch_size,False)


## Capturing the history

In [None]:
# Train the model; `history` receives the list returned by fit.
history = fit(num_epochs, lr, model, train_dataloader, val_dataloader, opt_func)

## Plotting the accuracies

In [None]:
def plot_accuracies(history):
    """Plot the ``'val_acc'`` value of every entry in ``history``."""
    val_acc_per_epoch = []
    for record in history:
        val_acc_per_epoch.append(record['val_acc'])
    plt.plot(val_acc_per_epoch, '-x')
    plt.title('Accuracy vs. No. of epochs')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    

plot_accuracies(history)



## Plotting the losses

In [None]:
def plot_losses(history):
    """Plot the training and validation loss stored in ``history``.

    ``'train_loss'`` is read with ``dict.get`` (``None`` when the key is
    missing); ``'val_loss'`` must be present in every entry.
    """
    training_curve = []
    validation_curve = []
    for record in history:
        training_curve.append(record.get('train_loss'))
        validation_curve.append(record['val_loss'])
    # Blue crosses: training; red crosses: validation.
    plt.plot(training_curve, '-bx')
    plt.plot(validation_curve, '-rx')
    plt.title('Loss vs. No. of epochs')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['Training', 'Validation'])

plot_losses(history)

## Save my model 

In [None]:
# Save the model object itself (not only its state_dict) to the working directory.
torch.save(model,'/kaggle/working/ICRClassification.pt')

## Class test dataset

In [None]:
class Data_Test(Dataset):
    """Label-free dataset that serves one feature row per index."""

    def __init__(self, X_test_data):
        # The container is stored as given.
        self.X_test_data = X_test_data

    def __len__(self):
        """Return the number of stored rows."""
        rows = self.X_test_data
        return len(rows)

    def __getitem__(self, index):
        """Return the row stored at ``index``."""
        row = self.X_test_data[index]
        return row

In [None]:
# Load the sample submission file and index it by 'Id'.
submission_df = get_datasets(submission_path).set_index("Id")

## Making the predictions on the test data

In [None]:
def make_predictions(model):
    """Predict class probabilities for every row of the test CSV.

    The test file is read from ``test_path``, its ``EJ`` column is encoded
    and cast to float, the features go through ``preprocessing`` and the
    model is applied one row at a time.

    Parameters
    ----------
    model : torch.nn.Module
        Trained classifier that returns two logits per row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Id``, ``class_0`` and ``class_1``; the last two hold the
        softmax probabilities of the two classes.
    """
    test_df = get_datasets(test_path)
    test = to_numerical_ecoding(test_df,'EJ')
    test['EJ'] = test['EJ'].astype(float)
    testing_data = Data_Test(convert_to_torch(preprocessing(test)))
    test_dataloader = DataLoader(testing_data, batch_size=1, shuffle=False)

    # Inputs must live on the same device as the model weights.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    # Use the real identifiers: the frame still has its default integer
    # index here, so test.index would only give row numbers.
    if 'Id' in test.columns:
        Ids = test['Id'].to_numpy(dtype=object)
    else:
        Ids = test.index.to_numpy(dtype=object)
    Class_0 = np.empty(shape=(len(test),))
    Class_1 = np.empty(shape=(len(test),))

    with torch.no_grad():
        for i,features in tqdm(enumerate(test_dataloader)):
            output = model(features.to(model_device))
            # Drop only the batch dimension (the batch size is 1).
            probs=F.softmax(output, dim=1).squeeze(0).detach().cpu().numpy()
            Class_0[i] = probs[0]
            # The probability of class 1 is the second softmax entry.
            Class_1[i] = probs[1]

    submission = pd.DataFrame(data={"Id" : Ids, "class_0" : Class_0, "class_1" : Class_1})
    print("Prediction Completed")
    return submission

## Load my model 

In [None]:
# The checkpoint holds a complete pickled module rather than a state_dict,
# so it cannot be read with weights_only=True (the default in recent
# PyTorch releases). The file was written by this notebook, so full
# unpickling is acceptable here; do not load untrusted files this way.
# map_location places the weights on the active device even when the file
# was saved from a different one.
model_test = torch.load(
    '/kaggle/working/ICRClassification.pt',
    map_location=device,
    weights_only=False,
)
model_test.to(device)


## Making the predictions

In [None]:
# Rebind submission_df to the predictions produced by the reloaded model.
submission_df =  make_predictions(model_test)

## Submitting the results

In [None]:
# Write the predictions without the DataFrame index column.
submission_df.to_csv("/kaggle/working/submission.csv", index=False)
print("Submission Completed")