<a href="https://www.kaggle.com/code/averma111/pytorch-ps3e15?scriptVersionId=129878852" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
%%capture 
!pip install optuna

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import torch
from tqdm.notebook import tqdm
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
warnings.filterwarnings('ignore')
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
sns.set_style("darkgrid")
pd.set_option('mode.chained_assignment',None)

In [None]:
def get_dataframe(path):
    df=pd.read_csv(path)
    return df

In [None]:
data = get_dataframe('/kaggle/input/playground-series-s3e15/data.csv')
original = get_dataframe('/kaggle/input/predicting-heat-flux/Data_CHF_Zhao_2020_ATE.csv')

In [None]:
def summary(text, df):
    print(f'{text} shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['dtypes'])
    summ['null'] = df.isnull().sum()
    summ['unique'] = df.nunique()
    summ['min'] = df.min()
    summ['median'] = df.median()
    summ['max'] = df.max()
    summ['mean'] = df.mean()
    summ['std'] = df.std()
    summ['duplicate'] = df.duplicated().sum()
    return summ

In [None]:
summary('data',data)

In [None]:
summary('original',original)

In [None]:
sns.histplot(data,x='x_e_out [-]',color='r')

In [None]:
sns.histplot(original,x='x_e_out [-]',color='b')

In [None]:
def get_numerical_features(df):
    numerical_feature = df.select_dtypes(include=['float64'])
    return numerical_feature

numerical_features = get_numerical_features(data)
numerical_features.head()

In [None]:
def get_categorical_features(df):
    categorical_features = df.select_dtypes(include=['object'])
    return categorical_features

categorical_features = get_categorical_features(data)
categorical_features.head()

In [None]:
def plot_numerical_histogram():
    fig, ax = plt.subplots(7, 1, figsize = (5, 15), dpi = 90)
    ax = ax.flatten()

    for i, column in enumerate(numerical_features):
        sns.histplot(data[column], ax=ax[i], color='r')
        sns.histplot(original[column], ax=ax[i], color='b')
    
        ax[i].set_title(f'{column} Distribution', size = 5)
        ax[i].set_xlabel(None)
        ax[i].set_ylabel(None)
    
    fig.suptitle('Distribution of Numerical Feature', fontsize = 8)
    plt.tight_layout()
    
plot_numerical_histogram()

In [None]:
def plot_categorical_data(df,column_name,palette,dataset_name):
    fig, ax = plt.subplots(1, 1, figsize = (12, 4))
    #ax = ax.flatten()
    sns.countplot(data = df, y = column_name, ax = ax, palette = palette, 
                  order = data[column_name].value_counts().index)
    ax.yaxis.label.set_size(20)
    plt.yticks(fontsize = 12)
    ax.set_xlabel('Count', fontsize = 20)
    ax.set_ylabel(None)
    plt.xticks(fontsize = 12)

    fig.suptitle(f'{column_name.title()} in {dataset_name} Dataset', fontsize = 15, fontweight = 'bold')
    plt.tight_layout()

In [None]:
plot_categorical_data(data,'author','flare','competition')

In [None]:
plot_categorical_data(data,'geometry','flare','competition')

In [None]:
plot_categorical_data(original,'author','ch:s=.25,rot=-.25','original')

In [None]:
plot_categorical_data(original,'geometry','ch:s=.25,rot=-.25','original')

In [None]:
def show_correlation(dataset, column_name,cmap):
    corr = dataset.corr(method = 'kendall')
    plt.figure(figsize = (10, 10), dpi = 90)
    mask = np.zeros_like(corr)
    mask[np.triu_indices_from(mask)] = True
    sns.heatmap(corr, mask = mask, cmap = cmap, annot = True, annot_kws = {'size' : 12})
    plt.title(f'{column_name} Dataset Correlation Matrix\n', fontsize = 15, weight = 'bold')
    plt.show()

In [None]:
show_correlation(data[numerical_features.columns],'Competition','flare')

In [None]:
show_correlation(original[numerical_features.columns],'Original','coolwarm')

In [None]:
def generate_train_test_data(data,original):
    
    feature = data[['pressure [MPa]', 'mass_flux [kg/m2-s]', 'chf_exp [MW/m2]', 'length [mm]']].copy()
    feature['mass_flux_missing'] = np.where(feature['mass_flux [kg/m2-s]'].isnull(), 1, 0)
    feature['pressure_missing'] = np.where(feature['pressure [MPa]'].isnull(), 1, 0)
    feature['chf_missing'] = np.where(feature['chf_exp [MW/m2]'].isnull(), 1, 0)
    feature['generated'] = 1

    feature_org = original[['pressure [MPa]', 'mass_flux [kg/m2-s]', 'chf_exp [MW/m2]', 'length [mm]']].copy()
    feature_org['mass_flux_missing'] = np.where(feature_org['mass_flux [kg/m2-s]'].isnull(), 1, 0)
    feature_org['pressure_missing'] = np.where(feature_org['pressure [MPa]'].isnull(), 1, 0)
    feature_org['chf_missing'] = np.where(feature_org['chf_exp [MW/m2]'].isnull(), 1, 0)
    feature_org['generated'] = 0

    label = data['x_e_out [-]']
    label_org = original['x_e_out [-]']

    X = pd.concat([feature, feature_org], axis = 0).reset_index(drop = True)
    y = pd.concat([label, label_org], axis = 0).reset_index(drop = True)

    X.columns = ['pressure', 'mass_flux', 'chf_exp', 'length', 'mass_flux_missing', 'pressure_missing', 'chf_missing', 'generated']

    test = X[y.isnull()]
    X = X[~y.isnull()]
    y = y[~y.isnull()]
    
    return X,y,test


X,y,test=generate_train_test_data(data,original)

In [None]:
def split_data(X,y):
    return train_test_split(X,y.to_numpy(),test_size=0.3,random_state=42)
X_train,X_val,y_train,y_val  = split_data(X,y)

In [None]:
def normalize_dataset(X_train,X_val):
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    return X_train,X_val

X_train,X_val=normalize_dataset(X_train,X_val)
print(X_train.shape,X_val.shape,y_train.shape,y_val.shape)

In [None]:
def convert_to_torch(value):
    return torch.tensor(data=value,dtype=torch.float32,requires_grad=True)


In [None]:
class FluxData(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
            return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

X_data = convert_to_torch(X_train)
y_data = convert_to_torch(y_train)
X_val = convert_to_torch(X_val)
y_val = convert_to_torch(y_val)
train_data = FluxData(X_data,y_data)
test_data = FluxData(X_val,y_val)

In [73]:
class RegressionFluxModel(torch.nn.Module):
    def training_step(self,batch):
        features,label = batch
        output = self(features)
        loss = torch.sqrt(torch.nn.MSELoss(output,label.unsqueeze(1)))
        return loss
    
    def validation_step(self,batch):
        features, labels = batch 
        output = self(features)                    
        loss = torch.sqrt(torch.nn.MSELoss(output,label.unsqueeze(1)))       
        return {'Validation_loss': loss.detach()}
    
    def validation_epoch_end(self, outputs):
        batch_losses = [x['Validation_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   
        return {'Validation_loss': epoch_loss.item()}
    
    
    def epoch_end(self, epoch, result):
        if epoch %10 ==0:
            print("Epoch [{}], Train_loss: {:.5f}, Validation_loss: {:.5f}".format(
                (epoch+10), result['train_loss'], result['Validation_loss']))
        

In [74]:
class RegressionFluxNNet(RegressionFluxModel):
    def __init__(self,input_features):
        super().__init__()
        self.network = torch.nn.Sequential(
        torch.nn.Linear(input_features,1),
        torch.nn.LeakyReLU()
        )
        
    def forward(self,inputs):
        return self.network(inputs)

In [75]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [76]:
model = RegressionFluxNNet(X_train.shape[1])
model.to(device)

RegressionFluxNNet(
  (network): Sequential(
    (0): Linear(in_features=8, out_features=1, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
  )
)

In [77]:
BATCH_SIZE = 32
EPOCHS = 100
LEARNING_RATE = 0.1
MOMENTUM = 0.9
OPT_FUNC= torch.optim.SGD

In [78]:
def get_dataloaders(dataset_type,batch,shuffle):
    if shuffle:
         return DataLoader(dataset=dataset_type, batch_size=batch, shuffle=True)
    else:
        return DataLoader(dataset=dataset_type, batch_size=batch,shuffle=False)

In [79]:
@torch.no_grad()
def evaluate(model, test_loader):
    model.eval()
    outputs = [model.validation_step(batch) for batch in test_loader]
    return model.validation_epoch_end(outputs)

  
def fit(epochs, lr, model, train_loader, test_loader, opt_func):
    model.train()
    history = []
    optimizer = opt_func(model.parameters(),lr,MOMENTUM)
    for epoch in tqdm(range(epochs)): 
        train_losses = []
        for batch in train_loader:
            optimizer.zero_grad()
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
            train_losses.append(loss)
            
        result = evaluate(model, test_loader)
        result['Train_loss'] = torch.stack(train_losses).mean().item()
        model.epoch_end(epoch, result)
        history.append(result)
    print('Training is completed!!')
    return history

In [80]:
train_dataloader = get_dataloaders(train_data,BATCH_SIZE,True)
test_dataloader = get_dataloaders(test_data,BATCH_SIZE,False)
history = fit(EPOCHS, LEARNING_RATE, model, train_dataloader, test_dataloader,OPT_FUNC)

  0%|          | 0/100 [00:00<?, ?it/s]

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [None]:
def plot_losses(history):
    """ Plot the losses in each epoch"""
    train_losses = [x.get('train_loss') for x in history]
    test_losses = [x['test_loss'] for x in history]
    plt.plot(train_losses, '-bx')
    plt.plot(test_losses, '-rx')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['Training', 'Testing'])
    plt.title('Loss vs. No. of epochs');