<a href="https://www.kaggle.com/code/averma111/pytorch-ps3e15?scriptVersionId=129862237" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
%%capture 
!pip install optuna

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import torch
from tqdm.notebook import tqdm
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
# Silence all Python warnings raised from here on.
warnings.filterwarnings('ignore')
# Seed the PyTorch and NumPy random number generators with the same fixed value.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
# Use seaborn's "darkgrid" style for the plots.
sns.set_style("darkgrid")
# Turn off the pandas chained-assignment check.
pd.set_option('mode.chained_assignment',None)

In [None]:
def get_dataframe(path):
    """Read the CSV file at ``path`` and return its contents as a DataFrame."""
    return pd.read_csv(path)

In [None]:
# `data` is the playground-series-s3e15 CSV; `original` is the
# predicting-heat-flux CSV that the later cells compare it against.
data = get_dataframe('/kaggle/input/playground-series-s3e15/data.csv')
original = get_dataframe('/kaggle/input/predicting-heat-flux/Data_CHF_Zhao_2020_ATE.csv')

In [None]:
def summary(text, df):
    """Print the shape of ``df`` and return a per-column summary table.

    Args:
        text: Label printed in front of the shape.
        df: DataFrame to describe.

    Returns:
        A DataFrame indexed by the columns of ``df`` with the columns
        ``dtypes``, ``null``, ``unique``, ``min``, ``median``, ``max``,
        ``mean``, ``std`` and ``duplicate``. The five statistics are NaN for
        non-numeric columns. ``duplicate`` is the number of duplicated rows
        in the whole frame, repeated on every line.
    """
    print(f'{text} shape: {df.shape}')
    summ = df.dtypes.to_frame(name='dtypes')
    summ['null'] = df.isnull().sum()
    summ['unique'] = df.nunique()
    # numeric_only=True keeps the reductions away from string columns, where
    # recent pandas raises TypeError instead of silently skipping them.
    # Columns without a statistic are left as NaN by index alignment.
    for stat in ('min', 'median', 'max', 'mean', 'std'):
        summ[stat] = getattr(df, stat)(numeric_only=True)
    summ['duplicate'] = df.duplicated().sum()
    return summ

In [None]:
# Per-column summary of the competition data.
summary('data',data)

In [None]:
# Per-column summary of the original data, for comparison with the one above.
summary('original',original)

In [None]:
# Histogram of the 'x_e_out [-]' column in the competition data (red).
sns.histplot(data,x='x_e_out [-]',color='r')

In [None]:
# Histogram of the 'x_e_out [-]' column in the original data (blue).
sns.histplot(original,x='x_e_out [-]',color='b')

In [None]:
def get_numerical_features(df):
    """Return the sub-frame of ``df`` made of its ``float64`` columns only."""
    return df.select_dtypes(include=['float64'])

# Float64 columns of the competition data; later cells reuse these column
# names for the histograms and the correlation matrices.
numerical_features = get_numerical_features(data)
numerical_features.head()

In [None]:
def get_categorical_features(df):
    """Return the sub-frame of ``df`` made of its ``object``-dtype columns only."""
    return df.select_dtypes(include=['object'])

# Object-dtype columns of the competition data.
categorical_features = get_categorical_features(data)
categorical_features.head()

In [None]:
def plot_numerical_histogram():
    """Overlay per-feature histograms of the competition and original data.

    Draws one subplot per column of the module-level ``numerical_features``
    frame, plotting ``data[column]`` in red and ``original[column]`` in blue
    on the same axes. Takes no arguments and returns nothing.
    """
    columns = list(numerical_features.columns)
    if not columns:
        # Nothing to draw, and a zero-row subplot grid cannot be created.
        return

    # One row per feature instead of a fixed 7, so the grid always matches
    # the columns. squeeze=False keeps `axes` an array even for a single row.
    fig, axes = plt.subplots(len(columns), 1, figsize=(5, 15), dpi=90, squeeze=False)

    for axis, column in zip(axes.flatten(), columns):
        sns.histplot(data[column], ax=axis, color='r')
        sns.histplot(original[column], ax=axis, color='b')

        axis.set_title(f'{column} Distribution', size=5)
        axis.set_xlabel(None)
        axis.set_ylabel(None)

    fig.suptitle('Distribution of Numerical Feature', fontsize=8)
    plt.tight_layout()
    
# Draw the overlaid competition/original histograms for the float64 columns.
plot_numerical_histogram()

In [None]:
def plot_categorical_data(df,column_name,palette,dataset_name):
    """Draw a horizontal count plot of one categorical column.

    Args:
        df: DataFrame holding the column to plot.
        column_name: Name of the categorical column in ``df``.
        palette: Seaborn palette passed to ``sns.countplot``.
        dataset_name: Dataset label used in the figure title.
    """
    fig, ax = plt.subplots(1, 1, figsize = (12, 4))
    # Order the bars by the counts of the frame being plotted. Taking the
    # order from the global `data` dropped categories that only exist in `df`.
    category_order = df[column_name].value_counts().index
    sns.countplot(data = df, y = column_name, ax = ax, palette = palette,
                  order = category_order)
    ax.yaxis.label.set_size(20)
    plt.yticks(fontsize = 12)
    ax.set_xlabel('Count', fontsize = 20)
    ax.set_ylabel(None)
    plt.xticks(fontsize = 12)

    fig.suptitle(f'{column_name.title()} in {dataset_name} Dataset', fontsize = 15, fontweight = 'bold')
    plt.tight_layout()

In [None]:
# Count plot of 'author' in the competition data.
plot_categorical_data(data,'author','flare','competition')

In [None]:
# Count plot of 'geometry' in the competition data.
plot_categorical_data(data,'geometry','flare','competition')

In [None]:
# Count plot of 'author' in the original data, drawn with a cubehelix palette.
plot_categorical_data(original,'author','ch:s=.25,rot=-.25','original')

In [None]:
# Count plot of 'geometry' in the original data, drawn with a cubehelix palette.
plot_categorical_data(original,'geometry','ch:s=.25,rot=-.25','original')

In [None]:
def show_correlation(dataset, column_name,cmap):
    """Show a lower-triangle heatmap of the Kendall correlations in ``dataset``.

    Args:
        dataset: DataFrame whose pairwise column correlations are plotted.
        column_name: Label placed at the start of the plot title.
        cmap: Colormap passed to ``sns.heatmap``.
    """
    kendall = dataset.corr(method='kendall')
    # 1.0 on and above the diagonal, 0.0 below it: the heatmap hides the
    # masked cells, leaving only the strictly lower triangle visible.
    hidden = np.triu(np.ones_like(kendall))

    plt.figure(figsize=(10, 10), dpi=150)
    sns.heatmap(
        kendall,
        mask=hidden,
        cmap=cmap,
        annot=True,
        annot_kws={'size': 12},
    )
    plt.title(f'{column_name} Dataset Correlation Matrix\n', fontsize=15, weight='bold')
    plt.show()

In [None]:
# Kendall correlation heatmap over the float64 columns of the competition data.
show_correlation(data[numerical_features.columns],'Competition','flare')

In [None]:
# Same heatmap for the original data; the columns are selected by the names
# taken from the competition data's float64 columns.
show_correlation(original[numerical_features.columns],'Original','coolwarm')

In [None]:
def preprocessing_data_categorical(df):
    """Fill missing values in each object-dtype column with that column's mode.

    Each column is imputed on its own, and numeric columns are left
    untouched. The previous frame-wide ``fillna`` wrote the first categorical
    column's mode into every missing cell of the frame, numeric ones included.

    Args:
        df: DataFrame to impute; modified in place.

    Returns:
        The same ``df`` object, for convenience.
    """
    # Same selection as get_categorical_features(), inlined so the function
    # does not depend on another cell having been run.
    for column in df.select_dtypes(include=['object']).columns:
        counts = df[column].value_counts()
        if counts.empty:
            # Column holds only missing values, so there is no mode to fill with.
            continue
        # value_counts() sorts by descending frequency: index[0] is the mode.
        df[column] = df[column].fillna(counts.index[0])
    return df

    


In [None]:
def preprocessing_data_numerical(df):
    """Fill missing values in each float64 column with that column's mean.

    Each column is imputed on its own, and non-float columns are left
    untouched. The previous frame-wide ``fillna`` wrote the first float
    column's mean into every missing cell of the frame.

    Args:
        df: DataFrame to impute; modified in place.

    Returns:
        The same ``df`` object, for convenience.
    """
    # Same selection as get_numerical_features(), inlined so the function
    # does not depend on another cell having been run.
    for column in df.select_dtypes(include=['float64']).columns:
        # mean() skips NaN; for an all-NaN column it is NaN and nothing changes.
        df[column] = df[column].fillna(df[column].mean())
    return df


In [None]:
# Fill missing values in the competition data; the function mutates `data`
# in place and also returns it.
preprocessing_data_categorical(data)