<a href="https://www.kaggle.com/code/averma111/pytorch-widedeep-pss3e18?scriptVersionId=134978857" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [50]:
%%capture
!pip install pytorch-widedeep

In [51]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import os


import torch
from pytorch_widedeep.preprocessing import WidePreprocessor, TabPreprocessor
from pytorch_widedeep.training import Trainer
from pytorch_widedeep.models import Wide, TabMlp, WideDeep
from pytorch_widedeep.metrics import Accuracy, Precision,F1Score,Recall

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler


# NOTE: matplotlib.pyplot is already imported earlier in this cell; this
# second import is redundant but harmless.
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'fivethirtyeight' style sheet to matplotlib figures created from here on.
plt.style.use('fivethirtyeight')
import plotly.express as px

import warnings
# Silences every warning for the rest of the session, including deprecation
# notices from the libraries imported above.
warnings.filterwarnings('ignore')
import itertools

In [52]:
class Datapreparation(object):
    """Loads CSV files from a root directory and summarises DataFrames.

    Parameters
    ----------
    root_path : str
        Directory that contains the CSV files to be read.
    """

    def __init__(self,root_path):
        self.root_path = root_path

    def get_dataframe(self,filename):
        """Read ``filename`` (relative to ``root_path``) into a DataFrame."""
        return pd.read_csv(os.path.join(self.root_path,filename))

    def summary(self,text, df):
        """Build a per-column summary table for ``df``.

        Parameters
        ----------
        text : str
            Label for the frame being summarised. It is not used in the
            computation; it is kept so existing callers keep working.
        df : pandas.DataFrame
            Frame to summarise.

        Returns
        -------
        pandas.DataFrame
            One row per column of ``df`` with the columns ``dtypes``,
            ``null``, ``unique``, ``min``, ``median``, ``max``, ``mean``,
            ``std`` and ``duplicate``. ``duplicate`` holds the number of
            fully duplicated rows in ``df``, repeated on every row.
        """
        summary = pd.DataFrame(df.dtypes, columns=['dtypes'])
        summary['null'] = df.isnull().sum()
        summary['unique'] = df.nunique()
        summary['min'] = df.min()
        # median/mean/std are only defined for numeric data. Without
        # numeric_only=True, pandas >= 2.0 raises TypeError as soon as the
        # frame has a string/object column; with it, those rows are NaN.
        summary['median'] = df.median(numeric_only=True)
        summary['max'] = df.max()
        summary['mean'] = df.mean(numeric_only=True)
        summary['std'] = df.std(numeric_only=True)
        summary['duplicate'] = df.duplicated().sum()
        return summary
    
    

    
# Point the loader at the competition's input directory and read the
# training split.
data = Datapreparation(root_path='/kaggle/input/playground-series-s3e18')
train = data.get_dataframe(filename='train.csv')


In [53]:
# Per-column overview of the training frame (dtype, nulls, cardinality, basic stats).
data.summary(text='train', df=train)

Unnamed: 0,dtypes,null,unique,min,median,max,mean,std,duplicate
id,int64,0,14838,0.0,7418.5,14837.0,7418.5,4283.505982,0
BertzCT,float64,0,2368,0.0,290.987941,4069.95978,515.153604,542.45637,0
Chi1,float64,0,1259,0.0,6.48527,69.551167,9.135189,6.819989,0
Chi1n,float64,0,3157,0.0,4.052701,50.174588,5.854307,4.647064,0
Chi1v,float64,0,3306,0.0,4.392859,53.431954,6.738497,5.866444,0
Chi2n,float64,0,3634,0.0,2.970427,32.195368,4.43257,3.760516,0
Chi2v,float64,0,3725,0.0,3.242775,34.579313,5.253221,4.925065,0
Chi3v,float64,0,3448,0.0,1.948613,22.880836,3.418749,3.436208,0
Chi4n,float64,0,2930,0.0,1.073261,16.07281,1.773472,1.865898,0
EState_VSA1,float64,0,719,0.0,17.353601,363.705954,29.202823,31.728679,0


In [54]:
# Columns fed to the model as categorical embeddings.
cat_embed_cols = [
    'fr_COO',
    'fr_COO2',
    'NumHeteroatoms',
]

# Columns fed to the model as continuous features.
continuous_cols = [
    'BertzCT', 'Chi1', 'Chi1n', 'Chi1v',
    'Chi2n', 'Chi2v', 'Chi3v', 'Chi4n',
    'EState_VSA1', 'EState_VSA2', 'ExactMolWt',
    'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3',
    'HallKierAlpha', 'HeavyAtomMolWt', 'Kappa3',
    'MaxAbsEStateIndex', 'MinEStateIndex',
    'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8',
    'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3', 'VSA_EState9',
]

# Target: one column per label, extracted as a 2-D array with six columns.
target_col = ['EC1', 'EC2', 'EC3', 'EC4', 'EC5', 'EC6']
target = train[target_col].values

In [56]:
# Deep-tabular preprocessing: fit on the training frame and encode it in one go.
tab_preprocessor = TabPreprocessor(
    embed_cols=cat_embed_cols,
    continuous_cols=continuous_cols,
)
X_tab = tab_preprocessor.fit_transform(train)

# Show the per-column embedding specification that is later handed to TabMlp.
tab_preprocessor.cat_embed_input

[('fr_COO', 8, 5), ('fr_COO2', 8, 5), ('NumHeteroatoms', 40, 13)]

In [69]:
# Deep component: an MLP over the categorical embeddings and continuous columns.
tab_mlp = TabMlp(
    column_idx=tab_preprocessor.column_idx,
    cat_embed_input=tab_preprocessor.cat_embed_input,
    cat_embed_dropout=0.1,
    continuous_cols=continuous_cols,
    mlp_hidden_dims=[4, 2],
    mlp_dropout=0.5,
    mlp_activation="leaky_relu",
)

# Wrap the MLP in a WideDeep model with six outputs (one per entry of target_col).
tab_model = WideDeep(deeptabular=tab_mlp, pred_dim=6)

# Display the assembled architecture.
tab_model

WideDeep(
  (deeptabular): Sequential(
    (0): TabMlp(
      (cat_and_cont_embed): DiffSizeCatAndContEmbeddings(
        (cat_embed): DiffSizeCatEmbeddings(
          (embed_layers): ModuleDict(
            (emb_layer_fr_COO): Embedding(9, 5, padding_idx=0)
            (emb_layer_fr_COO2): Embedding(9, 5, padding_idx=0)
            (emb_layer_NumHeteroatoms): Embedding(41, 13, padding_idx=0)
          )
          (embedding_dropout): Dropout(p=0.1, inplace=False)
        )
        (cont_norm): BatchNorm1d(28, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (encoder): MLP(
        (mlp): Sequential(
          (dense_layer_0): Sequential(
            (0): Dropout(p=0.5, inplace=False)
            (1): Linear(in_features=51, out_features=4, bias=True)
            (2): LeakyReLU(negative_slope=0.01, inplace=True)
          )
          (dense_layer_1): Sequential(
            (0): Dropout(p=0.5, inplace=False)
            (1): Linear(in_features=4, out_feature

In [72]:
# Trainer wiring: Adam optimiser over all model parameters plus four
# classification metrics.
# NOTE(review): `target` is built from six label columns, i.e. it looks like a
# multi-label problem, while objective="multiclass" is normally used with a
# single integer class label per row. Confirm the intended objective.
tab_trainer = Trainer(
    model=tab_model,
    objective="multiclass",
    optimizers=torch.optim.Adam(tab_model.parameters(), lr=0.001),
    metrics=[Accuracy, Precision, F1Score, Recall],
)

In [73]:
# Train for 10 epochs, holding out 20% of the rows for validation.
# NOTE(review): this call currently fails with the ValueError shown in the
# output ("least populated class in y has only 1 member"). Presumably the
# validation split is stratified on the six-column target. Verify the target
# shape and the objective before re-running.
tab_trainer.fit(
    X_tab=X_tab,
    target=target,
    n_epochs=10,
    batch_size=32,
    val_split=0.2,
)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
# Load the test split. Column names are deliberately left untouched:
# `tab_preprocessor` was fitted on the un-renamed training columns, so the test
# frame must expose the same names. The former `data.rename_column(test)` call
# was removed because `Datapreparation` defines no such method.
test = data.get_dataframe('test.csv')

# Later cells refer to the test frame as `test_updated`; bind that name here
# so they do not fail with a NameError.
test_updated = test

test.head()

In [None]:
# Encode the test frame with the preprocessor fitted on the training data and
# predict probabilities. `test` is the frame read from test.csv; the name used
# here before (`test_updated`) was never assigned in the notebook.
X_tab_te = tab_preprocessor.transform(test)
preds = tab_trainer.predict_proba(X_tab=X_tab_te)
print(f'The  probabilities are {preds}')

In [None]:
class Submit:
    """Builds the submission table and writes it to disk."""

    def submit_predictions(self, test_updated, predictions=None,
                           target_names=('EC1', 'EC2'), path='submission.csv'):
        """Write ``predictions`` for the rows of ``test_updated`` to a CSV file.

        Parameters
        ----------
        test_updated : pandas.DataFrame
            Test frame; must contain an ``id`` column.
        predictions : array-like
            Predicted scores with one row per test row. A 1-D input is treated
            as a single column. Column ``k`` is written under
            ``target_names[k]``; extra columns are ignored.
        target_names : sequence of str
            Names of the prediction columns in the submission file.
            NOTE(review): defaults to EC1/EC2 on the assumption that the
            competition scores those two labels; confirm against
            sample_submission.csv.
        path : str
            Destination of the CSV file.

        Returns
        -------
        pandas.DataFrame
            The table that was written.

        Raises
        ------
        ValueError
            If ``predictions`` is missing, has a different number of rows than
            ``test_updated``, or has fewer columns than ``target_names``.
        """
        if predictions is None:
            # The previous implementation read an undefined global
            # (`prediction_1`); require the values explicitly instead.
            raise ValueError('predictions must be provided')

        scores = np.asarray(predictions)
        if scores.ndim == 1:
            scores = scores.reshape(-1, 1)
        if scores.shape[0] != len(test_updated):
            raise ValueError(
                f'predictions has {scores.shape[0]} rows but the test frame has {len(test_updated)}'
            )
        if scores.shape[1] < len(target_names):
            raise ValueError(
                f'predictions has {scores.shape[1]} columns but {len(target_names)} target names were given'
            )

        df_submit = pd.DataFrame(data={'id': test_updated['id']})
        for position, name in enumerate(target_names):
            df_submit[name] = scores[:, position]

        df_submit.to_csv(path, index=False)
        print('Submission Completed!!')
        return df_submit


# `test` is the frame read from test.csv and `preds` the probabilities produced
# by the prediction cell. Assumes the columns of `preds` follow the order of
# `target_col` (EC1 first); TODO confirm.
submit = Submit()
df_submit = submit.submit_predictions(test, preds)