<a href="https://www.kaggle.com/code/averma111/pycaret-emission-ps3e20?scriptVersionId=139857423" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [10]:
%%capture
!pip install pycaret[full]

In [11]:
import numpy as np # linear algebra
import pandas as pd
import pycaret
from pycaret.regression import *
from pycaret.regression import RegressionExperiment
import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


/kaggle/input/playground-series-s3e20/sample_submission.csv
/kaggle/input/playground-series-s3e20/train.csv
/kaggle/input/playground-series-s3e20/test.csv


In [12]:
class DataAcquisition:
    """Load the competition CSV files and provide small EDA helpers.

    The plotting and scaling helpers import matplotlib, seaborn and
    scikit-learn locally, because the notebook's import cell does not bring
    ``plt``, ``sns`` or ``MinMaxScaler`` into scope.
    """

    # Default locations of the competition files on Kaggle.
    TRAIN_PATH = '/kaggle/input/playground-series-s3e20/train.csv'
    TEST_PATH = '/kaggle/input/playground-series-s3e20/test.csv'

    def __init__(self):
        # Placeholders for data splits; no method of this class assigns them.
        self.X_train = None
        self.X_val = None
        self.y_train = None
        self.y_val = None
        self.X_test = None

    @staticmethod
    def _read_csv(path):
        """Read one comma-separated file into a DataFrame."""
        # `infer_datetime_format` was dropped: it is deprecated in pandas 2.x
        # and never had an effect without `parse_dates`.
        return pd.read_csv(path, sep=',', low_memory=False)

    def prepare_data(self, path=None):
        """Return the training data as a DataFrame.

        Args:
            path: Optional CSV path; defaults to ``TRAIN_PATH``.
        """
        return self._read_csv(self.TRAIN_PATH if path is None else path)

    def prepare_data_test(self, path=None):
        """Return the test data as a DataFrame.

        Args:
            path: Optional CSV path; defaults to ``TEST_PATH``.
        """
        return self._read_csv(self.TEST_PATH if path is None else path)

    def new_features(self, df):
        """Placeholder for feature engineering; currently does nothing."""
        pass

    def summary(self, text, df):
        """Print the shape of ``df`` and return a per-column summary table.

        Args:
            text: Label printed in front of the shape.
            df: DataFrame to summarise.

        Returns:
            DataFrame indexed by the columns of ``df`` holding dtype, null
            count, unique count, min/median/max/mean/std and the number of
            duplicated rows (the same value repeated on every row). The
            numeric statistics are NaN for non-numeric columns.
        """
        print(f'{text} shape: {df.shape}')
        summ = df.dtypes.to_frame('dtypes')
        summ['null'] = df.isnull().sum()
        summ['unique'] = df.nunique()
        # numeric_only=True: on pandas >= 2.0 these reductions raise TypeError
        # as soon as the frame contains a string column.
        summ['min'] = df.min(numeric_only=True)
        summ['median'] = df.median(numeric_only=True)
        summ['max'] = df.max(numeric_only=True)
        summ['mean'] = df.mean(numeric_only=True)
        summ['std'] = df.std(numeric_only=True)
        summ['duplicate'] = df.duplicated().sum()
        return summ

    def _get_numerical_features(self, df):
        """Return the sub-frame of ``float64`` columns (integers are excluded)."""
        return df.select_dtypes(include=['float64'])

    def plot_numerical_histogram(self, df, n_cols=15, figsize=(5, 15)):
        """Draw one histogram per ``float64`` column of ``df``.

        Args:
            df: DataFrame to plot.
            n_cols: Number of subplot columns in the grid.
            figsize: Figure size in inches passed to ``plt.subplots``.

        The number of rows is derived from the number of plotted columns, so
        the grid can no longer be too small for the data. Nothing is drawn
        when ``df`` has no ``float64`` column.
        """
        import math

        import matplotlib.pyplot as plt
        import seaborn as sns

        columns = list(self._get_numerical_features(df).columns)
        if not columns:
            return

        n_rows = math.ceil(len(columns) / n_cols)
        # squeeze=False keeps a 2-D array of axes even for a 1x1 grid.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, dpi=90, squeeze=False)
        axes = axes.flatten()

        for axis, column in zip(axes, columns):
            sns.histplot(df[column], ax=axis, color='r')
            axis.set_title(f'{column} Distribution', size=5)
            axis.set_xlabel(None)
            axis.set_ylabel(None)

        # Hide the grid cells that received no histogram.
        for axis in axes[len(columns):]:
            axis.set_visible(False)

        fig.suptitle('Distribution of Numerical Feature', fontsize=8)
        plt.tight_layout()

    def show_correlation(self, dataset, column_name, cmap):
        """Show a heatmap of the Kendall correlation between numeric columns.

        Args:
            dataset: DataFrame whose numeric columns are correlated.
            column_name: Currently unused (the figure title is disabled);
                kept so existing callers keep working.
            cmap: Colormap forwarded to ``sns.heatmap``.
        """
        import matplotlib.pyplot as plt
        import seaborn as sns

        # numeric_only=True: string columns make corr() raise on pandas >= 2.0.
        corr = dataset.corr(method='kendall', numeric_only=True)
        plt.figure(figsize=(10, 10), dpi=90)
        # Boolean mask covering the diagonal and the upper triangle.
        mask = np.triu(np.ones_like(corr, dtype=bool))
        sns.heatmap(corr, mask=mask, cmap=cmap, annot=True, annot_kws={'size': 12})
        plt.show()

    def standardization_data(self, X_data):
        """Return ``X_data`` transformed by a freshly fitted ``MinMaxScaler``.

        Despite the method name this is min-max scaling, not z-score
        standardisation. The fitted scaler is not kept.
        """
        from sklearn.preprocessing import MinMaxScaler

        scaler = MinMaxScaler()
        return scaler.fit_transform(X_data)

    def preprocessing(self, df):
        """Placeholder for training-data preprocessing; currently does nothing."""
        pass

    def removenan(self, df, column):
        """Replace missing values of ``df[column]`` with the column mean.

        ``df`` is modified in place and nothing is returned. Assignment is
        used instead of a chained ``fillna(..., inplace=True)``, which does
        not update ``df`` when pandas Copy-on-Write is active.
        """
        df[column] = df[column].fillna(df[column].mean())

    def preprocessing_test(self, df):
        """Placeholder for test-data preprocessing; currently does nothing."""
        pass
    
        
# Create the loader, then read the training and the test file (in that order).
acq = DataAcquisition()

train_df, test_df = acq.prepare_data(), acq.prepare_data_test()

In [13]:
# Initialise the PyCaret regression experiment on the training frame with
# 'emission' as target; the fixed session_id keeps runs repeatable.
# NOTE(review): the setup summary reports one categorical feature — if that is
# a row identifier, consider passing it via `ignore_features`; confirm first.
s = setup(
    data=train_df,
    target='emission',
    session_id=42,
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,emission
2,Target type,Regression
3,Original data shape,"(79023, 76)"
4,Transformed data shape,"(79023, 76)"
5,Transformed train set shape,"(55316, 76)"
6,Transformed test set shape,"(23707, 76)"
7,Numeric features,74
8,Categorical features,1
9,Rows with missing values,99.4%


In [None]:
# Compare the baseline models with default settings and keep whatever
# compare_models() returns as `best`; every later cell works on that object.
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lr,Linear Regression,73.7573,21737.5817,147.0069,-0.0002,1.9453,102.5632,0.598
lasso,Lasso Regression,73.7573,21737.5682,147.0068,-0.0002,1.9453,102.5627,0.293
ridge,Ridge Regression,73.7573,21737.5816,147.0069,-0.0002,1.9453,102.5632,0.293
en,Elastic Net,73.7563,21737.3654,147.0061,-0.0002,1.9453,102.555,0.353
lar,Least Angle Regression,73.7573,21737.5817,147.0069,-0.0002,1.9453,102.5632,0.29
llar,Lasso Least Angle Regression,73.7573,21737.5675,147.0068,-0.0002,1.9453,102.5627,0.287
omp,Orthogonal Matching Pursuit,73.7573,21737.5817,147.0069,-0.0002,1.9453,102.5632,0.296
br,Bayesian Ridge,73.7573,21737.5817,147.0069,-0.0002,1.9453,102.5632,0.288
dt,Decision Tree Regressor,73.7575,21737.6067,147.0069,-0.0002,1.9453,102.5643,0.31
huber,Huber Regressor,67.0249,22958.4471,151.0971,-0.0569,1.7247,52.2872,0.281


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [None]:
# Residual plot for the selected model.
plot_model(estimator=best, plot='residuals')

In [None]:
# Prediction-error plot for the selected model.
plot_model(estimator=best, plot='error')

In [None]:
# Feature-importance plot for the selected model.
plot_model(estimator=best, plot='feature')

In [None]:
# Score the experiment's hold-out split (no `data` argument is passed).
holdout_pred = predict_model(estimator=best)

In [None]:
# Score the competition test frame and preview the first rows of the result.
predictions = predict_model(estimator=best, data=test_df)
predictions.head()

In [None]:
# Persist the selected model under the name 'emission_pipeline'.
save_model(model=best, model_name='emission_pipeline')

In [None]:
# Load the pipeline back. The name must match the one given to save_model
# ('emission_pipeline'); the previous 'my_first_pipeline' was never written
# by this notebook, so loading it fails on a fresh run.
loaded_best_pipeline = load_model('emission_pipeline')
loaded_best_pipeline