<a href="https://www.kaggle.com/code/averma111/stacking-crabage-s3e16?scriptVersionId=131631721" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import xgboost
# Silence every warning category so cell output stays compact.
warnings.filterwarnings('ignore')
# Single seed value; used here to seed NumPy's global random state.
seed = 42
np.random.seed(seed)
# Apply seaborn's "darkgrid" theme to subsequent plots.
sns.set_style("darkgrid")
# Turn off pandas' chained-assignment warning.
pd.set_option('mode.chained_assignment',None)

## Data Acquisition

In [None]:
class Acquisition:
    """Load CSV files from one directory and describe the resulting frames."""

    def __init__(self, root_path):
        """Store the directory that file names are resolved against.

        Args:
            root_path: Directory containing the CSV files.
        """
        self.root_path = root_path

    def get_dataframe(self, filename):
        """Read ``filename`` from ``root_path`` and return it as a DataFrame.

        Args:
            filename: File name relative to ``root_path``.
        """
        return pd.read_csv(os.path.join(self.root_path, filename))

    def summary(self, text, df):
        """Build a per-column overview of ``df``.

        Args:
            text: Label for the frame. Currently unused; kept so existing
                calls keep working.
            df: Frame to describe.

        Returns:
            DataFrame indexed by the columns of ``df`` with the columns
            ``dtypes``, ``null``, ``unique``, ``min``, ``median``, ``max``,
            ``mean``, ``std`` and ``duplicate``. ``duplicate`` is the number
            of duplicated rows in the whole frame, repeated on every row.
        """
        summary = pd.DataFrame(df.dtypes, columns=['dtypes'])
        summary['null'] = df.isnull().sum()
        summary['unique'] = df.nunique()
        summary['min'] = df.min()
        # median/mean/std are undefined for object columns and raise
        # TypeError on pandas >= 2.0 unless restricted to numeric columns.
        # Index alignment leaves NaN for the non-numeric rows.
        summary['median'] = df.median(numeric_only=True)
        summary['max'] = df.max()
        summary['mean'] = df.mean(numeric_only=True)
        summary['std'] = df.std(numeric_only=True)
        summary['duplicate'] = df.duplicated().sum()
        return summary

    def get_dataset_info(self, df):
        """Print ``df.info()`` and return its result.

        ``DataFrame.info`` writes its report to stdout and returns ``None``,
        so the return value carries no information.
        """
        return df.info()
    
    
    
# Loader rooted at the competition input directory (train.csv / test.csv are read through it).
acq = Acquisition(root_path='/kaggle/input/playground-series-s3e16')
# Loader rooted at the directory that holds CrabAgePrediction.csv.
acq_original = Acquisition(root_path='/kaggle/input/crab-age-prediction')

## Train Dataset

In [None]:
# Read the training split and promote its `id` column to the index in one step.
train = acq.get_dataframe(filename='train.csv').set_index('id')
# Preview the first rows.
train.head()

## Original Dataset

In [None]:
# Read the original crab-age CSV; no column is promoted to the index here.
original = acq_original.get_dataframe(filename='CrabAgePrediction.csv')
# Preview the first rows.
original.head()

## Summary of Datasets

In [None]:
# Per-column overview of the training frame (first argument is only a label).
acq.summary('train',train)

In [None]:
# Per-column overview of the original frame (first argument is only a label).
acq.summary('original',original)

In [None]:
# Stack the competition rows and the original rows into one training frame.
# NOTE(review): `train` is indexed by `id` while `original` keeps its default
# index, so index labels in `fulldata` may repeat — confirm nothing relies on
# label-based row lookup.
fulldata = pd.concat([train,original])
# Per-column overview of the combined frame.
acq.summary('fulldata',fulldata)

## Information about Fulldata


In [None]:
# DataFrame.info() prints its own report and returns None, so interpolating
# the call into the message rendered "...:None". Print the report first,
# then report the actual number of rows.
acq.get_dataset_info(fulldata)
print(f'The count of fulldataset is:{len(fulldata)}')

## Segment Features and Labels

In [None]:
class Get_features_label:
    """Separate a frame into predictor columns and the ``Age`` target."""

    def get_features_label(self,df):
        """Return ``(features, label)`` for ``df``.

        ``features`` holds every column whose name is not ``'Age'``;
        ``label`` is the ``'Age'`` column itself.
        """
        # Boolean mask over the column labels: True for everything but the target.
        not_target = df.columns != 'Age'
        features = df.loc[:, not_target]
        label = df['Age']
        return features,label
    
    
# Split the combined frame into model inputs (`feature`) and the `Age` target (`label`).
feat_label = Get_features_label()
feature,label=feat_label.get_features_label(fulldata)

## Segment Categorical and Numerical Columns

In [None]:
class Column_selector:
    """Build dtype-based column selectors for use in a ColumnTransformer."""

    def make_selector(self,feature):
        """Return ``(cat_selector, num_selector)``.

        Args:
            feature: Feature frame. Currently unused — the selectors are
                callables that inspect whatever frame they are later applied
                to; the parameter is kept for backward compatibility.

        Returns:
            A selector for ``object``-dtype columns and a selector for
            float-dtype columns, in that order.
        """
        cat_selector = make_column_selector(dtype_include=object)
        # `np.float` was only an alias of the builtin `float`; it was
        # deprecated in NumPy 1.20 and removed in 1.24, where accessing it
        # raises AttributeError. The builtin selects the same columns.
        num_selector = make_column_selector(dtype_include=float)
        return cat_selector,num_selector


# Create the categorical / numerical column selectors shared by both preprocessors.
cs = Column_selector()
cat_selector,num_selector = cs.make_selector(feature)

## Columnar Transformation

In [None]:
# Preprocessing for the tree-based models.
# Categorical columns: integer codes, with -1 reserved for categories not
# seen during fit and -2 for missing values.
cat_tree_processor = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    encoded_missing_value=-2,
)
# Numerical columns: mean imputation, with missing-value indicators requested.
num_tree_processor = SimpleImputer(strategy="mean", add_indicator=True)

# Route each column group to its processor using the dtype selectors.
tree_preprocessor = make_column_transformer(
    (num_tree_processor, num_selector), (cat_tree_processor, cat_selector)
)
# Bare expression: displays the transformer in the notebook.
tree_preprocessor

In [None]:
# Preprocessing for the linear models.
# Categorical columns: one-hot encoding; categories unseen at fit time are ignored.
cat_linear_processor = OneHotEncoder(handle_unknown="ignore")
# Numerical columns: standardise first, then mean-impute with missing-value indicators.
num_linear_processor = make_pipeline(
    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True)
)

# Route each column group to its processor using the dtype selectors.
linear_preprocessor = make_column_transformer(
    (num_linear_processor, num_selector), (cat_linear_processor, cat_selector)
)
# Bare expression: displays the transformer in the notebook.
linear_preprocessor

## Model Selections

In [None]:
from sklearn.linear_model import LassoCV

# Linear preprocessing followed by LassoCV with default settings.
lasso_pipeline = make_pipeline(linear_preprocessor, LassoCV())
lasso_pipeline

In [51]:
from sklearn.linear_model import LinearRegression

# Linear preprocessing followed by ordinary least squares.
linear_pipeline = make_pipeline(linear_preprocessor, LinearRegression())
linear_pipeline

In [53]:
from sklearn.linear_model import Ridge
# Linear preprocessing followed by Ridge regression with alpha fixed at 0.5.
ridge_pipeline = make_pipeline(linear_preprocessor, Ridge(alpha=.5))
ridge_pipeline


In [54]:
from sklearn.linear_model import BayesianRidge
# Linear preprocessing followed by BayesianRidge with default settings.
bayesridge_pipeline = make_pipeline(linear_preprocessor, BayesianRidge())
bayesridge_pipeline

In [None]:
from  xgboost import XGBRegressor
# XGBoost regressor with default settings.
# NOTE(review): this pipeline uses `linear_preprocessor` (scaling + one-hot),
# whereas the other tree ensembles use `tree_preprocessor` — confirm intended.
xgb_pipeline = make_pipeline(linear_preprocessor, XGBRegressor())
xgb_pipeline

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Tree preprocessing followed by a random forest with a fixed random_state.
rf_pipeline = make_pipeline(tree_preprocessor, RandomForestRegressor(random_state=42))
rf_pipeline

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Tree preprocessing followed by histogram-based gradient boosting with a fixed random_state.
gbdt_pipeline = make_pipeline(
    tree_preprocessor, HistGradientBoostingRegressor(random_state=0)
)
gbdt_pipeline

## Stacking Estimators

In [None]:
from sklearn.ensemble import StackingRegressor
# NOTE(review): RidgeCV is imported but not used in this cell; the final
# estimator below is LinearRegression, imported in an earlier cell.
from sklearn.linear_model import RidgeCV

# (display name, pipeline) pairs for every base model fed to the stack.
estimators = [
    ("Random Forest", rf_pipeline),
    ("Lasso", lasso_pipeline),
    ("Gradient Boosting", gbdt_pipeline),
    ('Xtreame Gradient Boosting',xgb_pipeline),
    ('Linear',linear_pipeline),
    ('Ridge',ridge_pipeline),
    ('BayesRidge',bayesridge_pipeline)
]

# Combine the base models' predictions with a linear-regression meta-model.
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
stacking_regressor

## Plotting Best Estimators

In [None]:
import time
import matplotlib.pyplot as plt
from sklearn.metrics import PredictionErrorDisplay
from sklearn.model_selection import cross_validate, cross_val_predict

# Every base estimator plus the stacked model gets its own panel. The grid is
# sized from the number of models: with a fixed 2x2 grid, zip() stopped after
# the first four entries and the stacking regressor was never evaluated or drawn.
# This also means all models are now cross-validated, so the cell runs longer.
models = estimators + [("Stacking Regressor", stacking_regressor)]
n_cols = 2
n_rows = int(np.ceil(len(models) / n_cols))
# 3.5 inches per row keeps the original 9x7 proportions for a two-row grid.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(9, 3.5 * n_rows))
axs = np.ravel(axs)

# Legend label -> scikit-learn scoring name (loop-invariant, so built once).
scorers = {"R2": "r2", "MAE": "neg_mean_absolute_error"}

for ax, (name, est) in zip(axs, models):
    # Time only the scoring run; the prediction run below is not included.
    start_time = time.time()
    cv_results = cross_validate(
        est, feature, label, scoring=list(scorers.values()), n_jobs=-1, verbose=0
    )
    elapsed_time = time.time() - start_time

    # Out-of-fold predictions used for the actual-vs-predicted scatter.
    y_pred = cross_val_predict(est, feature, label, n_jobs=-1, verbose=0)

    # "mean +- std" text per metric; abs() turns the negated MAE positive.
    scores = {
        key: (
            f"{np.abs(np.mean(cv_results[f'test_{value}'])):.2f} +- "
            f"{np.std(cv_results[f'test_{value}']):.2f}"
        )
        for key, value in scorers.items()
    }

    display = PredictionErrorDisplay.from_predictions(
        y_true=label,
        y_pred=y_pred,
        kind="actual_vs_predicted",
        ax=ax,
        scatter_kwargs={"alpha": 0.2, "color": "tab:blue"},
        line_kwargs={"color": "tab:red"},
    )
    ax.set_title(f"{name}\nEvaluation in {elapsed_time:.2f} seconds")

    # Invisible artists whose only purpose is to carry the score text into
    # the legend. A separate loop variable avoids rebinding the model `name`.
    for metric, score in scores.items():
        ax.plot([], [], " ", label=f"{metric}: {score}")
    ax.legend(loc="upper left")

# Hide any leftover axes when the model count does not fill the grid.
for unused_ax in axs[len(models):]:
    unused_ax.set_visible(False)

plt.suptitle("Single predictors versus stacked predictors")
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.show()

## Predictions on Test Data

In [None]:
# Read the competition test split; `id` stays a regular column here.
test = acq.get_dataframe(filename='test.csv')
test.head()

In [None]:
# Fit the stacked model on all available features and labels.
stacking_regressor.fit(feature,label)

In [None]:
# NOTE(review): this re-reads the same test.csv already loaded in an earlier
# cell; the reload looks redundant — confirm before removing.
test = acq.get_dataframe(filename='test.csv')
test.head()

## Submission for Competition

In [None]:
# Predict the age for every test row.
# NOTE(review): `test` still contains the `id` column, whereas the training
# features had `id` moved to the index — confirm the fitted preprocessors
# ignore the extra column.
yhat_age = stacking_regressor.predict(test)
# Two-column submission frame: the test ids and the predicted ages.
df_submit = pd.DataFrame(data={'id': test['id'],'Age': yhat_age})
# Write without the DataFrame index so the file has only `id` and `Age`.
df_submit.to_csv('submission.csv', index=False)
print('Submission Completed!!')