# 🦀 **<span style="color: DarkGreen; font-family:Times New Roman;">Crab Age</span>**

---


* * *
# 📖 <span style="color: DarkGreen; font-family:Times New Roman;">INTRODUCTION</span> <a name="introduction"></a>
---


* * *
# 🛠️ <span style="color: DarkGreen; font-family:Times New Roman;">Import libraries and read the data</span> <a name="iaiil"></a>
---


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Load the train and test CSV files, using each file's "id" column as the row index.
train = pd.read_csv("train.csv",index_col = "id")
test = pd.read_csv("test.csv",index_col = "id")

In [None]:
# Set to False to skip the model-comparison cell and go straight to the final fit.
model_selection = True

# Seed passed as random_state to the RandomForest and XGBoost estimators.
RS = 13

# Controls how many random train/test splits the model-selection loop evaluates;
# also echoed in that cell's summary lines.
folds = 5


* * *
# 👁️ <span style="color: DarkGreen; font-family:Times New Roman;">Overview</span> <a name="overview"></a>
---


In [None]:
# Dimensions of the training frame as (rows, columns).
train.shape

In [None]:
# First rows of the training data.
train.head()

In [None]:
# Distinct values present in the Sex column.
train.Sex.unique()

In [None]:
# Last rows of the training data.
train.tail()

In [None]:
# Per-column non-null counts and dtypes, plus overall memory usage.
train.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
train.describe()

In [None]:
# Data type of each column.
train.dtypes

In [None]:
# One-hot encode the non-numeric columns of the training data; drop_first
# removes one level per encoded column so the dummies are not collinear.
train = pd.get_dummies(train, drop_first = True)

# Encode test with every level kept, then align it to the training feature
# columns (everything except the "Age" target). Encoding the two frames
# independently with drop_first would drop a different baseline level, or
# produce different columns, whenever a category appears in only one of them.
# After alignment test has exactly the training features, in the same order;
# dummy columns absent from test are filled with 0.
test = pd.get_dummies(test).reindex(columns = train.drop(columns = "Age").columns, fill_value = 0)

* * *
# 🔎 <span style="color: DarkGreen; font-family:Times New Roman;">EDA</span> <a name="eda"></a>
---


* * *
# 🔎 <span style="color: DarkGreen; font-family:Times New Roman;">Univariate Analysis</span> <a name="univariate-analysis"></a>
---


In [None]:
# Histogram grid for the training frame, drawn on a 20x20-inch figure.
train.hist(figsize = (20,20))
plt.show()

In [None]:
# Same histogram grid for the test frame, to compare against the training distributions.
test.hist(figsize = (20,20))
plt.show()

* * *
# 🔎 <span style="color: DarkGreen; font-family:Times New Roman;">Correlation Analysis</span> <a name="corr"></a>
---


In [None]:
# Pairwise correlation matrix of the training columns.
train.corr()

In [None]:
# Compute the correlation matrix once and reuse it for the plot and the mask.
train_corr = train.corr()

# Hide the upper triangle (diagonal included) by position so each pair is shown
# once. Building the mask from the correlation values themselves would leave a
# cell unmasked whenever its coefficient is exactly 0.
train_corr_mask = np.triu(np.ones_like(train_corr, dtype = bool))

fig, axes = plt.subplots(figsize=(20, 10))
sns.heatmap(
    train_corr,
    cmap = sns.cubehelix_palette(as_cmap=True),
    mask = train_corr_mask,
    linewidths = .5,
    cbar_kws = {"shrink": .5},
    annot = True,
    ax = axes,
)
axes.set_title("Train: feature correlations")
plt.show()

In [None]:
# Correlation of every column with Age, sorted from highest to lowest.
train.corr()["Age"].sort_values(ascending = False)

In [None]:
# Pairwise correlation matrix of the test columns.
test.corr()

In [None]:
# Compute the correlation matrix once and reuse it for the plot and the mask.
test_corr = test.corr()

# Hide the upper triangle (diagonal included) by position so each pair is shown
# once. Building the mask from the correlation values themselves would leave a
# cell unmasked whenever its coefficient is exactly 0.
test_corr_mask = np.triu(np.ones_like(test_corr, dtype = bool))

fig, axes = plt.subplots(figsize=(20, 10))
sns.heatmap(
    test_corr,
    cmap = sns.cubehelix_palette(as_cmap=True),
    mask = test_corr_mask,
    linewidths = .5,
    cbar_kws = {"shrink": .5},
    annot = True,
    ax = axes,
)
axes.set_title("Test: feature correlations")
plt.show()

* * *
# 🔎 <span style="color: DarkGreen; font-family:Times New Roman;">Feature Interactions</span> <a name="feature-interactions"></a>
---


In [None]:
# Grid of pairwise plots between the training columns.
# NOTE(review): pairplot draws one panel per column pair, so this cell can be slow on a large frame.
sns.pairplot(train)
plt.show()

In [None]:
# Same pairwise grid for the test columns.
# NOTE(review): pairplot draws one panel per column pair, so this cell can be slow on a large frame.
sns.pairplot(test)
plt.show()

* * *
# ⚙️ <span style="color: DarkGreen; font-family:Times New Roman;">Feature Engineering</span> <a name="feature-engineering"></a>
---


* * *
# ⚙️ <span style="color: DarkGreen; font-family:Times New Roman;">Dimensionality Reduction</span> <a name="dimensionality-reduction"></a>
---


* * *
# ⚙️ <span style="color: DarkGreen; font-family:Times New Roman;">Feature Scaling (Min-Max)</span> <a name="standardization"></a>
---


In [None]:
def scaling(feature):
    """Min-max scale one column of the global ``X_train`` and ``X_test`` in place.

    The scaler is fitted on the training column only; the fitted transform is
    then applied to the test column, so no test-set statistics are used.

    Parameters
    ----------
    feature : str
        Label of a column present in both ``X_train`` and ``X_test``.
    """
    global X_train, X_test
    scaler = MinMaxScaler()

    # MinMaxScaler expects 2-D input, hence the (-1, 1) reshape of each column.
    train_column = X_train[feature].to_numpy().reshape(-1, 1)
    test_column = X_test[feature].to_numpy().reshape(-1, 1)

    # Flatten back to 1-D before writing the scaled values into the frames.
    X_train[feature] = scaler.fit_transform(train_column).ravel()
    X_test[feature] = scaler.transform(test_column).ravel()

In [None]:
# Columns that are passed one at a time to scaling() before a model is fitted.
scale_needed_features = [ "Weight", "Shucked Weight", "Viscera Weight", "Shell Weight"]

* * *
# 🧑‍🔬 <span style="color: DarkGreen; font-family:Times New Roman;">Model Selection</span> <a name="model-selection"></a>
---


In [None]:
# Compare three regressors by mean absolute error over repeated random splits.
if model_selection:
    X = train.drop(columns = "Age")
    y = train[["Age"]]
    list_mae_rfr = []
    list_mae_lr = []
    list_mae_xgb = []

    # Evaluate exactly `folds` random 67/33 splits (seeds 1..folds) so the number
    # of runs matches the "{folds}-FOLD" label printed below. The previous
    # range(1, folds) stopped one split short.
    for i in range(1, folds + 1):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = i)

        # scaling() assigns columns of the global X_train / X_test in place;
        # copy first so those writes land on independent frames.
        X_train, X_test = X_train.copy(), X_test.copy()

        for feature in scale_needed_features:
            scaling(feature)

        # Give every estimator the same 1-D target.
        y_train_1d = y_train.values.ravel()

        # Random Forest
        # criterion="absolute_error" matches the MAE metric; scikit-learn documents
        # it as significantly slower to train than the default squared error.
        rfr = RandomForestRegressor(random_state = RS, criterion = "absolute_error")
        rfr.fit(X_train, y_train_1d)
        rfr_prediction = rfr.predict(X_test)
        mae_rfr = mean_absolute_error(y_test, rfr_prediction)
        list_mae_rfr.append(mae_rfr)

        # Linear Regression
        lr = LinearRegression()
        lr.fit(X_train, y_train_1d)
        lr_prediction = lr.predict(X_test)
        mae_lr = mean_absolute_error(y_test, lr_prediction)
        list_mae_lr.append(mae_lr)

        # XGBoost
        # NOTE(review): evaluated with library-default depth / estimator count, while
        # the final-evaluation cell sets max_depth = 3, n_estimators = 100 - confirm
        # the score reported here is meant to describe that final configuration.
        xgb = XGBRegressor(random_state = RS, eval_metric = "mae")
        xgb.fit(X_train, y_train_1d)
        xgb_prediction = xgb.predict(X_test)
        mae_xgb = mean_absolute_error(y_test, xgb_prediction)
        list_mae_xgb.append(mae_xgb)

    print(f"Mean RFR {folds}-FOLD: {np.mean(list_mae_rfr)}")

    print(f"Mean LR {folds}-FOLD: {np.mean(list_mae_lr)}")

    print(f"Mean XGB {folds}-FOLD: {np.mean(list_mae_xgb)}")


* * *
# 🧫 <span style="color: DarkGreen; font-family:Times New Roman;">Final Evaluation</span> <a name="final-evaluation"></a>
---


In [None]:
# Fit the final XGBoost model on the full training set and predict the test set.
y_train = train[["Age"]]
X_train = train.drop(columns = "Age")
X_test = test.copy()

# scaling() rescales the named column of the global X_train / X_test in place.
for column in scale_needed_features:
    scaling(column)

xgb_final = XGBRegressor(
    n_estimators = 100,
    max_depth = 3,
    eval_metric = "mae",
    random_state = RS,
)
xgb_final.fit(X_train, y_train)
xgb_final_prediction = xgb_final.predict(X_test)

* * *
# 📋 <span style="color: DarkGreen; font-family:Times New Roman;">Result</span> <a name="result"></a>
---


In [None]:
# Single-column frame of predicted ages, indexed like the test set.
result = pd.DataFrame({"Age" : xgb_final_prediction})
result.index = X_test.index

In [None]:
# Display the prediction frame: test-set index, with predicted Age as the only column.
result

In [None]:
# Write the predictions to output.csv in the working directory.
result.to_csv("output.csv")

In [None]:
# Author: amyrmahdy
# Date: 11 June 2023

* * *
# 📕 <span style="color: DarkGreen; font-family:Times New Roman;">CONCLUSION</span>  <a name="conclusion"></a>
---
