In [None]:
#Lets install the required libraries

!pip3 install xgboost

In [None]:
!pip install catboost

In [None]:
!pip install lightgbm

In [None]:
# All prerequisites libraries :

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.metrics import make_scorer

# for Pre-process the data
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer 

# Models to train and check
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor , GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

#to export a final model for web implementation
import joblib
import matplotlib.pyplot as plt

# to ignore the warnings 
from warnings import filterwarnings
filterwarnings(action='ignore')

In [None]:

# Load the train and test CSV files from the Dataset folder one level above
# the notebook's working directory.

train_df = pd.read_csv('../Dataset/train.csv')
test_df = pd.read_csv('../Dataset/test.csv')

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

---

# Let's analyse the features:

1. rings - this is our target variable that we are going to predict (discrete variable)
2. sex - male, female, infant (categorical variable)
3. length - represents the longest measurement of the shell in mm (continuous variable)
4. diameter - represents the diameter of the shell in mm, measured perpendicular to the length (continuous variable)
5. height - represents the height of the shell in mm (continuous variable)
6. whole weight - represents the whole weight of the abalone in grams (continuous variable)
7. whole weight.1 - represents the shucked weight (only the meat) in grams (continuous variable)
8. whole weight.2 - represents the viscera weight after bleeding in grams (continuous variable)
9. shell weight - represents the shell weight after being dried in grams (continuous variable)

---

![image.png](attachment:image.png)

In [None]:
# Exploratory data analysis: dataset shape, summary statistics and correlations

In [None]:
# (rows, columns) of the training data.
train_df.shape

In [None]:
# (rows, columns) of the test data.
test_df.shape

In [None]:
# Summary statistics; include='all' also reports on non-numeric columns.
train_df.describe(include='all')

In [None]:
# Pairwise correlation of the numeric columns.
# 'Sex' is only label-encoded further down, so non-numeric columns are excluded
# explicitly here; pandas >= 2.0 otherwise raises when it meets string values.
train_df.corr(numeric_only=True)

In [None]:
# Mean of every numeric feature per 'Rings' value.
# numeric_only=True skips the not-yet-encoded 'Sex' column, which cannot be averaged.
train_df.groupby('Rings').mean(numeric_only=True)

In [None]:
# Data analysis: histograms of the columns, 50 bins each, on a 20x20 inch figure.

train_df.hist(figsize=(20,20),bins=50)
plt.show()

In [None]:
def show_scatter(ax, df, f1, f2):
    """Scatter column ``f1`` (x-axis) against column ``f2`` (y-axis) on ``ax``.

    The points are drawn in red with marker size 1, and both axes are
    labelled with their column names.
    """
    x_values, y_values = df[f1], df[f2]
    ax.scatter(x_values, y_values, s=1, color='red')
    ax.set_xlabel(f1)
    ax.set_ylabel(f2)

# Every column except the first and the last one
# (presumably 'id' and the 'Rings' target — confirm against train_df.columns).
features = train_df.columns[1:-1]

# Create a figure and a set of subplots (3 x 3 grid, i.e. room for 9 features)
fig, axs = plt.subplots(3, 3, figsize=(15, 15))

# Flatten the axs array for easy iteration
axs = axs.flatten()

# Plot each feature against 'Rings'
for i, feature in enumerate(features):
    show_scatter(axs[i], train_df, feature, 'Rings')

# Hide any unused subplots (everything after the last axis used above;
# relies on `i` still holding the final loop index)
for j in range(i + 1, len(axs)):
    fig.delaxes(axs[j])

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
# numeric_only=True keeps the still string-valued 'Sex' column out of the
# computation; pandas >= 2.0 raises on non-numeric columns otherwise.
corr = train_df.corr(numeric_only=True)
sns.heatmap(corr, annot=True)

In [None]:
# Distribution of 'Rings' for each 'Sex' category.
sns.violinplot(x='Sex', y='Rings', data=train_df)

In [None]:
# Number of samples per 'Rings' value.
# The column is passed by keyword: a positional Series is interpreted as `data`
# by current seaborn releases and is not counted per ring value.
sns.countplot(x='Rings', data=train_df)
plt.show()

In [None]:
# Count the missing values in each column of the training data.

train_df.isnull().sum()

In [None]:
# Check for duplicated rows: `duplicates` is a boolean mask over the rows,
# and summing it counts the rows flagged as duplicates.

duplicates = train_df.duplicated()
num_duplicates = duplicates.sum()

print("Number of duplicated rows : " , num_duplicates)

In [None]:
# Visualize the distribution of the target variable ('Rings') as a bar chart
# of how often each value occurs.

train_df['Rings'].value_counts().plot(kind='bar')

In [None]:
# Encode the categorical 'Sex' feature as integer codes.
# The encoder is fitted on the training data only; the test data is transformed
# with that same fitted mapping so both frames share identical codes.
# Note: this overwrites the 'Sex' column of both frames in place.

le = LabelEncoder()
train_df['Sex'] = le.fit_transform( train_df['Sex'])
test_df['Sex'] = le.transform( test_df['Sex'])

train_df.head()

In [None]:
# Split the training frame into features (X) and target (y).
# 'id' is dropped from the features; 'Rings' is the target column.

X = train_df.drop(['id','Rings'], axis=1)
y = train_df['Rings']

# The test frame gets the same treatment: 'id' is dropped (it is used again
# later when the submission file is built).
x_test = test_df.drop('id' , axis=1 )

In [None]:
# First two rows of the feature matrix.
X.head(2)

In [None]:
# First two target values.
y.head(2)

In [None]:
# First two rows of the test features.
x_test.head(2)

In [None]:
# Split the data into train and validation sets: 20% of the rows are held out
# for validation, and random_state=4 fixes the shuffle so the split is repeatable.

x_train , x_val , y_train , y_val = train_test_split( X , y , test_size=0.2 , random_state=4 )

In [None]:
# (rows, columns) of the training split.
x_train.shape

In [None]:
# (rows, columns) of the validation split.
x_val.shape

In [None]:
# First two rows of the validation features.
x_val.head(2)

In [None]:
# Evaluation metric: root mean squared logarithmic error (RMSLE).

def rmsle_score( y_true , y_pred ):
    """Return the root mean squared logarithmic error of ``y_pred`` against ``y_true``.

    Both inputs are converted to float NumPy arrays first, so values are
    compared position by position. Subtracting two pandas Series directly
    would align them on their index labels instead and produce NaN for every
    label that is not present in both.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``. Negative inputs
        are clipped to 0 beforehand so that ``log1p`` is always defined.
    """
    # Clip to zero: log1p is undefined for values <= -1.
    true_values = np.maximum(np.asarray(y_true, dtype=float), 0)
    pred_values = np.maximum(np.asarray(y_pred, dtype=float), 0)

    squared_log_error = (np.log1p(pred_values) - np.log1p(true_values)) ** 2
    return np.sqrt(np.mean(squared_log_error))


In [None]:
# Wrap rmsle_score in a scorer object that scikit-learn utilities accept through
# their `scoring` argument.
# greater_is_better=False marks the metric as a loss (lower is better); the
# scorer then reports the negated value so that "higher is better" still holds.

rmsle_scorer = make_scorer( rmsle_score , greater_is_better=False )

In [None]:
# Candidate regressors to compare. All use their library defaults except where
# arguments are given; only the random forest has a fixed random_state.

models = [
    LinearRegression() ,
    DecisionTreeRegressor() ,
    RandomForestRegressor( n_estimators=100 , random_state=42 ) ,
    GradientBoostingRegressor() ,
    SVR() ,
    XGBRegressor() ,
    # NOTE(review): verbose=-1 looks like the LightGBM way of silencing output;
    # confirm that CatBoost accepts a negative value here.
    CatBoostRegressor( verbose=-1 ) ,
    LGBMRegressor()
]

In [None]:
# Track the best model and its RMSLE score.
# The score starts at +infinity so that the first evaluated model always wins.

best_model = None
best_model_name = ""
best_model_rmsle = float('inf')

# Evaluate models: fit each one on the training split and score it on the
# validation split.

for mod in models :
    
    # train model :
    mod.fit(x_train, y_train)
    
    # Prediction :
    y_pred = mod.predict(x_val)
    
    # calc RMSLE score :
    rmsle = rmsle_score( y_val , y_pred )
    
    print(f'{mod.__class__.__name__} : RMSLE = {rmsle}')
    
    # update best model if current model has lower RMSLE
    # (strict comparison: on a tie the earlier model in the list is kept)
    if rmsle < best_model_rmsle :
        best_model = mod
        best_model_name = mod.__class__.__name__
        best_model_rmsle = rmsle
        
print(f'\nThe best Model is {best_model_name} and its RMSLE score : {best_model_rmsle}\n')


In [None]:
# Hyper-parameter search for CatBoost: every combination of the values below
# is evaluated with 3-fold cross-validation.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
    'iterations': [100, 200, 300]
}

# A fresh, silent CatBoostRegressor is used as the estimator; the bare name
# `catboost` was never defined in this notebook (only CatBoostRegressor is imported).
# NOTE(review): the search optimises mean squared error while the model
# comparison uses RMSLE; `rmsle_scorer` could be passed as `scoring` instead.
grid_search = GridSearchCV(
        estimator=CatBoostRegressor( verbose=0 ),
        param_grid=param_grid,
        cv=3,
        scoring='neg_mean_squared_error'
)

# Perform grid search on the training split (the variable is `x_train`, lower-case x)
grid_search.fit(x_train, y_train)
grid_search.best_params_

In [None]:
# The best model has been chosen, so refit it on the complete labelled data
# (X, y), i.e. the training and validation rows together.

best_model.fit( X,y )

In [None]:
# Score of the refitted model on the validation rows.
# NOTE(review): best_model was just refit on X, which contains x_val, so this
# is measured on rows the model has already seen and will look optimistic.
# NOTE(review): for scikit-learn style regressors `.score` is R^2, not the
# RMSLE used for model selection above — confirm for the selected model.

best_model.score(x_val , y_val )

In [None]:
# Save the best model with joblib to a file named 'best_model'
# in the current working directory.

joblib.dump( best_model , 'best_model')

In [None]:
# Prediction: run the refitted best model on the test features.

predictions = best_model.predict(x_test)

In [None]:
# Display the predicted values for the test rows.
predictions

In [None]:
# Prepare the submission file: pair each test 'id' with its predicted 'Rings'
# value and write the result to 'submission.csv' without the dataframe index.

submission_df = pd.DataFrame( {'id' : test_df['id'] , 'Rings': predictions} )
submission_df.to_csv('submission.csv' , index=False)

In [None]:
# Preview the first rows of the submission.
submission_df.head()

In [None]:
# Column names of the training frame (reference for the single-prediction input).
train_df.columns

In [None]:
# Single prediction with the persisted model.
# Load the saved model from joblib (joblib is already imported at the top).

best_model = joblib.load('best_model')

# Prepare a one-row dataframe with the same feature columns used for training.
# The 'Sex' value must be a label the training encoder has seen (`le.classes_`).
# The first known class is used as the example: it is encoded as 0, the same
# code the previous, freshly fitted encoder produced for its single value.

data = {
    'Sex' : [le.classes_[0]] ,
    'Length' : [0.455] ,
    'Diameter' : [0.365] ,
    'Height': [0.095] ,
    'Whole weight' : [0.514] ,
    'Whole weight.1' : [0.2245] ,
    'Whole weight.2' : [0.101] ,
    'Shell weight' : [0.15] ,
}

predict_df = pd.DataFrame(data)

# Encode with the encoder that was fitted on the training data. Fitting a new
# LabelEncoder on this single row would map any value to 0, whatever category
# it stands for, and would also overwrite the fitted `le`.

predict_df['Sex'] = le.transform( predict_df['Sex'])

# Make prediction

prediction = best_model.predict(predict_df)

print("Predicted Rings : ", prediction[0])
