In [None]:
# Import all the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,MinMaxScaler

import warnings 
warnings.filterwarnings("ignore")

In [None]:
# Load the fish-market measurements into a DataFrame.
# NOTE(review): the path is absolute and follows the Kaggle input layout, so the
# notebook only runs unchanged inside a Kaggle kernel with this dataset attached.
dataframe=pd.read_csv("/kaggle/input/fish-market/Fish.csv")
# Preview the first rows to check that the file was parsed as expected.
dataframe.head()

In [None]:
# Shape of the dataset as a (number of rows, number of columns) tuple
dataframe.shape

# Exploratory Data Analysis

In [None]:
# Column names, dtypes and non-null counts of the dataset
dataframe.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dataframe.describe()

In [None]:
# Number of rows that are exact duplicates of an earlier row
dataframe.duplicated().sum()

In [None]:
# Number of missing (NaN) values in each column
dataframe.isna().sum()

In [None]:
# Pairwise correlation of the numeric columns.
# "Species" has not been encoded yet at this point, and pandas >= 2.0 raises on
# non-numeric columns in DataFrame.corr(), so restrict the frame to numeric
# dtypes first (this form also works on older pandas versions).
correlation_matrix = dataframe.select_dtypes(include="number").corr()
correlation_matrix

# Visualize the Correlation Matrix

In [None]:
# Heatmap of the correlation matrix, annotated with values rounded to two decimals.
plt.figure(figsize=(5,3))
sns.heatmap(correlation_matrix, annot=True, cmap="YlGnBu", fmt=".2f")
# plt.show has to be called; a bare `plt.show` only echoes the function object.
plt.show()

Here we see that Length1 and Length2 are very highly correlated with each other: their correlation coefficient is approximately 1. However, they represent different features of the fish: Length1 is the vertical length of the fish (in cm) and Length2 is the diagonal length of the fish (in cm). Each has its own importance, so both are kept.

In [None]:
# Number of distinct values in the Species column
dataframe["Species"].nunique()

In [None]:
# The distinct values themselves, in order of first appearance
dataframe["Species"].unique()

# Convert the Categorical Column into Numerical Form Using Label Encoder

In [None]:
# Replace the species labels with integer codes.
# NOTE(review): this overwrites "Species" in place, so the plots that follow show
# the integer codes rather than the species names; the CSV is read again further
# down, before the modelling steps.
le= LabelEncoder()
dataframe["Species"]=le.fit_transform(dataframe["Species"])

# Univariate Analysis

# Frequency Plot of Species

In [None]:
# Bar chart of the number of rows per (encoded) species, most frequent first.
dataframe["Species"].value_counts().plot(kind="bar", figsize=(5,5), rot=0)
plt.xlabel("Species")
plt.ylabel("Count")
plt.show()

In [None]:
# Distribution of Weight with a kernel density estimate overlaid.
# The seaborn style must be set before the figure and axes are created,
# otherwise it does not apply to this plot.
sns.set(style="darkgrid")
plt.figure(figsize=(5,5))
# With data= given, the column is referenced by name.
sns.histplot(data=dataframe, x="Weight", kde=True)
plt.title("Weight of the fish", fontsize=14)
plt.xlabel("Weight")
plt.ylabel("Count")
plt.show()

In [None]:
# Distribution of Length1 with a kernel density estimate overlaid.
# The seaborn style must be set before the figure and axes are created,
# otherwise it does not apply to this plot.
sns.set(style="darkgrid")
plt.figure(figsize=(5,5))
# With data= given, the column is referenced by name.
sns.histplot(data=dataframe, x="Length1", kde=True)
plt.title("Length1(Vertical) of the fish", fontsize=14)
plt.xlabel("Length1")
plt.ylabel("Count")
plt.show()

In [None]:
# Distribution of Length2 with a kernel density estimate overlaid.
# The seaborn style must be set before the figure and axes are created,
# otherwise it does not apply to this plot.
sns.set(style="darkgrid")
plt.figure(figsize=(5,5))
# With data= given, the column is referenced by name.
sns.histplot(data=dataframe, x="Length2", kde=True)
plt.title("Length2(Diagonal) of the fish", fontsize=14)
plt.xlabel("Length2")
plt.ylabel("Count")
plt.show()

In [None]:
# Distribution of Length3 with a kernel density estimate overlaid.
# The seaborn style must be set before the figure and axes are created,
# otherwise it does not apply to this plot.
sns.set(style="darkgrid")
plt.figure(figsize=(5,5))
# With data= given, the column is referenced by name.
sns.histplot(data=dataframe, x="Length3", kde=True)
plt.title("Length3(CrossLength) of the fish", fontsize=14)
plt.xlabel("Length3")
plt.ylabel("Count")
plt.show()

In [None]:
# Distribution of Height with a kernel density estimate overlaid.
# The seaborn style must be set before the figure and axes are created,
# otherwise it does not apply to this plot.
sns.set(style="darkgrid")
plt.figure(figsize=(5,5))
# With data= given, the column is referenced by name.
sns.histplot(data=dataframe, x="Height", kde=True)
plt.title("Height of the fish", fontsize=14)
plt.xlabel("Height")
plt.ylabel("Count")
plt.show()

In [None]:
# Distribution of Width with a kernel density estimate overlaid.
# The seaborn style must be set before the figure and axes are created,
# otherwise it does not apply to this plot.
sns.set(style="darkgrid")
plt.figure(figsize=(5,5))
# With data= given, the column is referenced by name.
sns.histplot(data=dataframe, x="Width", kde=True)
plt.title("Width of the fish", fontsize=14)
plt.xlabel("Width")
plt.ylabel("Count")
plt.show()

# Let's visualize a box plot, which helps us see the outliers in the dataset

In [None]:
# One box per column, all drawn on a single shared axis, to expose extreme values.
# "Species" is included as well because it was label-encoded to integers above.
dataframe.boxplot(column=dataframe.columns.tolist(), figsize=(20,20), grid=True, rot=45, fontsize=16)
plt.suptitle("Percentile and Median base distribution of all the columns", fontsize=25)
plt.show()

# Read the dataset again

In [None]:
# Reload the raw CSV so that "Species" holds its original labels again
# (the column was label-encoded in place for the exploratory plots).
dataframe=pd.read_csv("/kaggle/input/fish-market/Fish.csv")
dataframe.head()

In [None]:
# Sanity check: (rows, columns) of the freshly reloaded frame
dataframe.shape

# Get the Numerical and Categorical Columns list

In [None]:
def get_numerical_and_categorical_columns(dataframe):
    """Partition the column names of ``dataframe`` by dtype.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    tuple[list, list]
        ``(numerical_cols, categorical_cols)``: the names of the columns for
        which ``pd.api.types.is_numeric_dtype`` is true, followed by the names
        of all remaining columns. Both lists keep the frame's column order.
    """
    # Evaluate the dtype check once per column, keeping the original order.
    dtype_flags = [
        (name, pd.api.types.is_numeric_dtype(dataframe[name]))
        for name in dataframe.columns
    ]
    numerical_cols = [name for name, numeric in dtype_flags if numeric]
    categorical_cols = [name for name, numeric in dtype_flags if not numeric]
    return numerical_cols, categorical_cols

In [None]:
# Column names of the full frame (target included), split into numeric and non-numeric
numerical_cols, categorical_cols= get_numerical_and_categorical_columns(dataframe)

In [None]:
# Names of the numeric columns
numerical_cols

In [None]:
# Names of the non-numeric columns
categorical_cols

# Divide the Dataset into Train and Test Set

In [None]:
def train_test(dataframe, train_fraction=0.7, shuffle=False, random_state=None):
    """Split ``dataframe`` row-wise into a training part and a test part.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to split. It is never modified.
    train_fraction : float, default 0.7
        Fraction of the rows placed in the training part; the row count is
        truncated with ``int``. Must lie in ``[0, 1]``.
    shuffle : bool, default False
        When False the first rows form the training part and the remaining
        rows the test part, in their existing order. If the frame is sorted
        (for example grouped by a category), such a split is not
        representative; pass ``shuffle=True`` to permute the rows first.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` when ``shuffle`` is True.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        ``(dataframe1, dataframe2)``: the training and the test part. Both are
        independent copies, so later in-place edits neither touch the source
        frame nor trigger chained-assignment warnings.

    Raises
    ------
    ValueError
        If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")

    if shuffle:
        # frac=1 returns every row exactly once, in random order.
        dataframe = dataframe.sample(frac=1, random_state=random_state)

    cut = int(train_fraction * len(dataframe))
    dataframe1 = dataframe.iloc[:cut].copy()
    dataframe2 = dataframe.iloc[cut:].copy()
    return dataframe1, dataframe2

In [None]:
# dataframe1 is the training part (first 70 % of the rows), dataframe2 the test part
dataframe1, dataframe2 =train_test(dataframe)

In [None]:
# (rows, columns) of the training part
dataframe1.shape

In [None]:
# (rows, columns) of the test part
dataframe2.shape

# Detect and remove the outliers in the training set using the percentile-based Winsorization_Method

In [None]:
def Winsorization_Method(columns, dataframe, a, b):
    """Remove the rows that fall outside a percentile band in any given column.

    For every column in ``columns`` the ``a``-th and ``b``-th percentiles are
    computed with ``np.percentile``. A row is an outlier when its value in at
    least one of those columns is strictly below the lower percentile or
    strictly above the upper one. Outlier rows are removed (the values are
    not clipped).

    Parameters
    ----------
    columns : list
        Names of the columns to screen.
    dataframe : pandas.DataFrame
        Frame to screen. It is not modified; a filtered copy is returned.
    a : float
        Lower percentile, in the 0-100 range expected by ``np.percentile``.
    b : float
        Upper percentile, in the 0-100 range expected by ``np.percentile``.

    Returns
    -------
    tuple[float, pandas.DataFrame]
        ``(ratio, dataframe)``: the percentage of rows flagged as outliers,
        rounded to two decimals, and a copy of the frame without those rows.
        An empty frame yields ``(0.0, empty copy)``.
    """
    n_rows = len(dataframe)
    if n_rows == 0:
        # Nothing to screen; also avoids a division by zero in the ratio.
        return 0.0, dataframe.copy()

    # Positional boolean mask: True where the row is an outlier in any column.
    is_outlier = np.zeros(n_rows, dtype=bool)
    for col in columns:
        values = dataframe[col].to_numpy()
        lower = np.percentile(values, a)
        upper = np.percentile(values, b)
        is_outlier |= (values < lower) | (values > upper)

    ratio = round(float(is_outlier.sum()) / n_rows * 100, 2)

    # Select the kept rows by position, so a non-unique index cannot cause
    # additional rows to be removed, and return a copy so that the caller's
    # frame is left untouched.
    kept_positions = np.flatnonzero(~is_outlier)
    return ratio, dataframe.iloc[kept_positions].copy()

In [None]:
# Drop training rows lying outside the 1st-99th percentile band of any numeric
# column. numerical_cols was built from the full frame, so the target column is
# screened as well.
# NOTE(review): re-running this cell trims the already trimmed frame again.
ratio_of_outliers, dataframe1 = Winsorization_Method(
    list(numerical_cols), dataframe1, a=1, b=99
)

In [None]:
# Percentage of training rows that were flagged as outliers
ratio_of_outliers

In [None]:
# (rows, columns) of the training part after outlier removal
dataframe1.shape

# Data Preprocessing

In [None]:
# Separate the "Weight" target from the predictors in both partitions.
y_train = dataframe1["Weight"]
x_train = dataframe1.drop(columns="Weight")
y_test = dataframe2["Weight"]
x_test = dataframe2.drop(columns="Weight")

# Shapes of the four pieces, as a quick consistency check.
x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
# Column names of the predictors only (target already removed), split by dtype
num_cols, cat_cols=get_numerical_and_categorical_columns(x_train)
num_cols

In [None]:
# Names of the non-numeric predictor columns (to be one-hot encoded)
cat_cols

# X_train Encoding

In [None]:
# Fit the one-hot encoder on the training categories only. Categories that are
# first met at transform time are not an error (handle_unknown="ignore").
ohe = OneHotEncoder(handle_unknown="ignore")

# Dense indicator matrix for the training rows.
x_train_ohe = ohe.fit_transform(x_train[cat_cols]).toarray()

# Wrap the matrix in a frame that carries the encoder's feature names and the
# row index of x_train, so that the two frames align on concatenation.
x_train_ohe_df = pd.DataFrame(
    x_train_ohe,
    columns=ohe.get_feature_names_out(list(cat_cols)),
    index=x_train.index,
)

# Swap the original categorical columns for their indicator columns.
x_train = pd.concat([x_train.drop(columns=cat_cols), x_train_ohe_df], axis=1)

# Checking result
x_train.head()

# X_test Encoding

In [None]:
# Encode the test rows with the encoder that was fitted on the training rows.
x_test_ohe = ohe.transform(x_test[cat_cols]).toarray()

# Wrap the matrix in a frame that carries the encoder's feature names and the
# row index of x_test, so that the two frames align on concatenation.
x_test_ohe_df = pd.DataFrame(
    x_test_ohe,
    columns=ohe.get_feature_names_out(list(cat_cols)),
    index=x_test.index,
)

# Swap the original categorical columns for their indicator columns.
x_test = pd.concat([x_test.drop(columns=cat_cols), x_test_ohe_df], axis=1)

# Checking result
x_test.head()

In [None]:
#Data Preprocessing (--normalise the values of dataset)
min_max= MinMaxScaler()
x_train[num_cols]= min_max.fit_transform(x_train[num_cols])  
x_test[num_cols]=min_max.transform(x_test[num_cols])

In [None]:
# Preview the encoded and scaled training predictors
x_train.head()

# Linear Regression Model

In [None]:
# Baseline model: linear regression fitted on the training split,
# with predictions made for the held-out test split.
reg= LinearRegression()
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)

# Report RMSE (square root of the mean squared error) and R^2 on the test split.
print (f"model : {reg} and  rmse score is : {np.sqrt(mean_squared_error(y_test, y_pred))}, r2 score is {r2_score(y_test, y_pred)}")

# RandomForestRegressor Model

In [None]:
# Random forest with default hyper-parameters, fitted on the training split.
# A fixed random_state makes the bootstrap samples and feature draws repeatable,
# so the reported scores are the same on every run of the notebook.
rf= RandomForestRegressor(random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

# Report RMSE (square root of the mean squared error) and R^2 on the test split.
print (f"model : {rf} and  rmse score is : {np.sqrt(mean_squared_error(y_test, y_pred))}, r2 score is {r2_score(y_test, y_pred)}")

# Grid Search CV

In [None]:
# Containers for the test-set metrics of the tuned model.
rmse=[]
r2=[]
# NOTE(review): every hyper-parameter lists a single value, so the grid holds
# exactly one candidate; the search only cross-validates that one configuration.
parameter_list= {'n_estimators':[100], 'max_depth':[10], 'min_samples_split':[2], 'criterion':['squared_error']}
# 5-fold cross-validated search over the grid, scored by R^2, using all CPU cores.
rf_reg= GridSearchCV(rf,parameter_list, cv=5, scoring="r2", n_jobs=-1).fit(x_train, y_train)
# Predict the test split; y_pred from this cell is what the plots below use.
y_pred =rf_reg.predict(x_test)
rmse.append(np.sqrt(mean_squared_error(y_test, y_pred)))
r2.append(r2_score(y_test, y_pred))

In [None]:
# One-row summary table of the test-set metrics, indexed by model name.
model_name = ['RandomForest']
result_df = pd.DataFrame({'RMSE':rmse,'R2_score': r2},index=model_name)
result_df

# Let's visualise the predictions vs. actual values for each feature separately with Random Forest

In [None]:
# Actual vs. predicted weight against Length1 on the test split.
# x_test was min-max scaled above, so the x-axis is in scaled units, not cm.
# Predictions are drawn as markers: a line joining them in row order zig-zags
# across the plot because the rows are not sorted by this feature.
plt.scatter(x_test['Length1'], y_test, color='red', alpha=0.6, label='Actual')
plt.scatter(x_test['Length1'], y_pred, color='blue', alpha=0.6, marker='x', label='Predicted')
plt.xlabel('Length1 (min-max scaled)')
plt.ylabel('Weight of the fish')
plt.legend()
plt.title('Random Forest Regressor Model for Weight Estimation');

In [None]:
# Actual vs. predicted weight against Length2 on the test split.
# x_test was min-max scaled above, so the x-axis is in scaled units, not cm.
# Predictions are drawn as markers: a line joining them in row order zig-zags
# across the plot because the rows are not sorted by this feature.
plt.scatter(x_test['Length2'], y_test, color='purple', alpha=0.6, label='Actual')
plt.scatter(x_test['Length2'], y_pred, color='green', alpha=0.6, marker='x', label='Predicted')
plt.xlabel('Length2 (min-max scaled)')
plt.ylabel('Weight of the fish')
plt.legend()
plt.title('Random Forest Regressor Model for Weight Estimation');

In [None]:
# Actual vs. predicted weight against Length3 on the test split.
# x_test was min-max scaled above, so the x-axis is in scaled units, not cm.
# Predictions are drawn as markers: a line joining them in row order zig-zags
# across the plot because the rows are not sorted by this feature.
plt.scatter(x_test['Length3'], y_test, color='purple', alpha=0.4, label='Actual')
plt.scatter(x_test['Length3'], y_pred, color='orange', alpha=0.4, marker='x', label='Predicted')
plt.xlabel('Length3 (min-max scaled)')
plt.ylabel('Weight of the fish')
plt.legend()
plt.title('Random Forest Regressor Model for Weight Estimation');

In [None]:
# Actual vs. predicted weight against Height on the test split.
# x_test was min-max scaled above, so the x-axis is in scaled units, not cm.
# Predictions are drawn as markers: a line joining them in row order zig-zags
# across the plot because the rows are not sorted by this feature.
plt.scatter(x_test['Height'], y_test, color='orange', alpha=0.4, label='Actual')
plt.scatter(x_test['Height'], y_pred, color='blue', alpha=0.4, marker='x', label='Predicted')
plt.xlabel('Height (min-max scaled)')
plt.ylabel('Weight of the fish')
plt.legend()
plt.title('Random Forest Regressor Model for Weight Estimation');

In [None]:
# Actual vs. predicted weight against Width on the test split.
# x_test was min-max scaled above, so the x-axis is in scaled units, not cm.
# Predictions are drawn as markers: a line joining them in row order zig-zags
# across the plot because the rows are not sorted by this feature.
plt.scatter(x_test['Width'], y_test, color='gray', alpha=0.5, label='Actual')
plt.scatter(x_test['Width'], y_pred, color='red', alpha=0.5, marker='x', label='Predicted')
plt.xlabel('Width (min-max scaled)')
plt.ylabel('Weight of the fish')
plt.legend()
plt.title('Random Forest Regressor Model for Weight Estimation');