# EDA (Exploratory Data Analysis) of the dataset

In this notebook, explore the Abalone dataset by showing relevant visualizations that help you understand the problem you are modelling.

Please make sure to write down your conclusions in the final notebook and to remove these instructions.

# Imports

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
pd.set_option('display.max_columns', 500)

import kagglehub
# Download latest version
path = kagglehub.dataset_download("rodolfomendes/abalone-dataset")

print("Path to dataset files:", path)

: 

# Data

In [None]:
from pathlib import Path

# `kagglehub.dataset_download` returns the local location of the downloaded
# dataset, which is normally a directory rather than a single CSV file, so
# the CSV has to be located inside it before it can be read.
dataset_path = Path(path)
if dataset_path.is_file():
    # Already a direct file path: read it as-is.
    csv_path = dataset_path
else:
    csv_candidates = sorted(dataset_path.rglob("*.csv"))
    if not csv_candidates:
        raise FileNotFoundError(f"No CSV file found under {dataset_path}")
    # Sorted so the choice is deterministic if several CSV files are present.
    csv_path = csv_candidates[0]

df = pd.read_csv(csv_path)
print(df.head())

# EDA

In [None]:
# Per-column count of missing entries, computed with both pandas spellings
# (`isnull` and `isna`) and reported one after the other.
nb_null_values = df.isnull().sum()
print("number of null values: ", nb_null_values)

nb_nan_values = df.isna().sum()
print("number of nan values: ", nb_nan_values)

We see that we have 0 null values in every column (`isnull` and `isna` are aliases in pandas, so both counts are identical).

In [None]:
# Summary statistics of the data, to get a first look at each column's
# distribution.
summary_stats = df.describe()
print(summary_stats)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Columns whose distributions are plotted below, one histogram per column.
numerical_features = [
    "Length", "Diameter", "Height",
    "Whole weight", "Shucked weight", "Viscera weight", "Shell weight",
    "Rings",
]

for feature in numerical_features:
    # One standalone 10x6 figure per feature, using the explicit Axes API.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df[feature], bins=30, edgecolor='black', ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()


In [None]:
# Age estimate derived from the ring count.
# NOTE(review): Rings + 1.5 is presumably the age in years (the usual Abalone
# convention) — confirm against the dataset description.
df_age = df["Rings"] + 1.5

plt.figure(figsize=(10, 6))
sns.histplot(df_age, bins=30, edgecolor='black')
# Label the plot explicitly as age instead of reusing a loop variable from
# another cell, which would mislabel this plot (or fail if that cell has not
# been run).
plt.title('Distribution of Age')
plt.xlabel('Age (Rings + 1.5)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Plot the correlation matrix of the numeric columns.
# Correlation is only defined for numeric data: with pandas >= 2.0,
# `DataFrame.corr()` raises on a frame containing a non-numeric column
# (e.g. a categorical column, if the dataset has one), so the frame is
# restricted to numeric dtypes first.
corr_matrix = df.select_dtypes(include="number").corr()

# A larger figure keeps the per-cell annotations readable.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()