In [ ]:
# Step 0. Import libraries and custom modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [ ]:
# Step 1. Read the data
# 1.1 Load the CSV without treating any row as a header, then print its summary
df_raw = pd.read_csv(filepath_or_buffer='data/abalone-dataset.csv', header=None)
df_raw.info()

In [ ]:
# 1.2 Peek at six randomly chosen rows (seeded, so the same rows show on every run)
df_raw.sample(n=6, random_state=2024)

In [ ]:
# 1.3 Preprocess the data
# Names given to the columns: the features first, then the label.
label_column = 'rings'
feature_columns_name = [
    'sex',
    'length',
    'diameter',
    'height',
    'whole_weight',
    'shucked_weight',
    'viscera_weight',
    'shell_weight',
]

# 'sex' is kept as a string; every other feature is a 64-bit float.
feature_columns_dtype = {
    column: (str if column == 'sex' else np.float64)
    for column in feature_columns_name
}
# The label is cast to a 64-bit float as well.
label_column_dtype = {label_column: np.float64}

def merge_two_dicts(x, y):
    """Return a copy of ``x`` updated with the entries of ``y``.

    Neither input is modified. When a key is present in both, the value
    from ``y`` wins.
    """
    merged = x.copy()
    merged.update(y)
    return merged

# Parse the CSV with explicit column names and dtypes. read_csv keeps missing
# fields as NaN even in `str` columns, whereas casting an already-loaded frame
# with .astype(str) would turn them into the literal string 'nan' and hide
# them from dropna().
df_interim = pd.read_csv(
    'data/abalone-dataset.csv',
    header=None,
    names=feature_columns_name + [label_column],
    dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype),
)

# Move the label to the first column (features keep their declared order),
# then drop every row that still has a missing value.
df = df_interim[[label_column] + feature_columns_name].dropna()

df.sample(6, random_state=2024)

In [ ]:
# 1.4 Create the train and test splits
# Hold out 20% of the rows for testing, stratified on the 'sex' column.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=2024,
    stratify=df['sex'],
)

In [ ]:
# Step 3. Exploratory Data Analysis
# 3.1 Get basic info
# Only the training split (df_train) is inspected here.
df_train.info()

In [ ]:
# 3.2 Describe the data
# Summary statistics of the training split, transposed so that each
# described column becomes a row.
display(df_train.describe().transpose())
# 'sex' is described on its own, with include='all'.
display(df_train[['sex']].describe(include='all').transpose())

In [ ]:
# 3.3 Univariate analysis
# Histograms of the training split's columns on a 12x12 inch figure.
df_train.hist(figsize=(12, 12))
plt.show()

In [50]:
# 3.4 Bivariate analysis
# Pairwise plots of the training split, coloured by the 'sex' column.
sns.pairplot(data=df_train, hue='sex')
plt.show()

In [ ]:
# 3.5 Correlation analysis
# corr() must be *called* to obtain the correlation matrix; passing the bound
# method itself gives seaborn nothing to plot. Non-numeric columns (here the
# string-typed 'sex') are excluded first because a correlation cannot be
# computed for them.
correlation_matrix = df_train.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True)
plt.show()