# EDA (Exploratory Data Analysis) of the dataset

In this notebook, explore the Abalone dataset.

Add any relevant insight for future modelling.

# Imports

In [1]:
%load_ext autoreload
%autoreload 2

import os

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

# Let pandas display up to 500 columns so wide frames are not truncated in cell output.
pd.set_option('display.max_columns', 500)

# Data

In [4]:
# Relative path to the data directory; reused by the save cells further down.
DATA_FOLDER = "../data/"

# Read the raw Abalone CSV into a DataFrame.
raw_csv_path = os.path.join(DATA_FOLDER, "raw/abalone.csv")
df = pd.read_csv(raw_csv_path)

In [11]:
# Seeded hold-out split: 30% of the rows go to the test set.
TEST_SIZE = 0.3
RANDOM_STATE = 42
df_train, df_test = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# EDA

In [7]:
# Preview the first rows of the training split (the row index keeps the original row labels).
df_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
3930,F,0.67,0.535,0.185,1.597,0.6275,0.35,0.47,21
4087,I,0.595,0.475,0.155,0.984,0.4865,0.184,0.2755,10
3331,F,0.51,0.34,0.18,0.7005,0.312,0.165,0.2,11
2803,M,0.65,0.51,0.175,1.446,0.6485,0.2705,0.45,12
532,I,0.47,0.37,0.12,0.4705,0.1845,0.1055,0.155,12


In [6]:
# Column dtypes, non-null counts and memory footprint of the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2923 entries, 3930 to 4120
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             2923 non-null   object 
 1   Length          2923 non-null   float64
 2   Diameter        2923 non-null   float64
 3   Height          2923 non-null   float64
 4   Whole weight    2923 non-null   float64
 5   Shucked weight  2923 non-null   float64
 6   Viscera weight  2923 non-null   float64
 7   Shell weight    2923 non-null   float64
 8   Rings           2923 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 228.4+ KB


In [8]:
# Count missing values per column of the training split.
df_train.isna().sum()

Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
dtype: int64

In [9]:
def compute_target(df):
    """Return a new frame where ``Rings`` is replaced by an ``age`` target column.

    ``age`` is computed as ``Rings + 1.5``. The input frame is left untouched:
    the result is built with ``assign``/``drop`` rather than by writing a column
    into ``df``, so the caller's data is not modified as a side effect and no
    ``SettingWithCopyWarning`` is raised when ``df`` is a slice of another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``Rings`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows, an ``age`` column, and no ``Rings`` column.

    Raises
    ------
    KeyError
        If ``df`` has no ``Rings`` column.
    """
    return df.assign(age=df["Rings"] + 1.5).drop(columns=["Rings"])

In [12]:
# Swap the Rings column for the age target on the training split.
# NOTE: not idempotent — df_train is rebound to a frame without 'Rings', so running
# this cell a second time raises KeyError. Re-run the split cell first.
df_train = compute_target(df_train)

In [13]:
# Check that 'age' replaced 'Rings' in the training split.
# NOTE(review): the saved output below lists different rows than the earlier
# df_train.head() output, and the cell execution counts are out of order — the
# stored outputs appear to come from different kernel runs. Restart & Run All
# before relying on them.
df_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,age
2830,F,0.525,0.43,0.135,0.8435,0.4325,0.18,0.1815,10.5
925,I,0.43,0.325,0.1,0.3645,0.1575,0.0825,0.105,8.5
3845,M,0.455,0.35,0.105,0.416,0.1625,0.097,0.145,12.5
547,M,0.205,0.155,0.045,0.0425,0.017,0.0055,0.0155,8.5
2259,F,0.59,0.465,0.16,1.1005,0.506,0.2525,0.295,14.5


In [16]:
# Apply the same Rings -> age transformation to the test split.
# NOTE: not idempotent — df_test is rebound to a frame without 'Rings', so running
# this cell a second time raises KeyError. Re-run the split cell first.
df_test = compute_target(df_test)

In [17]:
# Check that 'age' replaced 'Rings' in the test split.
df_test.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,age
866,M,0.605,0.455,0.16,1.1035,0.421,0.3015,0.325,10.5
1483,M,0.59,0.44,0.15,0.8725,0.387,0.215,0.245,9.5
599,F,0.56,0.445,0.195,0.981,0.305,0.2245,0.335,17.5
1702,F,0.635,0.49,0.17,1.2615,0.5385,0.2665,0.38,10.5
670,M,0.475,0.385,0.145,0.6175,0.235,0.108,0.215,15.5


In [15]:
# Save the processed training split.
# Build the path with os.path.join (as the load cell does) and create the output
# directory first: to_csv does not create missing directories, so a fresh
# checkout without data/processed/ would otherwise fail here.
processed_dir = os.path.join(DATA_FOLDER, "processed")
os.makedirs(processed_dir, exist_ok=True)
# NOTE(review): the row index is written as an unnamed first column; pass
# index=False if downstream readers do not expect it.
df_train.to_csv(os.path.join(processed_dir, "train.csv"))

In [18]:
# Save the processed test split.
# Build the path with os.path.join (as the load cell does) and create the output
# directory first: to_csv does not create missing directories, so a fresh
# checkout without data/processed/ would otherwise fail here.
processed_dir = os.path.join(DATA_FOLDER, "processed")
os.makedirs(processed_dir, exist_ok=True)
# NOTE(review): the row index is written as an unnamed first column; pass
# index=False if downstream readers do not expect it.
df_test.to_csv(os.path.join(processed_dir, "test.csv"))