2. Load and Explore Dataset

In [16]:
# Launch magic commands to automatically reload modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Import pandas and numpy package
import pandas as pd
import numpy as np

In [21]:
# Load the raw insurance dataset into a dataframe called df.
# The path is relative to the notebook, like the '../data/processed' and
# '../models' paths used further down, so the notebook does not depend on one
# user's home directory.
# NOTE(review): assumes the notebook sits in a sibling folder of data/ - confirm.
df = pd.read_csv('../data/raw/insurance.csv')

In [24]:
# Preview the top 5 rows of df
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,18,female,33.82,0,no,southeast,1630.6617
1,19,female,23.48,1,no,southeast,1836.8043
2,46,male,30.57,2,no,southeast,6632.3513
3,54,male,32.05,1,yes,southeast,31922.4295
4,21,male,21.345,4,no,northeast,1638.37255


In [25]:
# Display the dimensions of df as a (number of rows, number of columns) tuple
df.shape

(50000, 7)

In [26]:
# Display the summary (info) of df: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       50000 non-null  int64  
 1   sex       50000 non-null  object 
 2   bmi       50000 non-null  float64
 3   children  50000 non-null  int64  
 4   smoker    50000 non-null  object 
 5   region    50000 non-null  object 
 6   charges   50000 non-null  float64
dtypes: float64(2), int64(2), object(3)
memory usage: 2.7+ MB


In [27]:
# Display the descriptive statistics of df.
# include='all' reports on every column, so object columns get
# unique/top/freq while numeric columns get mean/std/quantiles.
df.describe(include='all')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,50000.0,50000,50000.0,50000.0,50000,50000,50000.0
unique,,2,,,2,4,
top,,male,,,no,southeast,
freq,,25176,,,38976,14197,
mean,39.46312,,30.713734,1.11376,,,13343.216363
std,14.117142,,6.092727,1.212835,,,12131.222744
min,18.0,,17.291,0.0,,,1137.5359
25%,27.0,,26.6,0.0,,,4694.4318
50%,40.0,,30.3,1.0,,,9399.232775
75%,51.0,,34.57,2.0,,,17340.746925


3. Prepare Data

In [32]:
# Work on an independent (deep) copy named df_cleaned so df itself stays untouched
df_cleaned = df.copy(deep=True)

In [33]:
# Extract the column charges from df_cleaned and save it into a variable called target.
# pop() removes the column in place, so running this cell a second time would
# raise a KeyError; the guard makes the cell safe to re-run (target then keeps
# the value extracted the first time).
if 'charges' in df_cleaned.columns:
    target = df_cleaned.pop('charges')

In [34]:
# Create 2 lists named num_cols and cat_cols containing respectively the names
# of the numerical and categorical columns.
# cat_cols keeps the dataframe's own column order. A set difference has no
# guaranteed order (string hashing is randomized per interpreter session), which
# would make the one-hot column order - and the saved encoder - vary between runs.
num_cols = list(df_cleaned.select_dtypes('number').columns)
cat_cols = [col for col in df_cleaned.columns if col not in num_cols]

In [35]:
# Import StandardScaler and OneHotEncoder from sklearn.preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [36]:
# Build the one-hot encoder: the first category of each feature is dropped
# and the encoded result is returned as a dense array
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
)

In [37]:
# Learn the categories from the categorical columns of df_cleaned, then encode
# those same columns and keep the resulting array in features
categorical_data = df_cleaned[cat_cols]
features = ohe.fit(categorical_data).transform(categorical_data)

In [38]:
# Wrap the encoded array in a dataframe whose columns are named after the
# encoder's output features
encoded_names = ohe.get_feature_names_out()
features = pd.DataFrame(features, columns=encoded_names)

In [39]:
# Instantiate the StandardScaler (default settings) used below for the numerical columns
scaler = StandardScaler()

In [40]:
# Fit the scaler on the numerical columns of df_cleaned, scale them and store
# the scaled values in features under the same column names.
# NOTE(review): the scaler is fitted on all rows before the train/val/test
# split performed later, so its statistics include validation and test rows -
# confirm this is acceptable or fit on the training rows only.
scaled_values = scaler.fit_transform(df_cleaned[num_cols])
features[num_cols] = scaled_values

In [41]:
# Import dump from joblib
from joblib import dump

In [42]:
# Save the one-hot encoder and scaler into the folder models and call the files
# respectively ohe.joblib and scaler.joblib.
# The folder is created first when missing, so the cell also works on a fresh
# checkout where ../models does not exist yet.
from pathlib import Path

Path('../models').mkdir(parents=True, exist_ok=True)
dump(ohe, '../models/ohe.joblib')
dump(scaler, '../models/scaler.joblib')

['../models/scaler.joblib']

4. Split Dataset

In [44]:
# Import train_test_split from sklearn.model_selection
from sklearn.model_selection import train_test_split

In [45]:
# Split the data into training, validation and testing sets.
# First hold out 20% of the rows for testing, then hold out 20% of the
# remaining rows for validation; both splits share the same settings.
split_kwargs = dict(test_size=0.2, random_state=8)
X_data, X_test, y_data, y_test = train_test_split(features, target, **split_kwargs)
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, **split_kwargs)

In [46]:
# Print the dimensions of X_train, X_val, X_test (one per line, in that order)
for feature_set in (X_train, X_val, X_test):
    print(feature_set.shape)

(32000, 8)
(8000, 8)
(10000, 8)


In [47]:
# Print the dimensions of y_train, y_val, y_test (one per line, in that order)
for target_set in (y_train, y_val, y_test):
    print(target_set.shape)

(32000,)
(8000,)
(10000,)


In [49]:
# Save the sets into the folder data/processed, one CSV file per set, named
# after the variable (X_train.csv, ..., y_test.csv) and written without the index.
# The folder is created first when missing so the cell works on a fresh checkout.
from pathlib import Path

processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

sets_to_save = {
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
}
for set_name, data_set in sets_to_save.items():
    data_set.to_csv(processed_dir / f'{set_name}.csv', index=False)

5. Get Baseline Model

In [50]:
# Calculate the average of the target variable for the training set and save it
# into a variable called pred_value; the baseline model predicts this single
# constant for every observation
pred_value = y_train.mean()

In [51]:
# Generate a numpy array with the same dimensions as y_train that contains only
# the value saved in pred_value.
# y_train is one-dimensional, so its own shape is reused; the previous
# (len(y_train), 1) column vector did not match the shape of y_train.
y_base = np.full(y_train.shape, pred_value)

In [52]:
# Import root_mean_squared_error and mean_absolute_error from sklearn.metrics
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.metrics import mean_absolute_error as mae

In [53]:
# Display the RMSE and MAE scores of this baseline model on the training set.
# Both metrics take the ground truth first and the predictions second; the
# scores themselves are symmetric, so the printed values are unchanged.
print(rmse(y_train, y_base))
print(mae(y_train, y_base))

12116.584822448176
9118.852804794265


6. Push Changes
   
    # Add your changes to git staging area
    git add . 
    # Create the snapshot of your repository and add a description
    git commit -m "prepare data and baseline"
    # Push your snapshot to GitHub
    git push
    # Stop Jupyter Lab
    ctrl+c
   