In [1]:
# import libraries
import pandas as pd

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Random Forest
from sklearn.ensemble import RandomForestRegressor

# train/test splitting
from sklearn.model_selection import train_test_split
# to determine the p-values with anova
from sklearn.feature_selection import f_regression, SelectKBest

# model evaluation
from sklearn.metrics import mean_squared_error

# for pipeline
from sklearn.pipeline import make_pipeline

# for removing correlated features
from feature_engine.selection import DropCorrelatedFeatures

# for model persistence
import pickle

In [2]:
# Read the body-fat dataset into a DataFrame and preview the first rows.
# Source: https://www.kaggle.com/datasets/fedesoriano/body-fat-prediction-dataset

DATA_FILE = 'bodyfat.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,Density,BodyFat,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,Wrist
0,1.0708,12.3,23,154.25,67.75,36.2,93.1,85.2,94.5,59.0,37.3,21.9,32.0,27.4,17.1
1,1.0853,6.1,22,173.25,72.25,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9,18.2
2,1.0414,25.3,22,154.0,66.25,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2,16.6
3,1.0751,10.4,26,184.75,72.25,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4,18.2
4,1.034,28.7,24,184.25,71.25,34.4,97.3,100.0,101.9,63.2,42.2,24.0,32.2,27.7,17.7


In [3]:
# summarise the columns: dtype and non-null count for each
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 15 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Density  252 non-null    float64
 1   BodyFat  252 non-null    float64
 2   Age      252 non-null    int64  
 3   Weight   252 non-null    float64
 4   Height   252 non-null    float64
 5   Neck     252 non-null    float64
 6   Chest    252 non-null    float64
 7   Abdomen  252 non-null    float64
 8   Hip      252 non-null    float64
 9   Thigh    252 non-null    float64
 10  Knee     252 non-null    float64
 11  Ankle    252 non-null    float64
 12  Biceps   252 non-null    float64
 13  Forearm  252 non-null    float64
 14  Wrist    252 non-null    float64
dtypes: float64(14), int64(1)
memory usage: 29.7 KB


In [3]:
# Hold out 30% of the rows for testing; BodyFat is the regression target and
# every other column is kept as a candidate predictor.
# NOTE(review): Density stays among the predictors. If BodyFat in this dataset
# was computed from Density, that is target leakage -- confirm before relying
# on the reported errors.
features = df.drop(columns=['BodyFat'])
target = df['BodyFat']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=39,
)

X_train.shape, X_test.shape

((176, 14), (76, 14))

## Removing Correlated Features

In [5]:
from feature_engine.selection import DropCorrelatedFeatures

In [6]:
# Correlation filter: drops features whose Pearson correlation with another
# feature is above the 0.8 threshold.
sel = DropCorrelatedFeatures(method='pearson', threshold=0.8, missing_values='ignore')

In [7]:
# fit the correlation filter on the training split only
sel.fit(X_train)

DropCorrelatedFeatures(variables=['Density', 'Age', 'Weight', 'Height', 'Neck',
                                  'Chest', 'Abdomen', 'Hip', 'Thigh', 'Knee',
                                  'Ankle', 'Biceps', 'Forearm', 'Wrist'])

In [8]:
# Apply the fitted correlation filter to both splits.
# NOTE(review): `train` / `test` are not referenced by any later cell shown in
# this notebook (feature selection and the model use X_train / X_test) --
# confirm whether the filtered frames were meant to feed SelectKBest.
train, test = (sel.transform(pd.DataFrame(split)) for split in (X_train, X_test))

train.shape, test.shape

((176, 7), (76, 7))

## Select only the best 5 features

In [9]:
# Univariate selection: keep the 5 features that score best under f_regression.
# NOTE(review): the selector is fitted on X_train (all 14 columns), not on the
# correlation-filtered `train` frame, so the DropCorrelatedFeatures step does
# not influence which features are kept -- confirm this is intended.
sel_ = SelectKBest(
    score_func=f_regression,
    k=5,
).fit(X_train, y_train)

In [10]:
# column names retained by the fitted SelectKBest selector
selected_mask = sel_.get_support()
X_train.columns[selected_mask]

Index(['Density', 'Weight', 'Chest', 'Abdomen', 'Hip'], dtype='object')

In [11]:
# Reduce both splits to the selected columns.
# NOTE: X_train / X_test are rebound here, so re-running this cell on its own
# would feed the already-reduced data back into the selector; re-run the split
# cell first.
X_train, X_test = map(sel_.transform, (X_train, X_test))

## Random Forest Regressor

In [12]:
# Random forest hyper-parameters: 10 trees of depth 2, 2 parallel jobs, fixed seed.
rf_params = {
    'n_estimators': 10,
    'max_depth': 2,
    'n_jobs': 2,
    'random_state': 39,
}
clf = RandomForestRegressor(**rf_params)

# train on the selected training features
clf.fit(X_train, y_train)

RandomForestRegressor(max_depth=2, n_estimators=10, n_jobs=2, random_state=39)

## Model Evaluation

In [13]:
# model predictions for the training split and the held-out test split
train_preds, test_preds = clf.predict(X_train), clf.predict(X_test)

In [14]:
# The reported "score" is the mean squared error of each split (lower is better).
train_mse = mean_squared_error(y_train, train_preds)
test_mse = mean_squared_error(y_test, test_preds)

print('Training data score : {}'.format(train_mse))
print('Test data score : {}'.format(test_mse))

Training data score : 5.670209421147411
Test data score : 4.792522718971122


In [16]:
# Persist the fitted model to disk.
# The context manager closes the file handle (flushing the pickle to disk)
# even if pickling raises, instead of leaving the handle to the garbage collector.
with open('body_fat_estimator.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [17]:
# Reload the persisted model and predict on the test split as a round-trip check.
# The file is read inside a context manager so the handle is always closed.
# NOTE: pickle.load can execute arbitrary code -- only load files this notebook
# (or another trusted source) produced.
with open('body_fat_estimator.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

pickled_model.predict(X_test)

array([16.00131879, 25.21436805, 32.08398454, 32.08398454, 22.6558852 ,
       22.6558852 , 22.6558852 , 22.6558852 , 14.30326978, 22.6558852 ,
       14.30326978, 22.6558852 , 23.49722675, 14.30326978, 14.30326978,
       20.09862916, 23.49722675,  6.87336464, 10.47299581, 16.86804915,
       14.30326978, 22.6558852 , 22.6558852 , 32.08398454, 15.08336244,
        6.87336464, 16.00131879, 24.43114209, 14.30326978, 22.6558852 ,
       22.6558852 , 15.08336244, 22.6558852 , 22.6558852 , 22.6558852 ,
       25.21436805, 22.6558852 ,  6.87336464, 14.30326978, 23.49722675,
        6.87336464,  6.87336464,  6.87336464,  8.88782435, 14.30326978,
       22.6558852 , 22.6558852 , 14.30326978,  6.87336464, 10.47299581,
       25.21436805,  8.88782435, 22.6558852 , 24.43114209, 32.08398454,
       20.09862916, 23.49722675, 32.08398454, 16.86804915, 14.30326978,
       32.08398454, 14.30326978, 14.30326978,  6.87336464,  6.87336464,
       23.49722675, 32.08398454, 32.08398454, 20.09862916, 14.30

### Test with unseen data

- The selected columns to be used are 
['Density', 'Weight', 'Chest', 'Abdomen', 'Hip']

In [24]:
# take the row at position 1 of the test features as a single sample
test_value = X_test[1]
test_value

array([  1.0372, 219.15  , 117.6   , 113.8   , 111.8   ])

In [26]:
# actual BodyFat value for the same test row (position 1)
y_test.iloc[1]

27.3

In [27]:
# predict with the reloaded model; the sample is wrapped in a list so the
# input is a single-row 2-D structure
pickled_model.predict([test_value])

array([25.21436805])