# Data Preprocessing

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [None]:
# Load the raw measurements and discard the 'IDNO' column in one step.
df = pd.read_csv('./data/BodyFat.csv').drop(columns=['IDNO'])

# Rescale WEIGHT and HEIGHT, keeping one decimal place.
# NOTE(review): 0.453592 and 2.54 are the lb -> kg and in -> cm factors, so the
# raw file presumably stores pounds and inches -- confirm against the dataset.
df['WEIGHT'] = (df['WEIGHT'] * 0.453592).round(1)
df['HEIGHT'] = (df['HEIGHT'] * 2.54).round(1)

# Persist the converted table (no index column) for later use.
df.to_csv('./data/BodyFat_SI.csv', index=False)

# Show the first ten rows as the cell output.
df.head(10)

In [None]:
# Body fat recomputed from density (495 / density - 450) and its absolute
# distance from the recorded BODYFAT value, both rounded to one decimal.
density_bodyfat = (495 / df['DENSITY'] - 450).round(1)
bodyfat_gap = (density_bodyfat - df['BODYFAT']).abs().round(1)

# BMI recomputed from WEIGHT and HEIGHT (HEIGHT is divided by 100 before
# squaring) and its absolute distance from the recorded ADIPOSITY value.
height_scaled = df['HEIGHT'] / 100
weight_height_bmi = (df['WEIGHT'] / height_scaled ** 2).round(1)
bmi_gap = (weight_height_bmi - df['ADIPOSITY']).abs().round(1)

# Where recorded and recomputed values disagree by 3.0 or more, trust the
# recomputed one; otherwise keep the recorded value.
df['BODYFAT'] = np.where(bodyfat_gap >= 3.0, density_bodyfat, df['BODYFAT'])
# NOTE(review): ADIPOSITY is dropped on the next statement, so this correction
# never reaches the saved file -- confirm whether the drop is intended.
df['ADIPOSITY'] = np.where(bmi_gap >= 3.0, weight_height_bmi, df['ADIPOSITY'])

# Only the measured columns are kept for the following steps.
df = df.drop(columns=['DENSITY', 'ADIPOSITY'])

# Keep rows whose z-score is below 3 in absolute value for every column.
z_scores = np.abs(stats.zscore(df))
within_three_sigma = (z_scores < 3).all(axis=1)
df = df[within_three_sigma]

df.to_csv('./data/BodyFat_cleaned.csv', index=False)
df.head(10)

In [None]:
# One histogram per column of the cleaned frame, drawn on a 10x10 inch figure.
df.hist(figsize=(10, 10))

# Save through the current figure (the one df.hist just drew on), then display.
plt.gcf().savefig('plot/feature_distribution.pdf')
plt.show()

In [None]:
# Pairwise correlation between every pair of columns.
corr = df.corr()
axis_labels = corr.columns

# Annotated heatmap on a dedicated 12x10 inch figure; the same column names
# label both axes.
heatmap_figure = plt.figure(figsize=(12, 10))
sns.heatmap(data=corr, annot=True, xticklabels=axis_labels, yticklabels=axis_labels)
heatmap_figure.savefig('plot/correlation_matrix.pdf')
plt.show()

In [None]:
# Rank the features by the importance a decision-tree regressor assigns them.
# Column 0 of df is used as the regression target and every remaining column
# as a feature.
# NOTE(review): this relies on the target (presumably BODYFAT) being the first
# column of the cleaned frame -- confirm the column order.
X, y = df.iloc[:, 1:], df.iloc[:, 0]

# Pin the seed: scikit-learn permutes the features at every split, so without
# a fixed random_state tied splits can resolve differently between runs and
# the importance ranking (and the saved plot) would not be reproducible.
dt = DecisionTreeRegressor(random_state=0)
dt.fit(X, y)

# Pair each feature name with its importance score, highest score first.
feature = list(X.columns)
importance = dt.feature_importances_
feature_importance = sorted(
    zip(feature, importance),
    key=lambda pair: pair[1],
    reverse=True,
)

print('--- Feature Importance (from High to Low) ---')
for n, v in feature_importance:
    print(f'{n}, Score: {v}')

# Bar chart in the same descending order as the printed ranking.
ranked_names = [name for name, _ in feature_importance]
ranked_scores = [score for _, score in feature_importance]

plt.figure(figsize=(15, 6))
plt.bar(ranked_names, ranked_scores)
plt.title('Feature Importance (from High to Low)')
plt.xlabel('Feature')
plt.ylabel('Score')
plt.savefig('plot/feature_importance.pdf')
plt.show()