In [55]:
# For this project we will need the following libraries and modules
import numpy as np
import os
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set()

In [56]:
gym_data = pd.read_csv(r'C:\Users\hp\Downloads\workout.csv')

In [57]:
# Let's explore the top 5 rows of our data frame:
gym_data.head()

Unnamed: 0,Age,Height,Weight,Body_Type,Index,Shape
0,56,194,72,Ectomorph,0,slender
1,69,151,68,Ectomorph,0,slender
2,46,176,58,Ectomorph,0,slender
3,32,180,84,Ectomorph,0,slender
4,60,185,70,Ectomorph,0,slender


In [58]:
# Descriptive statistics are very useful for initial exploration of the variables
# By default, only descriptives for the numerical variables are shown when we write:

gym_data.describe()

Unnamed: 0,Age,Height,Weight,Index
count,4500.0,4500.0,4500.0,4500.0
mean,48.712444,177.682889,81.212222,1.0
std,17.998332,12.56215,15.961573,0.816587
min,18.0,150.0,50.0,0.0
25%,33.0,168.0,70.0,0.0
50%,49.0,178.0,80.0,1.0
75%,65.0,188.0,92.0,2.0
max,79.0,199.0,119.0,2.0


In [59]:
# To see our columns and the data types in each:
gym_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Age        4500 non-null   int64 
 1   Height     4500 non-null   int64 
 2   Weight     4500 non-null   int64 
 3   Body_Type  4500 non-null   object
 4   Index      4500 non-null   int64 
 5   Shape      4500 non-null   object
dtypes: int64(4), object(2)
memory usage: 211.1+ KB


In [60]:
# We will keep columns of interest and drop others where and when necessary. 
data = gym_data.drop(['Body_Type',],axis=1)

# Let's check the descriptives without dropped columns
data.describe(include='all')

Unnamed: 0,Age,Height,Weight,Index,Shape
count,4500.0,4500.0,4500.0,4500.0,4500
unique,,,,,3
top,,,,,slender
freq,,,,,1500
mean,48.712444,177.682889,81.212222,1.0,
std,17.998332,12.56215,15.961573,0.816587,
min,18.0,150.0,50.0,0.0,
25%,33.0,168.0,70.0,0.0,
50%,49.0,178.0,80.0,1.0,
75%,65.0,188.0,92.0,2.0,


In [61]:
# We use: 
# data.isnull() 
# to show our data frame (df) with relevant information whether a data point is null 
# Since True = the data point is missing, while False = the data point is not missing, we can sum them
# This will give us the total number of missing values feature-wise
data.isnull().sum()

Age       0
Height    0
Weight    0
Index     0
Shape     0
dtype: int64

In [9]:
import pandas as pd

# Assuming df is your DataFrame containing the dataset
# Fill null values in the "Age" column with the median age
median_age = data['Age'].median()
data['Age'].fillna(median_age, inplace=True)

# Fill null values in the "Weight" column with the median weight
median_weight = data['Weight'].median()
data['Weight'].fillna(median_weight, inplace=True)


In [10]:
data.isnull().sum()

index     0
Height    0
Age       0
Weight    0
dtype: int64

In [None]:
# Random Forest Model

In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Perform one-hot encoding on the 'Shape' column
encoded_data = pd.get_dummies(data, columns=['Shape'])

# Split data into features (X) and target (y)
X = encoded_data.drop('Index', axis=1)  # Assuming 'Index' is the target column
y = encoded_data['Index']

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [71]:
# Initialize and train the RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


RandomForestClassifier(random_state=42)

In [72]:
# Make predictions on the testing set
y_pred = model.predict(X_test)

In [73]:
# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       311
           1       1.00      1.00      1.00       295
           2       1.00      1.00      1.00       294

    accuracy                           1.00       900
   macro avg       1.00      1.00      1.00       900
weighted avg       1.00      1.00      1.00       900



In [78]:
# Example input data
new_data = {
    'Age': [25],     # Age of the individual
    'Weight': [65],  # Weight of the individual
    'Height': [170], # Height of the individual
    'Shape_athletic': [0], # One-hot encoded shape: 1 if athletic, 0 otherwise
    'Shape_round': [0],    # One-hot encoded shape: 1 if round, 0 otherwise
    'Shape_slender': [1]   # One-hot encoded shape: 1 if slender, 0 otherwise
}

# Create a DataFrame from the input data
new_df = pd.DataFrame(new_data)

# Make predictions using the trained model
predictions = model.predict(new_df)

# Print the predictions
print("Predicted body type:", predictions[0])


Predicted body type: 0


In [82]:
# Example input data
new_data = {
    'Age': [35],     # Age of the individual
    'Weight': [75],  # Weight of the individual
    'Height': [170], # Height of the individual
    'Shape_athletic': [1], # One-hot encoded shape: 1 if athletic, 0 otherwise
    'Shape_round': [0],    # One-hot encoded shape: 1 if round, 0 otherwise
    'Shape_slender': [0]   # One-hot encoded shape: 1 if slender, 0 otherwise
}

# Create a DataFrame from the input data
new_df = pd.DataFrame(new_data)

# Make predictions using the trained model
predictions = model.predict(new_df)

# Map predicted labels to actual body types
body_types = {0: 'ectomorph', 1: 'mesomorph', 2: 'endomorph'}

# Print the predicted body type
print("Predicted body type:", body_types[predictions[0]])


Predicted body type: mesomorph
