In [19]:
from pandas import read_csv
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

In [20]:
# Load the dataset from the notebook's working directory.
dataframe = read_csv(filepath_or_buffer="data.csv")
# Preview the first five rows to confirm the columns parsed as expected.
dataframe.head(n=5)

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B


**LabelEncoder** is used to convert categorical data ("gender") into numerical form so that machine learning models can process it.

If the categorical variable has more than two categories, using One-Hot Encoding (pd.get_dummies) is often better to avoid unintended relationships.

In [21]:
# Encode the categorical 'gender' column as integers. The encoder is kept in
# `le` so the original string labels can be recovered later via `le.classes_`
# or `le.inverse_transform`.
le = LabelEncoder()
le.fit(dataframe['gender'])
# Overwrites the column in place; in the rows displayed further down, 'M' appears as 1.
dataframe['gender'] = le.transform(dataframe['gender'])

Split the dataset into features (`data`, conventionally called X) and labels (`labels`, conventionally called y) for machine learning training.


---



```
data = dataframe.loc[:, ['age', 'gender', 'height_cm', 'weight_kg',
       'body fat_%', 'diastolic', 'systolic', 'gripForce', 'sit and bend forward_cm',
       'sit-ups counts', 'broad jump_cm']]
```


Extracts specific columns from the DataFrame as input features for the model.

Uses .loc[:, [...]] to select multiple columns efficiently.
`data` now contains all the independent variables (features).


---


```
labels = dataframe['class']
```


Extracts only the "class" column, which is the target variable (output).

This column will be used as the ground truth for training the model.



In [22]:
# Input features: every column used to predict the target 'class'.
feature_columns = [
    'age',
    'gender',
    'height_cm',
    'weight_kg',
    'body fat_%',
    'diastolic',
    'systolic',
    'gripForce',
    'sit and bend forward_cm',
    'sit-ups counts',
    'broad jump_cm',
]
data = dataframe.loc[:, feature_columns]

# Target variable: the 'class' column, used as ground truth for training.
labels = dataframe['class']

In [23]:
# Display the feature frame to confirm that only the selected feature columns
# are present and that 'gender' is now numeric.
data

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm
0,27.0,1,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0
1,25.0,1,165.0,55.80,15.7,77.0,126.0,36.4,16.3,53.0,229.0
2,31.0,1,179.6,78.00,20.1,92.0,152.0,44.8,12.0,49.0,181.0
3,32.0,1,174.5,71.10,18.4,76.0,147.0,41.4,15.2,53.0,219.0
4,28.0,1,173.8,67.70,17.1,70.0,127.0,43.5,27.1,45.0,217.0
...,...,...,...,...,...,...,...,...,...,...,...
13388,25.0,1,172.1,71.80,16.2,74.0,141.0,35.8,17.4,47.0,198.0
13389,21.0,1,179.7,63.90,12.1,74.0,128.0,33.0,1.1,48.0,167.0
13390,39.0,1,177.2,80.50,20.1,78.0,132.0,63.5,16.4,45.0,229.0
13391,64.0,0,146.1,57.70,40.4,68.0,121.0,19.3,9.2,0.0,75.0


In [24]:
# Display the target series to confirm it holds the 'class' value for each row.
labels

Unnamed: 0,class
0,C
1,A
2,C
3,B
4,B
...,...
13388,C
13389,D
13390,A
13391,D


In [25]:
# Hold out 25% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
trainData, testData, trainLabels, testLabels = train_test_split(
    np.array(data),
    np.array(labels),
    test_size=0.25,
    random_state=42,
)

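# Create a Random Forest classifier with specific settings

The example below uses illustrative values. The cell that actually trains the model later in this notebook uses `n_estimators=100`, `random_state=50`, and `max_depth=50`.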



```
model = RandomForestClassifier(n_estimators=30, random_state=42,criterion='gini',max_depth=10)
```


**n_estimators=30** (Number of Trees)

>Defines how many Decision Trees the Random Forest will create.

>More trees generally make predictions more stable and accurate, with diminishing returns, but take longer to train.

>Typical values: 100-500 trees (but here, 30 is used for a smaller model).


---


 **random_state=42** (Reproducibility)
>Ensures that the same random numbers are used each time the model is trained.

>Helps in getting consistent results every time you run the code.

>42 is just a common choice, but any fixed integer works.


---


**criterion='gini'** (Splitting Strategy)
>Determines how the Decision Trees split the data.

>'gini' (Gini Impurity) → Measures impurity (lower is better).

>Alternative: 'entropy' (Information Gain), which is more computationally expensive.


---


**max_depth=10** (Tree Depth)

>Limits how deep each tree can grow.

>Prevents overfitting by restricting tree complexity.

>If set too low → Underfitting (too simple).

>If set too high → Overfitting (too complex).

In [98]:
# Build the forest that is actually trained and evaluated below.
# NOTE(review): these values differ from the markdown example earlier in the
# notebook (n_estimators=30, random_state=42, max_depth=10).
model = RandomForestClassifier(
    n_estimators=100,   # number of trees in the forest
    criterion='gini',   # split quality measured by Gini impurity
    max_depth=50,       # upper bound on the depth of each tree
    random_state=50,    # fixed seed for reproducible training
)

In [99]:
# Train the forest on the training split only; the test split stays unseen.
model.fit(X=trainData, y=trainLabels)

In [100]:
# Mean accuracy on the data the model was trained on. A score far above the
# test score below would point to overfitting.
print(model.score(X=trainData, y=trainLabels))

1.0


In [101]:
# Mean accuracy on the held-out split, i.e. on rows not used for fitting.
print(model.score(X=testData, y=testLabels))

0.7473872797850104


In [86]:
# Predict a class label for every row of the held-out split.
# NOTE(review): in the saved notebook this cell's execution count (In [86]) is
# lower than that of the cell creating `model` (In [98]), and the recorded
# report shows 0.73 accuracy against a test score of about 0.747. Re-run all
# cells in order so the predictions come from the current model.
predictions = model.predict(X=testData)

In [87]:
# Per-class precision, recall and F1, comparing the held-out ground truth
# against the model's predictions.
print(classification_report(y_true=testLabels, y_pred=predictions))

              precision    recall  f1-score   support

           A       0.74      0.83      0.78       857
           B       0.60      0.60      0.60       824
           C       0.70      0.68      0.69       800
           D       0.89      0.81      0.85       868

    accuracy                           0.73      3349
   macro avg       0.73      0.73      0.73      3349
weighted avg       0.74      0.73      0.73      3349

