In [1]:
import pandas as pd

# Read the two source tables: data1 carries the user-profile columns,
# data2 carries the workout columns.
profile_csv = 'Datasets/archive/final_dataset.csv'
workout_csv = 'Datasets/archive(2)/megaGymDataset.csv'
data1 = pd.read_csv(profile_csv)
data2 = pd.read_csv(workout_csv)

# Preview the first five rows of each table, in load order.
for preview_frame in (data1, data2):
    print(preview_frame.head())

       Weight    Height        BMI  Gender  Age      BMIcase  \
0   92.085190  1.760250  29.719488  Female   59  over weight   
1   61.089124  1.595499  23.997776  Female   25       normal   
2   82.454037  1.816538  24.987499  Female   50       normal   
3  101.713306  1.790696  31.720047  Female   62        obese   
4   99.609527  1.969726  25.673756    Male   57  over weight   

   Exercise Recommendation Plan  
0                             5  
1                             4  
2                             4  
3                             6  
4                             5  
   Unnamed: 0                         Title  \
0           0        Partner plank band row   
1           1  Banded crunch isometric hold   
2           2         FYR Banded Plank Jack   
3           3                 Banded crunch   
4           4                        Crunch   

                                                Desc      Type    BodyPart  \
0  The partner plank band row is an abdominal exe.

In [2]:
# Column names each table is expected to provide.
# data1 (user profiles):
user_profile_columns = [
    'Weight',
    'Height',
    'BMI',
    'Gender',
    'Age',
    'BMIcase',
    'Exercise Recommendation Plan',
]
# data2 (workouts):
workout_columns = [
    'Title',
    'Desc',
    'Type',
    'BodyPart',
    'Equipment',
    'Level',
    'Rating',
    'RatingDesc',
]

In [3]:
# Fail fast if either table lacks a column that later cells read.
assert set(user_profile_columns).issubset(data1.columns), "data1 does not have all required columns"
assert set(workout_columns).issubset(data2.columns), "data2 does not have all required columns"

In [4]:
# Fill missing values in the user-profile table (data1).

# Numeric measurements: impute each column with its own median.
for numeric_column in ['Weight', 'Height', 'BMI', 'Age']:
    data1[numeric_column] = data1[numeric_column].fillna(data1[numeric_column].median())

# Text categoricals: route gaps into an explicit 'Unknown' bucket.
for text_column in ['Gender', 'BMIcase']:
    data1[text_column] = data1[text_column].fillna('Unknown')

# 'Exercise Recommendation Plan' holds integer plan ids (see the head() preview).
# Filling gaps with the string 'None' would leave a mixed str/number column, which
# the OneHotEncoder applied later cannot encode. Impute the most frequent plan id
# instead so the column stays uniformly numeric.
plan_column = 'Exercise Recommendation Plan'
plan_values = data1[plan_column]
if plan_values.isna().any():
    plan_modes = plan_values.mode()  # empty only when every value is missing
    if plan_modes.empty:
        # Nothing observed to impute from; a uniformly-string column is still encodable.
        data1[plan_column] = plan_values.fillna('None')
    else:
        plan_filled = plan_values.fillna(plan_modes.iloc[0])
        # Missing entries make pandas load integer ids as floats; restore the integer
        # dtype so the encoded categories match the no-missing-data case (5, not 5.0).
        if pd.api.types.is_float_dtype(plan_filled) and (plan_filled % 1 == 0).all():
            plan_filled = plan_filled.astype(int)
        data1[plan_column] = plan_filled

In [5]:
# Fill missing values in the workout table (data2).
# Numeric rating: impute with the column median.
rating_median = data2['Rating'].median()
data2['Rating'] = data2['Rating'].fillna(rating_median)

# Text columns: replace gaps with the literal 'Unknown'.
text_columns = ['Title', 'Desc', 'Type', 'BodyPart', 'Equipment', 'Level', 'RatingDesc']
data2[text_columns] = data2[text_columns].fillna('Unknown')

In [6]:
# Pin the numeric columns to the dtypes the rest of the notebook expects.
for float_column in ('Weight', 'Height', 'BMI'):
    data1[float_column] = data1[float_column].astype(float)

# Age becomes a whole number; the int cast truncates any fractional part.
data1['Age'] = data1['Age'].astype(int)

data2['Rating'] = data2['Rating'].astype(float)

In [7]:
# Report the row count of each table, one line per table.
for length_report in (
    f'Length of data1: {len(data1)}',
    f'Length of data2: {len(data2)}',
):
    print(length_report)

Length of data1: 5000
Length of data2: 2918


In [8]:
# Warn when the two tables hold a different number of rows.
lengths_match = len(data1) == len(data2)
if not lengths_match:
    print("Lengths do not match. You may need to align them.")

Lengths do not match. You may need to align them.


In [9]:
# Trim whichever table is longer down to the other's row count, so the two can be
# paired row-by-row by position. Equal-length tables are left untouched.
# NOTE(review): nothing visible in this notebook links a data1 row to the data2 row
# at the same position — confirm that positional pairing is actually meaningful
# before relying on a model trained on it.
row_gap = len(data1) - len(data2)
if row_gap > 0:
    data1 = data1.iloc[:len(data2)]
elif row_gap < 0:
    data2 = data2.iloc[:len(data1)]

In [10]:
# Confirm the trim worked: both row counts should now be equal.
for aligned_report in (
    f'Length of data1 after alignment: {len(data1)}',
    f'Length of data2 after alignment: {len(data2)}',
):
    print(aligned_report)

Length of data1 after alignment: 2918
Length of data2 after alignment: 2918


In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [12]:
# One-hot encode the categorical profile fields; every other column handed to the
# transformer is passed through unchanged. Categories not seen during fit are
# ignored rather than raising.
categorical_features = ['Gender', 'BMIcase', 'Exercise Recommendation Plan']
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
column_transformer = ColumnTransformer(
    transformers=[('cat', one_hot_encoder, categorical_features)],
    remainder='passthrough',
)

In [13]:
# Fit the transformer on the profile columns and encode them in one step.
# Categorical columns come first, followed by the numeric ones.
profile_feature_columns = ['Gender', 'BMIcase', 'Exercise Recommendation Plan', 'Weight', 'Height', 'BMI', 'Age']
profile_features = data1[profile_feature_columns]
X = column_transformer.fit_transform(profile_features)

In [14]:
# Wrap the encoded array in a DataFrame labelled with the transformer's output feature names.
encoded_feature_names = column_transformer.get_feature_names_out()
X_df = pd.DataFrame(data=X, columns=encoded_feature_names)

In [15]:
# Target: the workout 'Type' column from data2, expanded to one indicator column per type.
# NOTE(review): a 2-D indicator target is presumably treated by the classifier as a
# multi-output problem rather than a single multi-class label — confirm this is
# intended. The column order of the dummies is also not persisted anywhere visible.
y = data2['Type']
type_indicators = pd.get_dummies(y)
y_encoded = type_indicators.to_numpy()

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Build a 100-tree random forest with a fixed seed, then fit it on the training split.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [19]:
# Score the held-out rows: collapse each indicator row to the index of its largest
# entry, then compare true and predicted indices.
# NOTE(review): argmax of an all-zero row is 0, so a row for which the model predicts
# no class at all would be scored as the first class — confirm this cannot skew the
# reported accuracy.
y_pred = model.predict(X_test)
true_class_index = y_test.argmax(axis=1)
predicted_class_index = y_pred.argmax(axis=1)
accuracy = accuracy_score(true_class_index, predicted_class_index)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 87.50%


In [20]:
from joblib import dump

In [21]:
# Persist the fitted transformer so the same encoding can be reapplied outside this notebook.
transformer_path = './../Saved_Models/column_transformer.joblib'
dump(column_transformer, transformer_path)

['./../Saved_Models/column_transformer.joblib']

In [22]:
# Persist the trained random forest next to the saved transformer.
model_path = './../Saved_Models/model3.joblib'
dump(model, model_path)

['./../Saved_Models/model3.joblib']