In [49]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [50]:
df=pd.read_csv('data/fitness_data.csv')

In [51]:
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,fitness_level
16,68,91.7,1.7,0.94,True,Orlando,retired,Low
80,55,95.8,1.67,50.0,False,Boston,unemployed,High
45,55,73.3,1.52,17.38,False,San Francisco,unemployed,Medium
42,21,68.2,1.77,2.46,True,Seattle,student,Medium
40,43,56.2,1.53,39.12,False,Boston,unemployed,Medium


In [52]:
df['occupation'].value_counts()

occupation
retired              26
unemployed           15
student              12
private_job          12
business_owner       10
government_job        9
freelancer            9
teacher               4
software_engineer     1
entrepreneur          1
manager               1
Name: count, dtype: int64

In [53]:
df['occupation'].unique()

array(['retired', 'freelancer', 'software_engineer', 'student',
       'business_owner', 'teacher', 'entrepreneur', 'private_job',
       'government_job', 'manager', 'unemployed'], dtype=object)

In [54]:
def age_group(age):
    if age < 18:
        return 'teen'
    elif 18 <= age < 30:
        return 'young_adult'
    elif 30 <= age < 50:
        return 'adult'
    else:
        return 'senior'

In [55]:
df_feat=df.copy()

### Feature 1 BMI

In [56]:
df_feat['bmi']=df_feat['weight']/(df_feat['height']/100)**2

### Feature 2: age group

In [57]:
df_feat['age_group']=df_feat['age'].apply(age_group)

### Feature 3: Income Category

In [58]:
def income_category(income):
    if income < 10:
        return 'low'
    elif 10 <= income < 30:
        return 'medium'
    else:
        return 'high'

df_feat['income_category'] = df_feat['income_lpa'].apply(income_category)

In [59]:
# Verify the new feature
df_feat[['income_lpa', 'income_category']].sample(10)

Unnamed: 0,income_lpa,income_category
32,48.78,high
5,35.56,high
84,0.62,low
79,30.0,high
20,28.77,medium
63,41.66,high
70,0.57,low
3,3.12,low
69,6.03,low
2,24.87,medium


In [60]:
# Define region/city categories (corrected)
tier_1_cities = ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix", "San Antonio", 
                 "San Diego", "Dallas", "San Francisco", "Seattle", "Boston", "Miami"]
tier_2_cities = [
    "Denver", "Atlanta", "Portland", "Austin",
    "Tampa", "Minneapolis", "Detroit", "Charlotte", "Las Vegas", "Philadelphia", "Nashville",
    "Kansas City", "Indianapolis", "Cleveland", "Baltimore", "Orlando"
]

In [61]:
# Check which cities are in the dataset
cities_in_data = set(df_feat['city'].unique())
cities_in_tiers = set(tier_1_cities + tier_2_cities)

print("Cities in dataset:", sorted(cities_in_data))
print("\nCities covered in tiers:", sorted(cities_in_tiers))
print("\nMissing cities (in data but not in tiers):", sorted(cities_in_data - cities_in_tiers))
print("\nExtra cities (in tiers but not in data):", sorted(cities_in_tiers - cities_in_data))

Cities in dataset: ['Atlanta', 'Boston', 'Chicago', 'Dallas', 'Denver', 'Houston', 'Las Vegas', 'Los Angeles', 'Miami', 'New York', 'Orlando', 'Phoenix', 'Portland', 'San Diego', 'San Francisco', 'Seattle']

Cities covered in tiers: ['Atlanta', 'Austin', 'Baltimore', 'Boston', 'Charlotte', 'Chicago', 'Cleveland', 'Dallas', 'Denver', 'Detroit', 'Houston', 'Indianapolis', 'Kansas City', 'Las Vegas', 'Los Angeles', 'Miami', 'Minneapolis', 'Nashville', 'New York', 'Orlando', 'Philadelphia', 'Phoenix', 'Portland', 'San Antonio', 'San Diego', 'San Francisco', 'Seattle', 'Tampa']

Missing cities (in data but not in tiers): []

Extra cities (in tiers but not in data): ['Austin', 'Baltimore', 'Charlotte', 'Cleveland', 'Detroit', 'Indianapolis', 'Kansas City', 'Minneapolis', 'Nashville', 'Philadelphia', 'San Antonio', 'Tampa']


In [62]:
# Verify corrected tier assignments
current_tier_1 = sorted([city for city in cities_in_data if city in tier_1_cities])
current_tier_2 = sorted([city for city in cities_in_data if city in tier_2_cities])

print("✅ CORRECTED Tier 1 cities in dataset:", current_tier_1)
print(f"   Count: {len(current_tier_1)}")
print("\n✅ CORRECTED Tier 2 cities in dataset:", current_tier_2)
print(f"   Count: {len(current_tier_2)}")
print(f"\n📊 Total cities covered: {len(current_tier_1) + len(current_tier_2)}/{len(cities_in_data)}")

✅ CORRECTED Tier 1 cities in dataset: ['Boston', 'Chicago', 'Dallas', 'Houston', 'Los Angeles', 'Miami', 'New York', 'Phoenix', 'San Diego', 'San Francisco', 'Seattle']
   Count: 11

✅ CORRECTED Tier 2 cities in dataset: ['Atlanta', 'Denver', 'Las Vegas', 'Orlando', 'Portland']
   Count: 5

📊 Total cities covered: 16/16


In [63]:
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3

In [64]:
df_feat['city_tier']=df_feat['city'].apply(city_tier)

In [65]:
X=df_feat[['bmi', 'age_group', 'income_category', 'city_tier', 'occupation']]
y=df_feat['fitness_level']

In [66]:
X

Unnamed: 0,bmi,age_group,income_category,city_tier,occupation
0,364314.968952,senior,low,1,retired
1,214934.982957,adult,medium,1,freelancer
2,225000.000000,young_adult,medium,1,software_engineer
3,226718.488035,young_adult,low,1,student
4,255923.420751,senior,low,1,retired
...,...,...,...,...,...
95,214207.472920,adult,medium,1,business_owner
96,473519.986507,young_adult,high,1,private_job
97,187654.320988,senior,high,2,freelancer
98,305216.761261,young_adult,medium,1,business_owner


In [67]:
y

0        Low
1       High
2     Medium
3     Medium
4        Low
       ...  
95    Medium
96      High
97      High
98      High
99      High
Name: fitness_level, Length: 100, dtype: object

In [68]:
categorical_features = ['age_group', 'income_category', 'occupation']
numerical_features = ['bmi', 'city_tier']

In [69]:
#create col for transformer for one hot encoding categorical features  
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [70]:
#create pipeline with preprocessor and random forest classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [71]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [72]:
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [73]:
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.70


In [74]:
X_test.sample(5)

Unnamed: 0,bmi,age_group,income_category,city_tier,occupation
45,317261.080332,senior,medium,1,unemployed
0,364314.968952,senior,low,1,retired
70,366942.14876,senior,low,2,retired
80,343504.607551,senior,high,1,unemployed
90,210937.5,senior,medium,1,business_owner


In [75]:
import pickle
#save the trained pipeline using a pickle file
pickle_model_path='model.pkl'
with open(pickle_model_path, 'wb') as f:
    pickle.dump(pipeline, f)