In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [5]:
# Load the fitness dataset from a CSV path given relative to the current working directory.
df=pd.read_csv('data/fitness_data.csv')

In [6]:
# Peek at 5 randomly sampled rows. No random_state is passed here, so the
# rows shown may differ between runs.
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,fitness_level
41,65,62.1,1.63,3.87,False,Miami,retired,Low
1,34,68.1,1.78,28.65,False,New York,freelancer,High
2,27,72.9,1.8,24.87,False,San Diego,software_engineer,Medium
77,67,113.5,1.73,0.61,True,Chicago,retired,Low
50,55,67.2,1.88,25.23,True,Miami,private_job,High


In [7]:
# Count how many rows fall under each occupation label to see how balanced the categories are.
df['occupation'].value_counts()

occupation
retired              26
unemployed           15
student              12
private_job          12
business_owner       10
government_job        9
freelancer            9
teacher               4
software_engineer     1
entrepreneur          1
manager               1
Name: count, dtype: int64

In [8]:
# List the distinct occupation labels present in the data.
df['occupation'].unique()

array(['retired', 'freelancer', 'software_engineer', 'student',
       'business_owner', 'teacher', 'entrepreneur', 'private_job',
       'government_job', 'manager', 'unemployed'], dtype=object)

In [9]:
def age_group(age):
    """Map a numeric age to a coarse age-bracket label.

    Brackets are half-open on the upper side:
        age < 18        -> 'teen'
        18 <= age < 30  -> 'young_adult'
        30 <= age < 50  -> 'adult'
        otherwise       -> 'senior'

    Any value for which every ``<`` comparison is False (for example NaN)
    falls through to 'senior'.
    """
    # Ascending exclusive upper bounds; the first bound the age is below wins.
    brackets = ((18, 'teen'), (30, 'young_adult'), (50, 'adult'))
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return 'senior'

In [10]:
# Work on a copy so the engineered feature columns are added to df_feat
# while the loaded frame `df` keeps its original columns.
df_feat=df.copy()

### Feature 1: BMI

In [11]:
# BMI = weight / height**2, with height in metres (weight presumably in kg — confirm
# against the data dictionary). The sampled rows show heights of roughly 1.6-1.9,
# i.e. the column is already in metres, so it must NOT be divided by 100 first:
# doing so would inflate every BMI value by a factor of 10,000.
df_feat['bmi'] = df_feat['weight'] / df_feat['height'] ** 2

### Feature 2: Age Group

In [12]:
# Derive a categorical age bracket for every row by passing each age through age_group().
df_feat['age_group'] = df_feat['age'].map(age_group)

### Feature 3: Income Category

In [15]:
def income_category(income):
    """Bucket a numeric income value into 'low', 'medium' or 'high'.

    Boundaries are half-open on the upper side:
        income < 10        -> 'low'
        10 <= income < 30  -> 'medium'
        otherwise          -> 'high'

    A value for which both ``<`` comparisons are False (for example NaN)
    ends up as 'high'.
    """
    if income < 10:
        return 'low'
    # Not below 10 at this point, so only the upper boundary is left to check.
    return 'medium' if income < 30 else 'high'

# Label every row's income_lpa value with its income bucket.
df_feat['income_category'] = df_feat['income_lpa'].map(income_category)

In [16]:
# Spot-check the new feature: show 10 randomly sampled rows with the raw income
# next to its derived category so the bucket boundaries can be checked by eye.
df_feat[['income_lpa', 'income_category']].sample(10)

Unnamed: 0,income_lpa,income_category
35,40.64,high
62,35.67,high
39,12.54,medium
28,12.34,medium
70,0.57,low
14,14.02,medium
59,1.13,low
58,3.31,low
92,30.0,high
47,8.34,low


In [20]:
# City tier lookup lists (corrected assignment). A city belongs to a tier
# when its name appears in the corresponding list.
tier_1_cities = [
    "New York", "Los Angeles", "Chicago", "Houston",
    "Phoenix", "San Antonio", "San Diego", "Dallas",
    "San Francisco", "Seattle", "Boston", "Miami",
]
tier_2_cities = [
    "Denver", "Atlanta", "Portland", "Austin",
    "Tampa", "Minneapolis", "Detroit", "Charlotte",
    "Las Vegas", "Philadelphia", "Nashville", "Kansas City",
    "Indianapolis", "Cleveland", "Baltimore", "Orlando",
]

In [18]:
# Compare the cities that occur in the data with the cities the tier lists know about.
cities_in_data = set(df_feat['city'].unique())
cities_in_tiers = set(tier_1_cities) | set(tier_2_cities)

# Set differences in each direction: data cities with no tier, and tier entries never seen in the data.
unmapped_cities = cities_in_data - cities_in_tiers
unused_tier_cities = cities_in_tiers - cities_in_data

print("Cities in dataset:", sorted(cities_in_data))
print("\nCities covered in tiers:", sorted(cities_in_tiers))
print("\nMissing cities (in data but not in tiers):", sorted(unmapped_cities))
print("\nExtra cities (in tiers but not in data):", sorted(unused_tier_cities))

Cities in dataset: ['Atlanta', 'Boston', 'Chicago', 'Dallas', 'Denver', 'Houston', 'Las Vegas', 'Los Angeles', 'Miami', 'New York', 'Orlando', 'Phoenix', 'Portland', 'San Diego', 'San Francisco', 'Seattle']

Cities covered in tiers: ['Atlanta', 'Austin', 'Baltimore', 'Boston', 'Charlotte', 'Chicago', 'Cleveland', 'Dallas', 'Denver', 'Detroit', 'Houston', 'Indianapolis', 'Kansas City', 'Las Vegas', 'Los Angeles', 'Miami', 'Minneapolis', 'Nashville', 'New York', 'Orlando', 'Philadelphia', 'Phoenix', 'Portland', 'San Diego', 'San Francisco', 'Seattle', 'Tampa']

Missing cities (in data but not in tiers): []

Extra cities (in tiers but not in data): ['Austin', 'Baltimore', 'Charlotte', 'Cleveland', 'Detroit', 'Indianapolis', 'Kansas City', 'Minneapolis', 'Nashville', 'Philadelphia', 'Tampa']


In [None]:
# Split the cities that actually occur in the data by tier membership
# (sorted alphabetically) and report how many of them the two tiers cover.
current_tier_1 = sorted(cities_in_data.intersection(tier_1_cities))
current_tier_2 = sorted(cities_in_data.intersection(tier_2_cities))

print("✅ CORRECTED Tier 1 cities in dataset:", current_tier_1)
print(f"   Count: {len(current_tier_1)}")
print("\n✅ CORRECTED Tier 2 cities in dataset:", current_tier_2)
print(f"   Count: {len(current_tier_2)}")
print(f"\n📊 Total cities covered: {len(current_tier_1) + len(current_tier_2)}/{len(cities_in_data)}")

Current Tier 1 cities in dataset: ['Chicago', 'Dallas', 'Houston', 'Los Angeles', 'Miami', 'New York', 'San Francisco']
Current Tier 2 cities in dataset: ['Atlanta', 'Boston', 'Denver', 'Las Vegas', 'Orlando', 'Phoenix', 'Portland', 'San Diego', 'Seattle']
