In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

# Let's Analyze our Data by importing it in our file.

In [2]:
df = pd.read_csv('./Train_Data.csv')

In [3]:
df

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,61.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,26.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,16.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,73577.0,32.0,1.0,2.0,28.9,104.0,,84.0,16.15,Adult
4,73580.0,38.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult
...,...,...,...,...,...,...,...,...,...,...
1961,83711.0,38.0,2.0,2.0,33.5,100.0,2.0,73.0,6.53,Adult
1962,83712.0,61.0,1.0,2.0,30.0,93.0,2.0,208.0,13.02,Adult
1963,83713.0,34.0,1.0,2.0,23.7,103.0,2.0,124.0,21.41,Adult
1964,83718.0,60.0,2.0,2.0,27.4,90.0,2.0,108.0,4.99,Adult


In [4]:
df.describe()

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
count,1954.0,1957.0,1948.0,1953.0,1948.0,1953.0,1948.0,1955.0,1957.0
mean,78683.621801,42.005621,1.510267,1.825397,27.9654,99.491039,2.015914,115.150384,11.862892
std,2924.115709,20.147601,0.500023,0.399449,7.327616,16.774665,0.187579,46.271615,9.756713
min,73564.0,12.0,1.0,1.0,14.5,63.0,1.0,40.0,0.14
25%,76194.0,24.0,1.0,2.0,22.8,91.0,2.0,87.0,5.8
50%,78717.0,41.0,2.0,2.0,26.8,97.0,2.0,105.0,9.03
75%,81217.0,58.0,2.0,2.0,31.3,104.0,2.0,131.0,14.48
max,83727.0,80.0,2.0,7.0,70.1,405.0,3.0,604.0,102.29


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1966 entries, 0 to 1965
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SEQN       1954 non-null   float64
 1   RIDAGEYR   1957 non-null   float64
 2   RIAGENDR   1948 non-null   float64
 3   PAQ605     1953 non-null   float64
 4   BMXBMI     1948 non-null   float64
 5   LBXGLU     1953 non-null   float64
 6   DIQ010     1948 non-null   float64
 7   LBXGLT     1955 non-null   float64
 8   LBXIN      1957 non-null   float64
 9   age_group  1952 non-null   object 
dtypes: float64(9), object(1)
memory usage: 153.7+ KB


In [6]:
df.count()

SEQN         1954
RIDAGEYR     1957
RIAGENDR     1948
PAQ605       1953
BMXBMI       1948
LBXGLU       1953
DIQ010       1948
LBXGLT       1955
LBXIN        1957
age_group    1952
dtype: int64

In [7]:
df[df['age_group']=='adult'].count()

SEQN         0
RIDAGEYR     0
RIAGENDR     0
PAQ605       0
BMXBMI       0
LBXGLU       0
DIQ010       0
LBXGLT       0
LBXIN        0
age_group    0
dtype: int64

## Let's make our data symmetrical first.

- SEQN: Sequence number (identifier)

- RIDAGEYR: Age in years i.e., Respondent's Age Group

- RIAGENDR: Respondent's Gender (1=Male, 2=Female)

- PAQ605: Physical activity questionnaire response: If the respondent engages in moderate or vigorous-intensity sports, fitness, or recreational activities in the typical week

- BMXBMI: Body Mass Index

- LBXGLU: Glucose level

- DIQ010: Diabetes questionnaire response

- LBXGLT: Glucose tolerance (Oral)

- LBXIN: Insulin level

Out of these we can be certain we do not need the sequence number in this dataset.

In [8]:
df.isna().sum()

SEQN         12
RIDAGEYR      9
RIAGENDR     18
PAQ605       13
BMXBMI       18
LBXGLU       13
DIQ010       18
LBXGLT       11
LBXIN         9
age_group    14
dtype: int64

In [9]:
# Drop rows where the target variable 'DIQ010' is missing
df_cleaned = df.dropna(subset=['age_group'])

In [10]:
df_cleaned.isna().sum()

SEQN         12
RIDAGEYR      9
RIAGENDR     18
PAQ605       13
BMXBMI       18
LBXGLU       13
DIQ010       18
LBXGLT       11
LBXIN         9
age_group     0
dtype: int64

In [11]:
tell = df_cleaned.isna().sum()

In [12]:
tell

SEQN         12
RIDAGEYR      9
RIAGENDR     18
PAQ605       13
BMXBMI       18
LBXGLU       13
DIQ010       18
LBXGLT       11
LBXIN         9
age_group     0
dtype: int64

In [13]:
missing_percentage = df_cleaned.isna().mean() * 100
print(missing_percentage)


SEQN         0.614754
RIDAGEYR     0.461066
RIAGENDR     0.922131
PAQ605       0.665984
BMXBMI       0.922131
LBXGLU       0.665984
DIQ010       0.922131
LBXGLT       0.563525
LBXIN        0.461066
age_group    0.000000
dtype: float64


In [14]:
missing_summary = pd.DataFrame({
    'Missing Count': df_cleaned.isna().sum(),
    'Missing %': df_cleaned.isna().mean() * 100,
    'Total Values': len(df_cleaned)
})
print(missing_summary.sort_values('Missing %', ascending=False))


           Missing Count  Missing %  Total Values
RIAGENDR              18   0.922131          1952
BMXBMI                18   0.922131          1952
DIQ010                18   0.922131          1952
PAQ605                13   0.665984          1952
LBXGLU                13   0.665984          1952
SEQN                  12   0.614754          1952
LBXGLT                11   0.563525          1952
RIDAGEYR               9   0.461066          1952
LBXIN                  9   0.461066          1952
age_group              0   0.000000          1952


In [15]:
# Numeric columns
for col in ['RIDAGEYR', 'BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']:
    df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].median())

# Categorical columns
for col in ['PAQ605', 'DIQ010']:
    df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mode()[0])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mode()[0])


In [16]:
missing_summary = pd.DataFrame({
    'Missing Count': df_cleaned.isna().sum(),
    'Missing %': df_cleaned.isna().mean() * 100,
    'Total Values': len(df_cleaned)
})
print(missing_summary.sort_values('Missing %', ascending=False))


           Missing Count  Missing %  Total Values
RIAGENDR              18   0.922131          1952
SEQN                  12   0.614754          1952
RIDAGEYR               0   0.000000          1952
PAQ605                 0   0.000000          1952
BMXBMI                 0   0.000000          1952
LBXGLU                 0   0.000000          1952
DIQ010                 0   0.000000          1952
LBXGLT                 0   0.000000          1952
LBXIN                  0   0.000000          1952
age_group              0   0.000000          1952


In [17]:
df_cleaned = df_cleaned.dropna(subset=['SEQN'])

In [18]:
missing_summary = pd.DataFrame({
    'Missing Count': df_cleaned.isna().sum(),
    'Missing %': df_cleaned.isna().mean() * 100,
    'Total Values': len(df_cleaned)
})
print(missing_summary.sort_values('Missing %', ascending=False))


           Missing Count  Missing %  Total Values
RIAGENDR              17   0.876289          1940
SEQN                   0   0.000000          1940
RIDAGEYR               0   0.000000          1940
PAQ605                 0   0.000000          1940
BMXBMI                 0   0.000000          1940
LBXGLU                 0   0.000000          1940
DIQ010                 0   0.000000          1940
LBXGLT                 0   0.000000          1940
LBXIN                  0   0.000000          1940
age_group              0   0.000000          1940


In [19]:
from sklearn.ensemble import RandomForestClassifier

# 1. Define features to use for prediction
features = ['RIDAGEYR', 'BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN', 'PAQ605', 'DIQ010']

# 2. Create two datasets: one with known genders, one with missing genders
df_gender_known = df_cleaned[df_cleaned['RIAGENDR'].notna()]
df_gender_missing = df_cleaned[df_cleaned['RIAGENDR'].isna()]

# 3. Ensure all features used for training have no missing values
X_train = df_gender_known[features]
y_train = df_gender_known['RIAGENDR']

# 4. Train the classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# 5. Predict gender for missing entries
X_test = df_gender_missing[features]
gender_pred = clf.predict(X_test)

# 6. Assign predicted gender back to df_cleaned
df_cleaned.loc[df_cleaned['RIAGENDR'].isna(), 'RIAGENDR'] = gender_pred


In [20]:
missing_summary = pd.DataFrame({
    'Missing Count': df_cleaned.isna().sum(),
    'Missing %': df_cleaned.isna().mean() * 100,
    'Total Values': len(df_cleaned)
})
print(missing_summary.sort_values('Missing %', ascending=False))

           Missing Count  Missing %  Total Values
SEQN                   0        0.0          1940
RIDAGEYR               0        0.0          1940
RIAGENDR               0        0.0          1940
PAQ605                 0        0.0          1940
BMXBMI                 0        0.0          1940
LBXGLU                 0        0.0          1940
DIQ010                 0        0.0          1940
LBXGLT                 0        0.0          1940
LBXIN                  0        0.0          1940
age_group              0        0.0          1940


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

In [22]:
# Features to use
features = ['RIDAGEYR', 'BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN', 'PAQ605', 'DIQ010', 'RIAGENDR']

# Input and target
X = df_cleaned[features]
y = df_cleaned['age_group']  # Assuming this is a multi-class categorical variable

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [24]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [25]:
y_pred = model.predict(X_test)

# Print full classification report including F1 score
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       Adult       1.00      1.00      1.00       326
      Senior       1.00      0.98      0.99        62

    accuracy                           1.00       388
   macro avg       1.00      0.99      1.00       388
weighted avg       1.00      1.00      1.00       388



In [26]:
print("Macro F1 Score:", f1_score(y_test, y_pred, average='macro'))
print("Weighted F1 Score:", f1_score(y_test, y_pred, average='weighted'))


Macro F1 Score: 0.99516926256552
Weighted F1 Score: 0.9974141769487981


In [27]:
external_test_df = pd.read_csv('./Test_Data.csv')

In [28]:
df_cleaned

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,61.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,26.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,16.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,73577.0,32.0,1.0,2.0,28.9,104.0,2.0,84.0,16.15,Adult
4,73580.0,38.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult
...,...,...,...,...,...,...,...,...,...,...
1961,83711.0,38.0,2.0,2.0,33.5,100.0,2.0,73.0,6.53,Adult
1962,83712.0,61.0,1.0,2.0,30.0,93.0,2.0,208.0,13.02,Adult
1963,83713.0,34.0,1.0,2.0,23.7,103.0,2.0,124.0,21.41,Adult
1964,83718.0,60.0,2.0,2.0,27.4,90.0,2.0,108.0,4.99,Adult


In [29]:
external_test_df

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
0,77017.0,34.0,1.0,1.0,32.2,96.0,2.0,135.0,15.11
1,75580.0,12.0,2.0,2.0,26.3,100.0,2.0,141.0,15.26
2,73820.0,56.0,1.0,2.0,28.6,107.0,2.0,136.0,8.82
3,80489.0,20.0,2.0,1.0,22.1,93.0,2.0,111.0,12.13
4,82047.0,64.0,1.0,1.0,24.7,91.0,2.0,105.0,3.12
...,...,...,...,...,...,...,...,...,...
307,74150.0,20.0,2.0,2.0,21.9,82.0,2.0,82.0,2.54
308,82550.0,34.0,2.0,1.0,33.3,95.0,2.0,77.0,6.36
309,77835.0,64.0,2.0,2.0,41.5,91.0,2.0,149.0,15.52
310,79281.0,23.0,2.0,2.0,22.5,82.0,2.0,93.0,1.39


In [30]:
# Numeric columns
for col in ['RIDAGEYR', 'BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']:
    external_test_df[col] = external_test_df[col].fillna(external_test_df[col].median())

# Categorical columns
for col in ['PAQ605', 'DIQ010']:
    external_test_df[col] = external_test_df[col].fillna(external_test_df[col].mode()[0])

In [31]:
missing_summary = pd.DataFrame({
    'Missing Count': external_test_df.isna().sum(),
    'Missing %': external_test_df.isna().mean() * 100,
    'Total Values': len(external_test_df)
})
print(missing_summary.sort_values('Missing %', ascending=False))

          Missing Count  Missing %  Total Values
SEQN                  2   0.641026           312
RIAGENDR              2   0.641026           312
RIDAGEYR              0   0.000000           312
PAQ605                0   0.000000           312
BMXBMI                0   0.000000           312
LBXGLU                0   0.000000           312
DIQ010                0   0.000000           312
LBXGLT                0   0.000000           312
LBXIN                 0   0.000000           312


In [32]:
X_gf = external_test_df[external_test_df['RIAGENDR'].isna()]

In [33]:
X_gf

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
44,77440.0,19.0,,2.0,27.5,90.0,2.0,97.0,10.32
178,82771.0,41.0,,2.0,36.8,106.0,2.0,131.0,8.22


In [34]:
features = ['RIDAGEYR', 'BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN', 'PAQ605', 'DIQ010']
X_gf_data = X_gf[features]

In [35]:
gender_pred_test = clf.predict(X_gf_data)

In [36]:
external_test_df.loc[external_test_df['RIAGENDR'].isna(), 'RIAGENDR'] = gender_pred_test

In [37]:
missing_summary = pd.DataFrame({
    'Missing Count': external_test_df.isna().sum(),
    'Missing %': external_test_df.isna().mean() * 100,
    'Total Values': len(external_test_df)
})
print(missing_summary.sort_values('Missing %', ascending=False))

          Missing Count  Missing %  Total Values
SEQN                  2   0.641026           312
RIDAGEYR              0   0.000000           312
RIAGENDR              0   0.000000           312
PAQ605                0   0.000000           312
BMXBMI                0   0.000000           312
LBXGLU                0   0.000000           312
DIQ010                0   0.000000           312
LBXGLT                0   0.000000           312
LBXIN                 0   0.000000           312


In [38]:
features = ['RIDAGEYR', 'BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN', 'PAQ605', 'DIQ010', 'RIAGENDR']
X_test = external_test_df[features]
y_pred = model.predict(X_test)

In [39]:
y_pred = pd.DataFrame(y_pred)

In [40]:
y_pred.columns = ['age_group']

In [41]:
y_pred['age_group'] = y_pred['age_group'].map({'Senior': 1, 'Adult': 0})

In [42]:
y_pred.to_csv('Submission-1.csv', index=False)