In [1]:
import pandas as pd
import pyreadstat
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def convert_xpt_to_dataframe(xpt_file):
    df, _ = pyreadstat.read_xport(xpt_file)
    return df

In [3]:
import os
os.chdir('/Users/anika/Desktop/ABL-Web-App/NHANES_analysis')

### All participants with LDL-C levels

In [4]:
df_ldlc = convert_xpt_to_dataframe('data17-18/TRIGLY_J.XPT')
df_ldlc = df_ldlc[['SEQN', 'LBXTR', 'LBDLDL', 'LBDLDLSI']]
df_ldlc = df_ldlc.dropna(subset=['LBDLDL']) # ldl-c levels, using friedman formula and in mg/dL
len(df_ldlc)

2808

In [5]:
df2_ldlc = convert_xpt_to_dataframe('data15-16/TRIGLY_I.XPT')
df2_ldlc = df2_ldlc[['SEQN', 'LBXTR', 'LBDLDL', 'LBDLDLSI']]
df2_ldlc = df2_ldlc.dropna(subset=['LBDLDL']) # ldl-c levels, using friedman formula and in mg/dL
len(df2_ldlc)

2699

In [6]:
df3_ldlc = convert_xpt_to_dataframe('data13-14/TRIGLY_H.XPT')
df3_ldlc = df3_ldlc[['SEQN', 'LBXTR', 'LBDLDL', 'LBDLDLSI']]
df3_ldlc = df3_ldlc.dropna(subset=['LBDLDL']) # ldl-c levels, using friedman formula and in mg/dL
len(df3_ldlc)

3105

### All participants with APO-b levels

In [7]:
df2_apo = convert_xpt_to_dataframe('data15-16/APOB_I.XPT')
df2_apo = df2_apo[['SEQN', 'LBXAPB']]
df2_apo = df2_apo.dropna(subset=['LBXAPB']) # apo-b levels, in mg/dL
len(df2_apo)

2722

In [8]:
df3_apo = convert_xpt_to_dataframe('data13-14/APOB_H.XPT')
df3_apo = df3_apo[['SEQN', 'LBXAPB']]
df3_apo = df3_apo.dropna(subset=['LBXAPB'])  # apo-b levels, in mg/dL
len(df3_apo)

3145

### Blood pressure

In [17]:
df_bp = convert_xpt_to_dataframe('data17-18/BPX_J.XPT')
df_bp = df_bp[['SEQN', 'BPXSY1', 'BPXDI1']] # systolic and diastolic blood pressure (mmHg)
df_bp = df_bp.dropna(subset=['BPXSY1', 'BPXDI1'])
len(df_bp)

6302

In [18]:
df2_bp = convert_xpt_to_dataframe('data15-16/BPX_I.XPT')
df2_bp = df2_bp[['SEQN', 'BPXSY1', 'BPXDI1']] # systolic and diastolic blood pressure (mmHg)
df2_bp = df2_bp.dropna(subset=['BPXSY1', 'BPXDI1'])
len(df2_bp)

7145

In [19]:
df3_bp = convert_xpt_to_dataframe('data13-14/BPX_H.XPT')
df3_bp = df3_bp[['SEQN', 'BPXSY1', 'BPXDI1']] # systolic and diastolic blood pressure (mmHg)
df3_bp = df3_bp.dropna(subset=['BPXSY1', 'BPXDI1'])
len(df3_bp)

7172

### Body Measures

In [20]:
df_bm = convert_xpt_to_dataframe('data17-18/BMX_J.XPT')
# weight (kg) , height (cm), bmi (kg/m^2), waist and hip circumference (cm)
df_bm = df_bm[['SEQN', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'BMXHIP']]
df_bm = df_bm.dropna(subset=['BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'BMXHIP'])
len(df_bm)

6003

In [21]:
df2_bm = convert_xpt_to_dataframe('data15-16/BMX_I.XPT')
# weight (kg) , height (cm), bmi (kg/m^2), waist and hip circumference (cm)
df2_bm = df2_bm[['SEQN', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'BMDAVSAD']]
df2_bm = df2_bm.dropna(subset=['BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'BMDAVSAD'])
len(df2_bm)

6970

In [22]:
df3_bm = convert_xpt_to_dataframe('data13-14/BMX_H.XPT')
# weight (kg) , height (cm), bmi (kg/m^2), waist and hip circumference (cm)
df3_bm = df3_bm[['SEQN', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'BMDAVSAD']]
df3_bm = df3_bm.dropna(subset=['BMXWT', 'BMXHT', 'BMXBMI', 'BMXWAIST', 'BMDAVSAD'])
len(df3_bm)

7200

### Demographic data

In [23]:
df_demo = convert_xpt_to_dataframe('data17-18/DEMO_J.XPT')
df_demo = df_demo[['SEQN', 'RIAGENDR', 'RIDAGEYR', 'RIDRETH1']] # gender, age in years, race/hispanic origin
df_demo = df_demo.dropna(subset=['RIAGENDR', 'RIDAGEYR', 'RIDRETH1'])
len(df_demo)

9254

In [24]:
df2_demo = convert_xpt_to_dataframe('data15-16/DEMO_I.XPT')
df2_demo = df2_demo[['SEQN', 'RIAGENDR', 'RIDAGEYR', 'RIDRETH1']] # gender, age in years, race/hispanic origin
df2_demo = df2_demo.dropna(subset=['RIAGENDR', 'RIDAGEYR', 'RIDRETH1'])
len(df2_demo)

9971

In [25]:
df3_demo = convert_xpt_to_dataframe('data13-14/DEMO_H.XPT')
df3_demo = df3_demo[['SEQN', 'RIAGENDR', 'RIDAGEYR', 'RIDRETH1']] # gender, age in years, race/hispanic origin
df3_demo = df3_demo.dropna(subset=['RIAGENDR', 'RIDAGEYR', 'RIDRETH1'])
len(df3_demo)

10175

### General Health Status

In [26]:
df_general = convert_xpt_to_dataframe('data17-18/HSQ_J.XPT')
df_general = df_general[['SEQN', 'HSD010']] # general health condition (1 = excellent, 5 = poor)
df_general = df_general.dropna(subset=['HSD010'])
len(df_general)

5968

In [27]:
df2_general = convert_xpt_to_dataframe('data15-16/HSQ_I.XPT')
df2_general = df2_general[['SEQN', 'HSD010']] # general health condition (1 = excellent, 5 = poor)
df2_general = df2_general.dropna(subset=['HSD010'])
len(df2_general)

6166

In [28]:
df3_general = convert_xpt_to_dataframe('data13-14/HSQ_H.XPT')
df3_general = df3_general[['SEQN', 'HSD010']] # general health condition (1 = excellent, 5 = poor)
df3_general = df3_general.dropna(subset=['HSD010'])
len(df_general)

5968

### Disability

In [32]:
df_disability = convert_xpt_to_dataframe('data17-18/DLQ_J.XPT')
df_disability = df_disability[['SEQN', 'DLQ010', 'DLQ020', 'DLQ050']] # difficulty hearing, difficulty seeing, difficulty walking
df_disability = df_disability.dropna(subset=['DLQ020', 'DLQ050'])
len(df_disability)

8056

In [33]:
df2_disability = convert_xpt_to_dataframe('data15-16/DLQ_I.XPT')
df2_disability = df2_disability[['SEQN', 'DLQ020', 'DLQ050']] #difficulty seeing, difficulty walking
df2_disability = df2_disability.dropna(subset=['DLQ020', 'DLQ050'])
len(df2_disability)

8572

In [34]:
df3_disability = convert_xpt_to_dataframe('data13-14/DLQ_H.XPT')
df3_disability = df3_disability[['SEQN', 'DLQ020', 'DLQ050']] #difficulty seeing, difficulty walking
df3_disability = df3_disability.dropna(subset=['DLQ020', 'DLQ050'])
len(df3_disability)

8780

### Physical Activity

In [35]:
df_pa = convert_xpt_to_dataframe('data17-18/PAQ_J.XPT')
df_pa = df_pa[['SEQN', 'PAQ665']] #moderate recreational activities
df_pa = df_pa.dropna(subset=['PAQ665'])
len(df_pa)

5856

In [36]:
df2_pa = convert_xpt_to_dataframe('data15-16/PAQ_I.XPT')
df2_pa = df2_pa[['SEQN', 'PAQ665']]  #moderate recreational activities
df2_pa = df2_pa.dropna(subset=['PAQ665'])
len(df2_pa)

6963

In [37]:
df3_pa = convert_xpt_to_dataframe('data13-14/PAQ_H.XPT')
df3_pa = df3_pa[['SEQN', 'PAQ665']]  #moderate recreational activities
df3_pa = df3_pa.dropna(subset=['PAQ665'])
len(df3_pa)

7145

### Difficulties

In [206]:
df_difficulty = convert_xpt_to_dataframe('data17-18/PFQ_J.XPT')
df_difficulty = df_difficulty[['SEQN', 'PFQ061E']] # Difficulty lifting or carrying 
df_difficulty = df_difficulty.dropna(subset=['PFQ061E'])
len(df_difficulty)

2927

In [207]:
df2_difficulty = convert_xpt_to_dataframe('data15-16/PFQ_I.XPT')
df2_difficulty = df2_difficulty[['SEQN', 'PFQ061E']] # Difficulty lifting or carrying
df2_difficulty = df2_difficulty.dropna(subset=['PFQ061E'])
len(df2_difficulty)

2651

In [208]:
df3_difficulty = convert_xpt_to_dataframe('data13-14/PFQ_H.XPT')
df3_difficulty = df3_difficulty[['SEQN', 'PFQ061E']] # Difficulty lifting or carrying
df3_difficulty = df3_difficulty.dropna(subset=['PFQ061E'])
len(df3_difficulty)

2595

### Medical Conditions

In [72]:
# told doctor had trouble sleeping, How often feel overly sleepy during day?
df_med = convert_xpt_to_dataframe('data17-18/MCQ_J.XPT')
df_med = df_med[['SEQN', 'MCQ053', 'MCQ203', 'MCQ160L', 'MCQ160C']] # taking anemia medicine, jaundice, liver condition, coronary heart disease
df_med = df_med.dropna(subset=['MCQ053', 'MCQ203', 'MCQ160L', 'MCQ160C'])
len(df_med)

5569

In [73]:
df2_med = convert_xpt_to_dataframe('data15-16/MCQ_I.XPT')
df2_med = df2_med[['SEQN', 'MCQ053', 'MCQ203', 'MCQ160L', 'MCQ160C']]
df2_med = df2_med.dropna(subset=['MCQ053', 'MCQ203', 'MCQ160L', 'MCQ160C'])
len(df2_med)

5719

In [74]:
df3_med = convert_xpt_to_dataframe('data13-14/MCQ_H.XPT')
df3_med = df3_med[['SEQN', 'MCQ053', 'MCQ203', 'MCQ160L', 'MCQ160C']] 
df3_med = df3_med.dropna(subset=['MCQ053', 'MCQ203', 'MCQ160L', 'MCQ160C'])
len(df3_med)

5745

### Concatenation of dataframe

In [214]:
df_merged = df_ldlc.merge(df_bm, on='SEQN').merge(df_bp, on='SEQN').merge(df_demo, on='SEQN').merge(df_general, on='SEQN').merge(df_disability, on = 'SEQN').merge(df_pa, on = 'SEQN').merge(df_med, on = 'SEQN')
len(df_merged)

1921

In [576]:
df2_merged = df2_apo.merge(df2_bm, on='SEQN').merge(df2_bp, on='SEQN').merge(df2_demo, on='SEQN').merge(df2_general, on='SEQN').merge(df2_disability, on = 'SEQN').merge(df2_pa, on = 'SEQN').merge(df2_med, on = 'SEQN')
len(df2_merged)

1884

In [577]:
df3_merged = df3_apo.merge(df3_bm, on='SEQN').merge(df3_bp, on='SEQN').merge(df3_demo, on='SEQN').merge(df3_general, on='SEQN').merge(df3_disability, on = 'SEQN').merge(df3_pa, on = 'SEQN').merge(df3_med, on = 'SEQN')
len(df3_merged)

2061

In [578]:
df_all = pd.concat([df2_merged, df3_merged])
df_all

Unnamed: 0,SEQN,LBXAPB,BMXWT,BMXHT,BMXBMI,BMXWAIST,BMDAVSAD,BPXSY1,BPXDI1,RIAGENDR,RIDAGEYR,RIDRETH1,HSD010,DLQ020,DLQ050,PAQ665,MCQ053,MCQ203,MCQ160L,MCQ160C
0,83733.0,129.0,90.4,171.4,30.8,107.9,27.3,146.0,88.0,1.0,53.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
1,83734.0,129.0,83.4,170.1,28.8,116.5,26.6,138.0,46.0,1.0,78.0,3.0,4.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0
2,83737.0,81.0,64.4,150.0,28.6,92.9,23.1,116.0,58.0,2.0,72.0,1.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3,83741.0,72.0,76.6,165.4,28.0,86.6,19.2,110.0,70.0,1.0,22.0,4.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
4,83750.0,92.0,76.2,177.8,24.1,90.1,20.2,116.0,70.0,1.0,45.0,5.0,3.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2056,83708.0,90.0,175.2,188.3,49.4,157.6,36.5,94.0,74.0,1.0,64.0,3.0,4.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2057,83711.0,105.0,81.9,156.3,33.5,112.0,26.2,110.0,76.0,2.0,38.0,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2058,83712.0,91.0,90.8,174.1,30.0,107.3,26.8,124.0,70.0,1.0,61.0,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2059,83718.0,92.0,74.7,165.1,27.4,100.4,23.1,114.0,72.0,2.0,60.0,4.0,4.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0


### Feature Engineering

In [579]:
# Create new columns with ratio of waist to hip, and ratio of waist to height
# df_merged['waist_to_hip'] = df_merged['BMXWAIST'] / df_merged['BMXHIP']
# df_merged['waist_to_height'] = df_merged['BMXWAIST'] / df_merged['BMXHT']
df_all['waist_to_height'] = df_all['BMXWAIST'] / df_all['BMXHT']
df_all['sag_to_height'] = df_all['BMDAVSAD'] / df_all['BMXHT']

In [294]:
df_all = df_all[(df_all['BPXDI1'] < 200) & (df_all['BPXDI1'] > 60)]
df_all = df_all[(df_all['BPXSY1'] < 200) & (df_all['BPXSY1'] > 100)]
# df_all = df_all[(df_all['RIDAGEYR'] < 70) & (df_all['RIDAGEYR'] > 18)]
# df_all = df_all[(df_all['BMXBMI'] < 65) & (df_all['BMXBMI'] > 15)]

81

In [278]:
#Normalize all of the columns of the df between 0 and 1 except for columns SEQN, LBDLDL, and LBDLDL_class
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
exclude_columns = ['SEQN', 'LBDLDL', 'LBDLDLSI', 'LBXTR', 'LBXAPB']
for col in df_all.columns:
    if col not in exclude_columns:
        df_all[col] = scaler.fit_transform(df_all[[col]])
df_all

Unnamed: 0,SEQN,LBXAPB,BMXWT,BMXHT,BMXBMI,BMXWAIST,BMDAVSAD,BPXSY1,BPXDI1,RIAGENDR,RIDAGEYR,RIDRETH1,HSD010,DBQ700,DLQ020,DLQ050,PAQ665,WHQ030,waist_to_height,sag_to_height
0,83733.0,129.0,0.348739,0.533123,0.295585,0.421665,0.541818,0.444444,0.721311,0.0,0.578125,0.50,0.25,0.00,0.125,0.125,0.125,0.00,0.344382,0.495299
1,83734.0,129.0,0.306723,0.512618,0.257198,0.498657,0.516364,0.395062,0.377049,0.0,0.968750,0.50,0.75,0.75,0.125,0.125,0.125,0.25,0.415899,0.478655
2,83737.0,81.0,0.192677,0.195584,0.253359,0.287377,0.389091,0.259259,0.475410,1.0,0.875000,0.00,0.50,0.50,0.125,0.125,0.125,0.00,0.331223,0.464994
3,83741.0,72.0,0.265906,0.438486,0.241843,0.230976,0.247273,0.222222,0.573770,0.0,0.093750,0.75,0.50,0.50,0.125,0.125,0.125,0.25,0.207545,0.247222
4,83749.0,74.0,0.261705,0.380126,0.261036,0.335721,0.341818,0.259259,0.459016,1.0,0.015625,0.50,0.25,0.75,0.125,0.125,0.125,0.25,0.316476,0.354824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2321,83711.0,105.0,0.297719,0.294953,0.347409,0.458371,0.501818,0.222222,0.622951,1.0,0.343750,0.50,0.50,0.50,0.125,0.125,0.125,0.00,0.456817,0.543254
2322,83712.0,91.0,0.351140,0.575710,0.280230,0.416294,0.523636,0.308642,0.573770,0.0,0.703125,0.50,0.50,0.25,0.125,0.125,0.125,0.00,0.327321,0.464618
2323,83716.0,49.0,0.276110,0.673502,0.166987,0.198747,0.247273,0.185185,0.491803,0.0,0.015625,0.50,0.00,0.00,0.125,0.125,0.000,0.25,0.125868,0.192126
2324,83718.0,92.0,0.254502,0.433754,0.230326,0.354521,0.389091,0.246914,0.590164,1.0,0.687500,0.75,0.75,0.50,0.125,0.000,0.000,0.25,0.316735,0.384101


### NN Model

In [445]:
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [647]:
# - BMI
# - systolic blood pressure
# - diastolic blood pressure
# - gender
# - age
# - waist to height
# - saggital diameter to height
# - general health condition on a scale of 1-5
# - difficulty seeing
# - difficulty walking 
# - taking anemia medicine
# - doctor told you jaundice
# - doctor told you liver condition
# - doctor told you coronary heart disease

features = ['BMXBMI', 'BPXSY1', 'BPXDI1', 'RIAGENDR', 'RIDAGEYR', 'sag_to_height', 'waist_to_height',
            'HSD010', 'DLQ020', 'DLQ050', 'MCQ053', 'MCQ203', 'MCQ160L', 'MCQ160C']


In [648]:
X = df_all[features]
y = df_all['LBXAPB']

In [649]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=True)

train_X = torch.tensor(train_X.values).to(torch.float32)
test_X = torch.tensor(test_X.values).to(torch.float32)
train_y = torch.tensor(train_y.values).to(torch.float32)
test_y = torch.tensor(test_y.values).to(torch.float32)

# We want the size to be (X, 1) to avoid errors in future
train_y = torch.unsqueeze(train_y, 1)
test_y = torch.unsqueeze(test_y, 1)

# make datasets and dataloaders
train_dataset = TensorDataset(train_X, train_y)
test_dataset = TensorDataset(test_X, test_y)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=True) 


In [650]:
hidden_size = 64
model = nn.Sequential(
    nn.Linear(train_X.shape[1], hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, 1),
)

optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-4)
loss_fn = nn.MSELoss()

In [651]:
# Training loop
num_epochs = 15
epsilon = 1e-7  # small constant
for epoch in range(num_epochs):
    # Training
    model.train()
    total_loss = 0.0
    total_absolute_percentage_error = 0.0
    total_samples = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = loss_fn(outputs, batch_y.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        predicted = outputs
        absolute_percentage_error = (1 - (abs(predicted - batch_y) / (batch_y + epsilon))) * 100
        total_absolute_percentage_error += torch.sum(absolute_percentage_error).item()
        total_samples += batch_y.size(0)

    train_loss = total_loss / len(train_loader)
    train_mape = total_absolute_percentage_error / total_samples
 
    # Testing
    model.eval()
    total_loss = 0.0
    total_absolute_percentage_error = 0.0
    total_samples = 0
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            outputs = model(batch_X)
            loss = loss_fn(outputs, batch_y.float())
            total_loss += loss.item()
            predicted = outputs
            absolute_percentage_error = (1 - (abs(predicted - batch_y) / (batch_y + epsilon))) * 100
            total_absolute_percentage_error += torch.sum(absolute_percentage_error).item()
            total_samples += batch_y.size(0)

    test_loss = total_loss / len(test_loader)
    test_mape = total_absolute_percentage_error / total_samples

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train MAPE: {train_mape:.2f}%, Test Loss: {test_loss:.4f}, Test MAPE: {test_mape:.2f}%")


Epoch 1/15, Train Loss: 9774.8892, Train MAPE: -4.39%, Test Loss: 8895.5986, Test MAPE: 1.82%
Epoch 2/15, Train Loss: 8142.3126, Train MAPE: 6.13%, Test Loss: 7608.2723, Test MAPE: 10.47%
Epoch 3/15, Train Loss: 6977.1681, Train MAPE: 14.24%, Test Loss: 6491.7506, Test MAPE: 18.23%
Epoch 4/15, Train Loss: 5970.3466, Train MAPE: 21.84%, Test Loss: 5582.4754, Test MAPE: 25.68%
Epoch 5/15, Train Loss: 5101.5529, Train MAPE: 29.17%, Test Loss: 4694.5118, Test MAPE: 33.00%
Epoch 6/15, Train Loss: 4321.3024, Train MAPE: 36.48%, Test Loss: 3955.8309, Test MAPE: 40.16%
Epoch 7/15, Train Loss: 3579.4448, Train MAPE: 43.65%, Test Loss: 3236.2663, Test MAPE: 47.29%
Epoch 8/15, Train Loss: 2936.5686, Train MAPE: 50.67%, Test Loss: 2734.7855, Test MAPE: 54.27%
Epoch 9/15, Train Loss: 2385.0203, Train MAPE: 57.28%, Test Loss: 2133.1329, Test MAPE: 60.77%
Epoch 10/15, Train Loss: 1904.0563, Train MAPE: 63.44%, Test Loss: 1668.5668, Test MAPE: 66.50%
Epoch 11/15, Train Loss: 1512.0603, Train MAPE: 68.

In [594]:
torch.save(model, 'apob_pred.pth')

### Regression Model

In [202]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

In [203]:
X = df_all[features]
y = df_all['LBDLDL']

In [204]:
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [205]:
from sklearn.model_selection import ParameterGrid

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'random_state': [42]
}

param_combinations = ParameterGrid(param_grid)
best_score = float('-inf')
best_params = None 

for params in param_combinations:
    regressor = RandomForestRegressor(**params)
    cv_scores = cross_val_score(regressor, X, y, cv=5, scoring='neg_mean_absolute_percentage_error')
    mean_mape = -cv_scores.mean()
    
    if 1 - mean_mape > best_score:
        best_score = 1 - mean_mape
        best_params = params

print("Best Parameters:", best_params)
print("Best 1 - MAPE:", best_score)


Best Parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50, 'random_state': 42}
Best 1 - MAPE: 0.7029533868693598


In [None]:
from sklearn.model_selection import ParameterGrid
from xgboost import XGBRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'random_state': [42]
}

param_combinations = ParameterGrid(param_grid)
best_score = float('-inf')
best_params = None

for params in param_combinations:
    regressor = XGBRegressor(**params)
    cv_scores = cross_val_score(regressor, X, y, cv=5, scoring='neg_mean_absolute_percentage_error')
    mean_mape = -cv_scores.mean()
    
    if 1 - mean_mape > best_score:
        best_score = 1 - mean_mape
        best_params = params

print("Best Parameters:", best_params)
print("Best 1 - MAPE:", best_score)


Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'random_state': 42, 'subsample': 0.8}
Best 1 - MAPE: 0.7240778122571633
