In [59]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from functions import *
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

import datetime
import pickle
import os

import warnings
warnings.filterwarnings("ignore")

# Data Funnel Plot

In [60]:
ukb = pd.read_csv('processed/ukb_2023-03-13.csv', index_col=0)

ukb_pft = ukb.dropna(subset=['FEV1','FVC'])
ukb_pft['FEV1/FVC'] = ukb_pft['FEV1']/ukb_pft['FVC']
                     # Baseline 
ukb_clean = ukb_pft[['RIDRETH', 'RIDAGEYR', "RIAGENDR",
                     # Anthropometric
                     'BMXHT', 'BMXWT', 'BMXBMI', 'BMXWAIST', 
                     'BMXHIP', 'BMXSIT', #'BMXWT10', 'BMXHT10', 
                     # Smoking
                     "EXCL_SMOKED_100", "EXCL_SMOKER", 
                     # Exposures
                     'DMD_Mom_Smoked', 'DMD_Born_UK', 'DMD_Finish_HS',
                     'DMD_Home_Smoke', 'DMD_Veteran', 'DMD_PM10', 'DMD_PM2.5',
                     'DMD_PM2.5_10', 'DMD_Work_Fumes', 'DMD_Work_Exhaust',
                     'DMD_Work_Smoking_Exp', 'DMD_Work_Breathing_Probs', 'DMD_INC',
                     # Respiratory Disease/Symptoms 
                     "MCQ_LUNG_CANCER", "MCQ_RESPIRATORY_DISEASE", 
                     "SYMP_COUGH", "SYMP_PHLEGM", "SYMP_WHEEZING",
                     # PFT 
                     "FEV1","FVC", "FEV1/FVC"]].dropna()
ukb_clean = map_subregion(ukb_clean)
# Healthy cohort 
unhealthy = ["EXCL_SMOKED_100", "EXCL_SMOKER", 
                       "MCQ_LUNG_CANCER", "MCQ_RESPIRATORY_DISEASE", 
                       "SYMP_COUGH", "SYMP_PHLEGM", "SYMP_WHEEZING"]

rows_to_drop = ukb_clean[unhealthy].any(axis=1)
ukb_healthy = ukb_clean[~rows_to_drop]
ukb_healthy = ukb_healthy.drop('DMD_Born_UK', axis=1)
ukb_healthy = ukb_healthy.drop(unhealthy, axis=1)

In [66]:
# Get a list of categorical columns
X = ukb_healthy.drop(["FEV1","FVC", "FEV1/FVC"], axis=1)
y = ukb_healthy[["FEV1","FVC", "FEV1/FVC"]]

X = X.applymap(lambda x: 1 if x == True else x)
X = X.applymap(lambda x: 0 if x == False else x)

categorical_columns = list(ukb_healthy.select_dtypes(include=['object']).columns)
numerical_columns = list(set(ukb_healthy.columns) - set(categorical_columns))

# Convert categorical columns to one-hot encoded columns
for column in categorical_columns:
    one_hot = pd.get_dummies(ukb_healthy[column], prefix=column)
    X = X.drop(column, axis=1)
    X = X.join(one_hot)

# Scale features using StandardScaler
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X))
X_scaled.columns = X.columns
X_scaled.index=X.index

pd.concat([X, y], axis=1).to_csv('processed/ukb_2023-04-11.csv')

In [54]:
list(ukb_healthy.select_dtypes(include=['object']).columns)

['RIDRETH', 'DMD_INC', 'DMD_BORN_SubRegion']

In [39]:
# Create a natural cubic spline of age with 3 degrees of freedom
age_spline = dmatrix("cr(age, df=3)", {"age": X["RIDAGEYR"]}, return_type='dataframe')

# Concatenate the age spline with the height and sex variables
X = pd.concat([age_spline, X[["RIAGENDR", "BMXHT"]]], axis=1)

# Fit a multiple linear regression model
model = sm.OLS(y["FEV1"], X).fit()

# Predict pft for a new observation with age = 30, height = 170, and sex = "Male"
new_observation = pd.DataFrame({"age": [30], "height": [170], "sex_Male": [1]})
predict = model.predict(new_observation)


ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data). The types seen wereNone and Intercept           float64
cr(age, df=3)[0]    float64
cr(age, df=3)[1]    float64
cr(age, df=3)[2]    float64
RIAGENDR               bool
BMXHT               float64
dtype: object. The data was
1838829    2.93
5102879    3.83
2676396    2.48
5337461    2.70
5678670    3.40
           ... 
5908970    1.88
2880906    3.32
2555451    2.13
3362170    2.80
3318998    2.24
Name: FEV1, Length: 159693, dtype: float64
and
          Intercept  cr(age, df=3)[0]  cr(age, df=3)[1]  cr(age, df=3)[2]  \
1838829        1.0          0.240741          0.851852         -0.092593   
5102879        1.0         -0.085650          0.898573          0.187077   
2676396        1.0          0.358753          0.737040         -0.095793   
5337461        1.0         -0.092718          0.670284          0.422434   
5678670        1.0         -0.039444          0.987979          0.051465   
...            ...               ...               ...               ...   
5908970        1.0         -0.095862          0.797785          0.298077   
2880906        1.0         -0.059410          0.967304          0.092106   
2555451        1.0         -0.078888          0.521412          0.557476   
3362170        1.0          0.015833          0.998637         -0.014470   
3318998        1.0          0.015833          0.998637         -0.014470   

         RIAGENDR  BMXHT  
1838829      True  163.0  
5102879     False  182.0  
2676396      True  163.0  
5337461     False  168.0  
5678670     False  180.0  
...           ...    ...  
5908970      True  160.0  
2880906      True  171.0  
2555451      True  161.0  
3362170      True  161.0  
3318998      True  157.0  

[159693 rows x 6 columns]
before. After,
[2.93 3.83 2.48 ... 2.13 2.8  2.24]
[[1.0 0.2407407407407407 0.8518518518518519 -0.0925925925925926 True
  163.0]
 [1.0 -0.08564988730277988 0.8985725018782871 0.18707738542449281 False
  182.0]
 [1.0 0.35875281743050336 0.7370398196844478 -0.09579263711495117 True
  163.0]
 ...
 [1.0 -0.07888805409466566 0.521412471825695 0.5574755822689706 True
  161.0]
 [1.0 0.015833263767148085 0.9986365027687342 -0.014469766535882219 True
  161.0]
 [1.0 0.015833263767148085 0.9986365027687342 -0.014469766535882219 True
  157.0]].

In [40]:
X

Unnamed: 0,Intercept,"cr(age, df=3)[0]","cr(age, df=3)[1]","cr(age, df=3)[2]",RIAGENDR,BMXHT
1838829,1.0,0.240741,0.851852,-0.092593,True,163.0
5102879,1.0,-0.085650,0.898573,0.187077,False,182.0
2676396,1.0,0.358753,0.737040,-0.095793,True,163.0
5337461,1.0,-0.092718,0.670284,0.422434,False,168.0
5678670,1.0,-0.039444,0.987979,0.051465,False,180.0
...,...,...,...,...,...,...
5908970,1.0,-0.095862,0.797785,0.298077,True,160.0
2880906,1.0,-0.059410,0.967304,0.092106,True,171.0
2555451,1.0,-0.078888,0.521412,0.557476,True,161.0
3362170,1.0,0.015833,0.998637,-0.014470,True,161.0


In [7]:
X

Unnamed: 0,Intercept,"bs(age, knots=[25, 35], degree=3, include_intercept=False)[0]","bs(age, knots=[25, 35], degree=3, include_intercept=False)[1]","bs(age, knots=[25, 35], degree=3, include_intercept=False)[2]","bs(age, knots=[25, 35], degree=3, include_intercept=False)[3]","bs(age, knots=[25, 35], degree=3, include_intercept=False)[4]",sex,new_var
0,1.0,0.0,0.0,0.0,0.0,0.0,0,1
1,1.0,0.444444,0.522222,0.033333,0.0,0.0,1,2
2,1.0,0.055556,0.733333,0.204938,0.006173,0.0,0,3
3,1.0,0.0,0.544444,0.406173,0.049383,0.0,1,4
4,1.0,0.0,0.0,0.0,0.0,1.0,0,5
