In [1]:
import os
import pyreadstat
import pandas as pd
from functools import reduce

# Lift the row limit so displayed DataFrames/Series are never truncated.
pd.options.display.max_rows = None

In [10]:
import os
import pyreadstat

def load_data(base_path, file_paths, columns_to_keep, period=None):
    """
    Load the XPT files of one survey period, keeping only the requested columns.

    Parameters
    ----------
    base_path : str
        Directory that contains the period's files.
    file_paths : dict
        Maps a dataset key to its file name, relative to ``base_path``.
    columns_to_keep : dict
        Maps each dataset key to the list of columns retained from that file.
    period : str, optional
        Period label (e.g. ``"2017-2020"``) that selects the period-specific
        encoding overrides below. When omitted, the function falls back to a
        module-level variable named ``period`` if one exists -- the name this
        function previously always read implicitly -- so existing callers
        that do not pass it keep working. If neither is available, every
        file is read with the default settings.

    Returns
    -------
    dict
        Dataset key -> column-filtered data. Files that are missing or that
        fail to load are reported with ``print`` and left out of the result.
    """
    if period is None:
        # Legacy behaviour: pick up the loop variable leaked by the caller.
        period = globals().get("period")

    # (period, dataset key) pairs whose files must be decoded as latin1.
    latin1_files = {
        ("2017-2020", "LabData_insulin"),
        ("2021-2023", "demographics"),
    }

    dataframes = {}
    for key, relative_path in file_paths.items():
        # Construct the full file path
        file_path = os.path.join(base_path, relative_path)
        try:
            # Skip (but report) files that are not on disk
            if not os.path.exists(file_path):
                print(f"Error: File '{file_path}' does not exist!")
                continue

            if (period, key) in latin1_files:
                data, _ = pyreadstat.read_xport(file_path, encoding="latin1")
            else:
                # Default loading
                data, _ = pyreadstat.read_xport(file_path)

            # Filter columns based on the specified list; a missing key or
            # column raises here and is reported by the handler below.
            dataframes[key] = data[columns_to_keep[key]]
            print(f"The file '{key}' has been successfully loaded.")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole period.
            print(f"Error loading the file '{key}': {e}")
    return dataframes

# Directory holding the raw files of each survey period, keyed by period label.
base_paths = {
    "2017-2020": "../data/raw/2017-2020",
    "2015-2016": "../data/raw/2015-2016",
}

# File name of each dataset, relative to a period directory.
# NOTE(review): the same names are applied to every entry of base_paths, yet
# the notebook's recorded output reports nothing to merge for 2015-2016 --
# confirm that files with these names exist under that directory.
file_paths = {
    # Body measures, demographics, blood pressure
    "body_measures": "P_BMX.xpt",
    "demographics": "P_DEMO.xpt",
    "blood_pressure": "P_BPXO.xpt",
    # LabData_*
    "LabData_fast_glucose": "P_GLU.xpt",
    "LabData_glycohemog": "P_GHB.xpt",
    "LabData_insulin": "P_INS.xpt",
    "LabData_Cholest_total": "P_TCHOL.xpt",
    "LabData_Cholest_HDL": "P_HDL.xpt",
    "LabData_Cholest_LDL": "P_TRIGLY.xpt",
    "LabData_C_protein": "P_HSCRP.xpt",
    "LabData_Feretin": "P_FERTIN.xpt",
    # SurveyData_*
    "SurveyData_Alcohol": "P_ALQ.xpt",
    "SurveyData_Pressure_Cholest": "P_BPQ.xpt",
    "SurveyData_Diabetes": "P_DIQ.xpt",
    "SurveyData_Diet_Behavior": "P_DBQ.xpt",
    "SurveyData_Insurance": "P_HIQ.xpt",
    "SurveyData_Income": "P_INQ.xpt",
    "SurveyData_Medical": "P_MCQ.xpt",
    "SurveyData_MentalHealth": "P_DPQ.xpt",
    "SurveyData_PhysicalActivity": "P_PAQ.xpt",
    "SurveyData_SleepDisorders": "P_SLQ.xpt",
    "SurveyData_Smoking": "P_SMQ.xpt",
    "SurveyData_WeightHistory": "P_WHQ.xpt",
}

# Columns retained from each dataset (same keys as file_paths). Every list
# starts with "SEQN", the column the datasets are later merged on.
columns_to_keep = {
    # Body measures, demographics, blood pressure
    "body_measures": [
        "SEQN", "BMXWT", "BMXHT", "BMXBMI", "BMXWAIST", "BMXHIP",
    ],
    "demographics": [
        "SEQN", "SDDSRVYR", "RIAGENDR", "RIDAGEYR", "RIDRETH3", "DMDBORN4",
        "DMDEDUC2", "RIDEXPRG", "INDFMPIR", "DMDMARTZ", "WTINTPRP", "WTMECPRP",
    ],
    "blood_pressure": [
        "SEQN",
        "BPXOSY1", "BPXOSY2", "BPXOSY3",
        "BPXODI1", "BPXODI2", "BPXODI3",
        "BPXOPLS1", "BPXOPLS2", "BPXOPLS3",
    ],
    # Disabled alternative column set, kept for reference:
    # "blood_pressure_2015": ["SEQN", "BPXSY1", "BPXSY2", "BPXSY3", "BPXDI1", "BPXDI2", "BPXDI3", "BPXPLS"]

    # LabData_*
    "LabData_fast_glucose": ["SEQN", "WTSAFPRP", "LBDGLUSI"],
    "LabData_glycohemog": ["SEQN", "LBXGH"],
    "LabData_insulin": ["SEQN", "LBXIN", "LBDINSI"],
    "LabData_Cholest_total": ["SEQN", "LBXTC", "LBDTCSI"],
    "LabData_Cholest_HDL": ["SEQN", "LBDHDD", "LBDHDDSI"],
    "LabData_Cholest_LDL": [
        "SEQN", "LBXTR", "LBDTRSI", "LBDLDL", "LBDLDLM", "LBDLDLN",
    ],
    "LabData_C_protein": ["SEQN", "LBXHSCRP"],
    "LabData_Feretin": ["SEQN", "LBXFER", "LBDFERSI"],

    # SurveyData_*
    "SurveyData_Alcohol": ["SEQN", "ALQ111", "ALQ121", "ALQ130", "ALQ142"],
    "SurveyData_Pressure_Cholest": [
        "SEQN", "BPQ020", "BPQ040A", "BPQ050A", "BPQ080", "BPQ090D",
    ],
    "SurveyData_Diabetes": ["SEQN", "DIQ160", "DID250"],
    "SurveyData_Diet_Behavior": [
        "SEQN", "DBD900", "DBD905", "DBD910", "DBQ197", "DBD895",
    ],
    "SurveyData_Insurance": ["SEQN", "HIQ011"],
    "SurveyData_Income": ["SEQN", "INDFMMPI", "INDFMMPC"],
    "SurveyData_Medical": [
        "SEQN", "MCQ080", "MCQ160B", "MCQ160C", "MCQ160M",
        "MCQ300C", "MCQ366B", "MCQ550",
    ],
    "SurveyData_MentalHealth": [
        "SEQN", "DPQ020", "DPQ030", "DPQ040", "DPQ060", "DPQ070",
    ],
    "SurveyData_PhysicalActivity": [
        "SEQN", "PAQ605", "PAQ620", "PAQ635", "PAQ650", "PAQ665", "PAD680",
    ],
    "SurveyData_SleepDisorders": ["SEQN", "SLD012", "SLD013", "SLQ120"],
    "SurveyData_Smoking": ["SEQN", "SMQ020", "SMD650"],
    "SurveyData_WeightHistory": ["SEQN", "WHD110", "WHD120", "WHD140"],
}

# Load data for each period into {period label: {dataset key: data}}.
# NOTE: load_data is not passed the period here; its period-specific branches
# read the module-level name `period`, so this loop variable must keep that
# exact name for those branches to see the current period.
dataframes_by_period = {}
for period, base_path in base_paths.items():
    print(f"Loading data for the period: {period}")
    dataframes_by_period[period] = load_data(base_path, file_paths, columns_to_keep)

Loading data for the period: 2017-2020
The file 'body_measures' has been successfully loaded.
The file 'demographics' has been successfully loaded.
The file 'blood_pressure' has been successfully loaded.
The file 'LabData_fast_glucose' has been successfully loaded.
The file 'LabData_glycohemog' has been successfully loaded.
The file 'LabData_insulin' has been successfully loaded.
The file 'LabData_Cholest_total' has been successfully loaded.
The file 'LabData_Cholest_HDL' has been successfully loaded.
The file 'LabData_Cholest_LDL' has been successfully loaded.
The file 'LabData_C_protein' has been successfully loaded.
The file 'LabData_Feretin' has been successfully loaded.
The file 'SurveyData_Alcohol' has been successfully loaded.
The file 'SurveyData_Pressure_Cholest' has been successfully loaded.
The file 'SurveyData_Diabetes' has been successfully loaded.
The file 'SurveyData_Diet_Behavior' has been successfully loaded.
The file 'SurveyData_Insurance' has been successfully loaded

In [3]:
from functools import reduce
import pandas as pd

# Merged data for each period, keyed by period label
merged_data_by_period = {}

# Merge the datasets within each period
for period, dfs in dataframes_by_period.items():
    print(f"Combining data for the period: {period}")

    # Only frames that contain the 'SEQN' key column can take part in the merge
    dataframes_to_merge = [df for df in dfs.values() if 'SEQN' in df.columns]

    # A period for which nothing was loaded gets no entry in the result
    if not dataframes_to_merge:
        print(f"No dataframes to merge for the period {period}")
        continue

    # Fold all frames of the current period into one, outer-joining on 'SEQN'
    # so that a row present in any dataset is kept
    merged_data_by_period[period] = reduce(
        lambda left, right: pd.merge(left, right, on="SEQN", how="outer"),
        dataframes_to_merge
    )

# Example: inspect the merged DataFrame for the period '2017-2020'
print(merged_data_by_period['2017-2020'].shape)

# The preview is the cell's final expression so the notebook renders it;
# placed before the print, its value was evaluated and discarded.
merged_data_by_period['2017-2020'].head()

Combining data for the period: 2017-2020
Combining data for the period: 2015-2016
No dataframes to merge for the period 2015-2016
(15560, 90)
