# Take a sample of the ARMD dataset

In [3]:
# Columns shared by the ARMD tables; each entry below joins on all four of them
# or on the first three (linked_features[:3]).
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

# One entry per source CSV:
#   name     - short identifier, also used as the stem of the sampled parquet file
#   path     - CSV file name inside the input folder
#   merge_on - join-key columns for this table
#   dtype    - explicit dtypes for the non-key columns, passed to pandas when reading
files_info = [
    {
        "name": "cultures_cohort",
        "path": "microbiology_cultures_cohort.csv",
        "merge_on": linked_features, 
        "dtype": {
            "ordering_mode": "object",
            "culture_description": "object",
            "was_positive": "Int64",
            "organism": "object",
            "antibiotic": "object",
            "susceptibility": "object"
        }
    },
    {
        "name": "ward_info",
        "path": "microbiology_cultures_ward_info.csv",
        "merge_on": linked_features, 
        "dtype": {
            "hosp_ward_IP": "Int64",
            "hosp_ward_OP": "Int64",
            "hosp_ward_ER": "Int64",
            "hosp_ward_ICU": "Int64"
        }
    },
    {
        "name": "prior_med",
        "path": "microbiology_cultures_prior_med.csv",
        "merge_on": linked_features,
        "dtype": {
            "medication_name": "object",
            "medication_time_to_culturetime": "Int64",
            "medication_category": "object"
        }
    },
    {
        "name": "microbial_resistance",
        "path": "microbiology_cultures_microbial_resistance.csv",
        "merge_on": linked_features, 
        "dtype": {
            "organism": "object",
            "antibiotic": "object",
            "resistant_time_to_culturetime": "Int64"
        }
    },
    {
        "name": "cultures_demographics",
        "path": "microbiology_cultures_demographics.csv",
        # joins on the first three key columns only (no order_time_jittered_utc)
        "merge_on": linked_features[:3], 
        "dtype": {
            "age": "object",
            "gender": "object"
        }
    },
     {
        "name": "cultures_labs",
        "path": "microbiology_cultures_labs.csv",
        # joins on the first three key columns only (no order_time_jittered_utc)
        "merge_on": linked_features[:3], 
        "dtype": {
            "Period_Day": "Int64",
            "Q75_wbc": "object",
            "Q25_wbc": "object",
            "median_wbc": "object",
            "Q25_neutrophils": "float64",
            "Q75_neutrophils": "float64",
            "median_neutrophils": "float64",
            "Q25_lymphocytes": "float64",
            "Q75_lymphocytes": "float64",
            "median_lymphocytes": "float64",
            "Q25_hgb": "object",
            "Q75_hgb": "object",
            "median_hgb": "object",
            "Q25_plt": "object",
            "Q75_plt": "object",
            "median_plt": "object",
            "Q75_na": "object",
            "Q25_na": "object",
            "median_na": "object",
            "Q75_hco3": "object",
            "Q25_hco3": "object",
            "median_hco3": "object",
            "Q75_bun": "object",
            "Q25_bun": "object",
            "median_bun": "object",
            "Q75_cr": "object",
            "Q25_cr": "object",
            "median_cr": "object",
            "Q75_lactate": "object",
            "Q25_lactate": "object",
            "median_lactate": "object",
            "Q75_procalcitonin": "object",
            "Q25_procalcitonin": "object",
            "median_procalcitonin": "object",
            "first_procalcitonin": "object",
            "last_procalcitonin": "object",
            # NOTE(review): "first_lactate" is listed but there is no "last_lactate"
            # entry - confirm against the CSV header.
            "first_lactate":"object",
            "last_cr":"object",
            "first_cr":"object",
            "last_bun":"object",
            "first_bun":"object",
            "last_hco3":"object",
            "first_hco3":"object",
            "last_na":"object",
            "first_na":"object",
            "last_plt":"object",
            "first_plt":"object",
            "last_hgb":"object",
            "first_hgb":"object",
            "last_lymphocytes":"object",
            "first_lymphocytes":"object",
            "last_neutrophils":"object",
            "first_neutrophils":"object",
            "last_wbc":"object",
            "first_wbc":"object" 
        }
    },
    {
        "name": "cultures_vitals",
        "path": "microbiology_cultures_vitals.csv",
        # joins on the first three key columns only (no order_time_jittered_utc)
        "merge_on": linked_features[:3],
        "dtype": {
            "Q25_heartrate": "object",
            "Q75_heartrate": "object",
            "median_heartrate": "object",
            "Q25_resprate": "object",
            "Q75_resprate": "object",
            "median_resprate": "object",
            "Q25_temp": "object",
            "Q75_temp": "object",
            "median_temp": "object",
            "Q25_sysbp": "float64",
            "Q75_sysbp": "float64",
            "median_sysbp": "float64",
            "Q25_diasbp": "float64",
            "Q75_diasbp": "float64",
            "median_diasbp": "float64",
            "first_diasbp": "object",
            "last_diasbp": "object",
            "last_sysbp": "object",
            "first_sysbp": "object",
            "last_temp": "object",
            "first_temp": "object",
            "last_resprate": "object",
            "first_resprate": "object",
            "last_heartrate": "object",
            "first_heartrate": "object"
        }
    },
     {
        "name": "antibiotic_class_exposure",
        "path": "microbiology_cultures_antibiotic_class_exposure.csv",
        "merge_on": linked_features, 
        "dtype": {
            "medication_category": "object",
            "medication_name": "object",
            "antibiotic_class": "object",
            "time_to_culturetime": "Int64"
        }
    },
    {
        "name": "antibiotic_subtype_exposure",
        "path": "microbiology_cultures_antibiotic_subtype_exposure.csv",
        "merge_on": linked_features, 
        "dtype": {
            "medication_category": "object",
            "medication_name": "object",
            "antibiotic_subtype": "object",
            "antibiotic_subtype_category": "object",
            # NOTE(review): capital "T" in "cultureTime" differs from the other
            # *_culturetime columns - presumably mirrors the CSV header; verify.
            "medication_time_to_cultureTime": "Int64"
        }
    },
    {
        "name": "prior_infecting_organism",
        # NOTE(review): this file name uses "culture" (singular) unlike the other
        # paths - presumably the real file name; verify.
        "path": "microbiology_culture_prior_infecting_organism.csv",
        "merge_on": linked_features, 
        "dtype": {
            "prior_organism": "object",
            # NOTE(review): "culutre" looks misspelled - presumably mirrors the
            # CSV header; verify before correcting.
            "prior_infecting_organism_days_to_culutre": "Int64"
        }
    },
    {
        "name": "cultures_comorbidity",
        "path": "microbiology_cultures_comorbidity.csv",
        "merge_on": linked_features, 
        "dtype": {
            "comorbidity_component": "object",
            "comorbidity_component_start_days_culture": "Int64",
            "comorbidity_component_end_days_culture": "float64"
        }
    },
    {
        "name": "cultures_priorprocedures",
        "path": "microbiology_cultures_priorprocedures.csv",
        "merge_on": linked_features, 
        "dtype": {
            "procedure_description": "object",
            "procedure_time_to_culturetime": "Int64"
        }
    },
    {
        "name": "adi_scores",
        "path": "microbiology_cultures_adi_scores.csv",
        "merge_on": linked_features, 
        "dtype": {
            "adi_score": "object",
            "adi_state_rank": "object"
        }
    },
    # Disabled entry.
    # NOTE(review): its path repeats the adi_scores CSV - confirm the intended
    # file before re-enabling.
    # {
    #     "name": "nursing_home_visits",
    #     "path": "microbiology_cultures_adi_scores.csv",
    #     "merge_on": linked_features, 
    #     "dtype": {
    #         "nursing_home_visit_culture": "Int64"
    #     }
    # },
    {
        "name": "implied_susceptibility",
        "path": "microbiology_cultures_implied_susceptibility.csv",
        # joins on the first three key columns only (no order_time_jittered_utc)
        "merge_on": linked_features[:3], 
        "dtype": {
            "organism": "object",
            "antibiotic": "object",
            "susceptibility": "object",
            "implied_susceptibility": "object"
        }
    }
    # Disabled entry.
    # NOTE(review): its path repeats the implied_susceptibility CSV and it has no
    # merge keys - confirm before re-enabling.
    #, {
    #     "name": "implied_susceptibility_rules",
    #     "path": "microbiology_cultures_implied_susceptibility.csv",
    #     "merge_on": None , 
    #     "dtype": {
    #         "Organism": "object",
    #         "Antibiotic": "object",
    #         "Susceptibility": "object",
    #     }
    # }
]


In [9]:

import pandas as pd
import os


# Folder holding the raw ARMD CSV files, and the folder that receives the sampled parquet files.
input_folder = 'doi_10_5061_dryad_jq2bvq8kp__v20250411/'
output_file = 'new_sample_tow/'
linked_features = [
    'anon_id',
    'pat_enc_csn_id_coded',
    'order_proc_id_coded',
    'order_time_jittered_utc',
]

os.makedirs(output_file, exist_ok=True)

# Draw 100 distinct patient ids (fixed seed) from the first configured file.
pat_ids = (
    pd.read_csv(input_folder + files_info[0]['path'], usecols=['anon_id'])
    .drop_duplicates()
    .sample(n=100, random_state=42)
)

for file in files_info:
    source_path = input_folder + file['path']
    target_path = output_file + file['name'] + '.parquet'
    size_bytes = os.path.getsize(source_path)
    size_gb = size_bytes / (1024 ** 3)

    if size_gb >= 2:
        # Large file: stream it in chunks and keep only the sampled patients' rows.
        chunk_size = 1_000_000
        filtered_chunks = [
            chunk[chunk['anon_id'].isin(pat_ids['anon_id'])]
            for chunk in pd.read_csv(
                source_path,
                dtype=file['dtype'],
                na_values=['Null'],
                chunksize=chunk_size,
            )
        ]
        if filtered_chunks:
            df = pd.concat(filtered_chunks, ignore_index=True)
            df.to_parquet(target_path, engine='pyarrow', index=False)
            print(f"Sample Saved: {target_path}")
        continue

    # File below 2 GiB: load it in one go, filter, and save.
    df = pd.read_csv(source_path, dtype=file['dtype'], na_values=['Null'])
    df = df[df['anon_id'].isin(pat_ids['anon_id'])]
    df.to_parquet(target_path, engine='pyarrow', index=False)
    print(f"Sample saved: {target_path}")

Sample saved: new_sample_tow/cultures_cohort.parquet
Sample saved: new_sample_tow/ward_info.parquet
Sample saved: new_sample_tow/prior_med.parquet
Sample saved: new_sample_tow/microbial_resistance.parquet
Sample saved: new_sample_tow/cultures_demographics.parquet
Sample saved: new_sample_tow/cultures_labs.parquet
Sample saved: new_sample_tow/cultures_vitals.parquet
Sample saved: new_sample_tow/antibiotic_class_exposure.parquet
Sample saved: new_sample_tow/antibiotic_subtype_exposure.parquet
Sample saved: new_sample_tow/prior_infecting_organism.parquet
Sample Saved: new_sample_tow/cultures_comorbidity.parquet
Sample saved: new_sample_tow/cultures_priorprocedures.parquet
Sample saved: new_sample_tow/adi_scores.parquet
Sample saved: new_sample_tow/implied_susceptibility.parquet


In [10]:
import pandas as pd

# Print the first rows of every sampled parquet file as a quick sanity check.
output_file = 'new_sample_tow/'
for file in files_info:
    parquet_path = output_file + file['name'] + '.parquet'
    df = pd.read_parquet(parquet_path)
    print(f"file {parquet_path}:  ")
    print(df.head())
    print('-----------------------------------------------')

file new_sample_tow/cultures_cohort.parquet:  
     anon_id  pat_enc_csn_id_coded  order_proc_id_coded  \
0  JC1318312          131013741576            383925524   
1  JC6248701          131326546225            774526552   
2  JC1245822          131002330064            355483042   
3  JC1245822          131002330064            355483042   
4  JC1084866          131342950075            827884512   

     order_time_jittered_utc ordering_mode culture_description  was_positive  \
0  2011-06-09 14:21:00+00:00     Inpatient               BLOOD             1   
1  2022-02-26 22:31:00+00:00     Inpatient               URINE             1   
2  2009-09-07 12:15:00+00:00     Inpatient               BLOOD             1   
3  2009-09-07 12:15:00+00:00     Inpatient               BLOOD             1   
4  2022-10-08 00:30:00+00:00    Outpatient               URINE             1   

                organism    antibiotic susceptibility  
0  STAPHYLOCOCCUS AUREUS  Erythromycin    Susceptible  
1    

In [5]:
import pandas as pd
import os
import pyarrow as pa
import pyarrow.parquet as pq
import dask as d


# Input folder with the sampled parquet files and output folder for the merged result.
sample_folder = 'new_sample_tow/'
merged_folder = 'merged_sample_tow/'

os.makedirs(merged_folder, exist_ok=True)

# Work on a copy: merged_dtype grows as tables are merged, and aliasing
# files_info[0]['dtype'] would add those columns to the shared config, which makes
# a re-run of this cell fail in astype() on columns the base file does not have.
merged_dtype = dict(files_info[0]['dtype'])
print(merged_dtype)
base_sample = pd.read_parquet(sample_folder + files_info[0]['name'] + '.parquet', engine='pyarrow')
base_sample = base_sample.astype(merged_dtype)

print(f"Base file: {sample_folder + files_info[0]['name'] + '.parquet'} is loaded")

def get_df_size_mb(df):
    """Return the deep memory footprint of ``df`` in mebibytes (bytes / 1024**2)."""
    total_bytes = df.memory_usage(deep=True).sum()
    return total_bytes / (1024 ** 2)

def getDatatype(file):
    """Build the full dtype mapping (join keys + table columns) for one table.

    Looks up the entry of ``files_info`` whose ``name`` equals ``file``. The
    result starts with the join-key dtypes (four keys when the entry merges on
    four columns, otherwise the first three) and is then extended/overridden
    with the entry's own ``dtype`` mapping.

    Returns a new dict, or None when no entry has that name.
    """
    key_dtypes = (
        ('anon_id', 'object'),
        ('pat_enc_csn_id_coded', 'Int64'),
        ('order_proc_id_coded', 'Int64'),
        ('order_time_jittered_utc', 'object'),
    )
    for entry in files_info:
        if entry['name'] != file:
            continue
        # Tables merged on all four keys also carry the order timestamp.
        key_count = 4 if len(entry['merge_on']) == 4 else 3
        return {**dict(key_dtypes[:key_count]), **entry['dtype']}
    return None

def write_parquet(df1, filename, mode='w'):
    """Write a pandas DataFrame to a parquet file, replacing or extending it.

    Parameters:
        df1: DataFrame to write.
        filename: Target parquet file path.
        mode: 'w' replaces the file; 'a' re-reads the existing file, concatenates
            the new rows after it and rewrites the whole file (a missing file is
            simply created).

    Raises:
        ValueError: if ``mode`` is neither 'w' nor 'a' (previously such a call
            silently wrote nothing).
    """
    if mode not in ('w', 'a'):
        raise ValueError(f"Unsupported mode {mode!r}; expected 'w' or 'a'")

    table = pa.Table.from_pandas(df1)
    if mode == 'a':
        try:
            existing_table = pq.read_table(filename)
        except FileNotFoundError:
            # Nothing to append to yet: fall through and write the new table alone.
            pass
        else:
            table = pa.concat_tables([existing_table, table])
    pq.write_table(table, filename)


# In-memory size (MiB) above which the running merge is spilled to disk.
MAX_MEMORY_MB = 100
mode = 'w'
saved = False  # becomes True once the running merge lives on disk

for file in files_info[1:]:
    dtype = getDatatype(file['name'])
    print(f"Start merge with: {sample_folder + file['name'] + '.parquet'} ")
    df = pd.read_parquet(sample_folder + file['name'] + '.parquet' )
    df = df.astype(dtype)

    # Spill the running merge to disk once it outgrows the memory budget.
    if get_df_size_mb(base_sample) > MAX_MEMORY_MB:
        base_sample.to_parquet(merged_folder + 'merged_ARMD.parquet' , engine='pyarrow', index=False)
        saved = True
        base_sample = pd.DataFrame()

    # Build a new mapping instead of updating in place, so a dict shared with
    # files_info is never modified.
    merged_dtype = {**merged_dtype, **dtype}

    if saved:
        # On-disk path: merge one row group at a time into a temporary file.
        parquet_file = pq.ParquetFile(merged_folder + 'merged_ARMD.parquet')
        for i in range(parquet_file.num_row_groups):
            chunk = parquet_file.read_row_group(i).to_pandas()
            temp = chunk.merge(df, how = 'left', on = file['merge_on'], suffixes=('', '_right') )
            # Columns already present on the left win; drop the right-hand duplicates.
            temp = temp.drop(columns=[col for col in temp.columns if col.endswith('_right')])
            temp = temp.astype(merged_dtype)
            write_parquet(temp, merged_folder + 'temp_merged_ARMD.parquet', mode=mode)
            mode = 'a'
    else:
        base_sample = base_sample.merge(df, how = 'left', on = file['merge_on'] , suffixes=('', '_right'))
        base_sample = base_sample.drop(columns=[col for col in base_sample.columns if col.endswith('_right')])

    mode = 'w'

    if saved:
        # Copy the temporary result back so the next iteration reads the latest merge.
        parquet_file = pq.ParquetFile(merged_folder + 'temp_merged_ARMD.parquet')
        for i in range(parquet_file.num_row_groups):
            chunk = parquet_file.read_row_group(i).to_pandas()
            chunk = chunk.astype(merged_dtype)
            write_parquet(chunk, merged_folder + 'merged_ARMD.parquet', mode=mode)
            mode = 'a'

# If the merge never exceeded the memory budget it only exists in memory:
# persist it so the cell always leaves merged_ARMD.parquet behind.
if not saved:
    base_sample.to_parquet(merged_folder + 'merged_ARMD.parquet', engine='pyarrow', index=False)

            

{'ordering_mode': 'object', 'culture_description': 'object', 'was_positive': 'Int64', 'organism': 'object', 'antibiotic': 'object', 'susceptibility': 'object'}
Base file: new_sample_tow/cultures_cohort.parquet is loaded
Start merge with: new_sample_tow/ward_info.parquet 
Start merge with: new_sample_tow/prior_med.parquet 
Start merge with: new_sample_tow/microbial_resistance.parquet 
Start merge with: new_sample_tow/cultures_demographics.parquet 
Start merge with: new_sample_tow/cultures_labs.parquet 
Start merge with: new_sample_tow/cultures_vitals.parquet 
Start merge with: new_sample_tow/antibiotic_class_exposure.parquet 
Start merge with: new_sample_tow/antibiotic_subtype_exposure.parquet 
Start merge with: new_sample_tow/prior_infecting_organism.parquet 
Start merge with: new_sample_tow/cultures_comorbidity.parquet 
Start merge with: new_sample_tow/cultures_priorprocedures.parquet 
Start merge with: new_sample_tow/adi_scores.parquet 
Start merge with: new_sample_tow/implied_suscep

In [5]:
import dask.dataframe as dd
import os

# Sampled parquet files are read from sample_folder; the merged output goes to merged_folder.
sample_folder, merged_folder = 'new_sample_tow/', 'merged_sample_tow/'
os.makedirs(merged_folder, exist_ok=True)

def getMergOn(file):
    """Return the join-key dtype mapping for the table named ``file``.

    The mapping holds the three id columns, plus ``order_time_jittered_utc``
    when the matching ``files_info`` entry merges on four columns. Returns
    None when no entry has that name.
    """
    entry = next((info for info in files_info if info['name'] == file), None)
    if entry is None:
        return None

    key_dtypes = {
        'anon_id': 'object',
        'pat_enc_csn_id_coded': 'Int64',
        'order_proc_id_coded': 'Int64',
    }
    if len(entry['merge_on']) == 4:
        key_dtypes['order_time_jittered_utc'] = 'object'
    return key_dtypes

# Lazily open every sampled parquet file as a Dask dataframe, in files_info order.
# NOTE(review): `dtype` is not a documented dd.read_parquet parameter; it is
# presumably passed through as an engine option - confirm it has any effect.
file_df_pairs = [(file['name'], dd.read_parquet(
    os.path.join(sample_folder, file['name'] + '.parquet'),
    dtype=file['dtype']
)) for file in files_info]

# Start with base file
merged_df = file_df_pairs[0][1]
print(f"Load base file: {file_df_pairs[0][0]}")

# Merge sequentially
for file_path, df in file_df_pairs[1:]:
    print(f"Merging file: {file_path}")
    merg_on = getMergOn(file_path)
    # getMergOn returns a {column: dtype} mapping; `on` expects the column
    # labels, so pass the keys as an explicit list.
    merged_df = merged_df.merge(df, how='left', on=list(merg_on))


# Write to parquet
output_path = os.path.join(merged_folder, 'merged_result.parquet')
merged_df = merged_df.repartition(partition_size="50MB")  # or npartitions=500
# merged_df = merged_df.persist()  # only if RAM allows
print(f"\nWriting merged data to: {output_path}")
write_task = merged_df.to_parquet(
    output_path,
    engine='pyarrow',
    compression='snappy',
    write_index=False,
    write_metadata_file=False,
    compute=False
)
# compute=False only builds the lazy write graph; execute it so the parquet
# files exist before success is reported.
write_task.compute()
print(f"\n Alhamdulillah the {output_path} merged and saved ✅")
# (author's note, translated: hoping this finally works)


Load base file: cultures_cohort
Merging file: ward_info
Merging file: prior_med
Merging file: microbial_resistance
Merging file: cultures_demographics
Merging file: cultures_labs
Merging file: cultures_vitals
Merging file: antibiotic_class_exposure
Merging file: antibiotic_subtype_exposure
Merging file: prior_infecting_organism
Merging file: cultures_comorbidity
Merging file: cultures_priorprocedures
Merging file: adi_scores
Merging file: nursing_home_visits
Merging file: implied_susceptibility

Writing merged data to: merged_sample_tow/merged_result.parquet

 Alhamdulillah the merged_sample_tow/merged_result.parquet merged and saved ✅


+--------------------------------------------------------+------------+-------------+
| Merge columns                                          | left dtype | right dtype |
+--------------------------------------------------------+------------+-------------+
| ('anon_id', 'anon_id')                                 | string     | object      |
| ('order_time_jittered_utc', 'order_time_jittered_utc') | string     | object      |
+--------------------------------------------------------+------------+-------------+
Cast dtypes explicitly to avoid unexpected results.
+------------------------+------------+-------------+
| Merge columns          | left dtype | right dtype |
+------------------------+------------+-------------+
| ('anon_id', 'anon_id') | object     | string      |
+------------------------+------------+-------------+
Cast dtypes explicitly to avoid unexpected results.
+------------------------+------------+-------------+
| Merge columns          | left dtype | right dtype |
+-

# Left join using Dask: file by file

In [None]:
import glob
import os
# List the sampled parquet files. glob returns paths in arbitrary order, so sort
# them to get the same listing on every run and machine.
sample_folder = 'new_sample_one/'
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))
for file in sample_files:
    print(os.path.basename(file))

In [None]:
import dask.dataframe as dd
import glob
import os 

sample_folder = 'new_sample_one/'
merged_folder = 'merged_sample_one/'
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

os.makedirs(merged_folder, exist_ok=True)

# glob returns paths in arbitrary order; sort so index N always names the same file.
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))

print(f'left join, left:{os.path.basename(sample_files[0])} ,right: {os.path.basename(sample_files[1])} ')

df1 = dd.read_parquet(sample_files[0])
df2 = dd.read_parquet(sample_files[1])

result = df1.merge(df2, how='left', on=linked_features)
# overwrite=True clears part files left in merged_folder by an earlier run, so
# the folder holds exactly this merge.
result.to_parquet(merged_folder, engine='pyarrow', write_index=False, overwrite=True)
print(len(result.columns))

In [None]:
import dask.dataframe as dd
import glob
import os 

import shutil  # local import: needed for the staged folder swap below

sample_folder = 'new_sample_one/'
merged_folder = 'merged_sample_one/'
# Defined here so the cell does not depend on another cell having been run.
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

os.makedirs(merged_folder, exist_ok=True)

# glob returns paths in arbitrary order; sort so index N always names the same file.
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))
# Only used for the log line below.
merged_sample_file = os.path.basename(sorted(glob.glob(merged_folder + "*.parquet"))[0])

print(f'left join, left: {merged_sample_file} ,right: {os.path.basename(sample_files[2])} ')

# Read every part file in merged_folder, not just the first one.
df1 = dd.read_parquet(merged_folder)
df2 = dd.read_parquet(sample_files[2])

result = df1.merge(df2, how='left', on=linked_features)

# Write to a staging folder and swap it in afterwards: writing straight into
# merged_folder would overwrite part files the lazy read above still needs.
merged_root = merged_folder.rstrip('/')
staging_folder = merged_root + '_staging'
result.to_parquet(staging_folder, engine='pyarrow', write_index=False, overwrite=True)
shutil.rmtree(merged_root)
os.rename(staging_folder, merged_root)
print(len(result.columns))


In [None]:
import dask.dataframe as dd
import glob
import os 

import shutil  # local import: needed for the staged folder swap below

sample_folder = 'new_sample_one/'
merged_folder = 'merged_sample_one/'
# Defined here so the cell does not depend on another cell having been run.
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

os.makedirs(merged_folder, exist_ok=True)

# glob returns paths in arbitrary order; sort so index N always names the same file.
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))
# Only used for the log line below (the earlier hard-coded name was always
# overwritten by this lookup, so it has been removed).
merged_sample_file = os.path.basename(sorted(glob.glob(merged_folder + "*.parquet"))[0])

print(f'left join, left:{merged_sample_file} ,right: {os.path.basename(sample_files[3])} ')

# Read every part file in merged_folder, not just the first one.
df1 = dd.read_parquet(merged_folder)
df2 = dd.read_parquet(sample_files[3])

result = df1.merge(df2, how='left', on=linked_features)

# Write to a staging folder and swap it in afterwards: writing straight into
# merged_folder would overwrite part files the lazy read above still needs.
merged_root = merged_folder.rstrip('/')
staging_folder = merged_root + '_staging'
result.to_parquet(staging_folder, engine='pyarrow', write_index=False, overwrite=True)
shutil.rmtree(merged_root)
os.rename(staging_folder, merged_root)
print(len(result.columns))


In [None]:
import dask.dataframe as dd
import glob
import os 

import shutil  # local import: needed for the staged folder swap below

sample_folder = 'new_sample_one/'
merged_folder = 'merged_sample_one/'
# Defined here so the cell does not depend on another cell having been run.
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

os.makedirs(merged_folder, exist_ok=True)

# glob returns paths in arbitrary order; sort so index N always names the same file.
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))
# Only used for the log line below.
merged_sample_file = os.path.basename(sorted(glob.glob(merged_folder + "*.parquet"))[0])

print(f'left join, left:{merged_sample_file} ,right: {os.path.basename(sample_files[4])} ')


# Read every part file in merged_folder, not just the first one, then spread
# the rows over 32 partitions for the join.
df1 = dd.read_parquet(merged_folder).repartition(npartitions=32)
df2 = dd.read_parquet(sample_files[4])

result = df1.merge(df2, how='left', on=linked_features)

# Write to a staging folder and swap it in afterwards: writing straight into
# merged_folder would overwrite part files the lazy read above still needs.
merged_root = merged_folder.rstrip('/')
staging_folder = merged_root + '_staging'
result.to_parquet(staging_folder, engine='pyarrow', write_index=False, overwrite=True)
shutil.rmtree(merged_root)
os.rename(staging_folder, merged_root)
print(len(result.columns))


In [None]:
import dask.dataframe as dd
import glob
import os 

import shutil  # local import: needed for the staged folder swap below

sample_folder = 'new_sample_one/'
merged_folder = 'merged_sample_one/'
# Defined here so the cell does not depend on another cell having been run.
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

os.makedirs(merged_folder, exist_ok=True)

# glob returns paths in arbitrary order; sort so index N always names the same file.
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))
# Only used for the log line below.
merged_sample_file = os.path.basename(sorted(glob.glob(merged_folder + "*.parquet"))[0])

print(f'left join, left:{merged_sample_file} ,right: {os.path.basename(sample_files[5])} ')


# Read every part file in merged_folder, not just the first one.
df1 = dd.read_parquet(merged_folder)
df2 = dd.read_parquet(sample_files[5])

# This table is joined on the first three key columns only.
result = df1.merge(df2, how='left', on=linked_features[:3])

# Write to a staging folder and swap it in afterwards: writing straight into
# merged_folder would overwrite part files the lazy read above still needs.
merged_root = merged_folder.rstrip('/')
staging_folder = merged_root + '_staging'
result.to_parquet(staging_folder, engine='pyarrow', write_index=False, overwrite=True)
shutil.rmtree(merged_root)
os.rename(staging_folder, merged_root)
print(len(result.columns))


In [None]:
import dask.dataframe as dd
import glob
import os 

import shutil  # local import: needed for the staged folder swap below

sample_folder = 'new_sample_one/'
merged_folder = 'merged_sample_one/'
# Defined here so the cell does not depend on another cell having been run.
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

os.makedirs(merged_folder, exist_ok=True)

# glob returns paths in arbitrary order; sort so index N always names the same file.
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))
# Only used for the log line below.
merged_sample_file = os.path.basename(sorted(glob.glob(merged_folder + "*.parquet"))[0])

print(f'left join, left:{merged_sample_file} ,right: {os.path.basename(sample_files[6])} ')


# Read every part file in merged_folder, not just the first one.
df1 = dd.read_parquet(merged_folder)
df2 = dd.read_parquet(sample_files[6])

# This table is joined on the first three key columns only.
result = df1.merge(df2, how='left', on=linked_features[:3])

# Write to a staging folder and swap it in afterwards: writing straight into
# merged_folder would overwrite part files the lazy read above still needs.
merged_root = merged_folder.rstrip('/')
staging_folder = merged_root + '_staging'
result.to_parquet(staging_folder, engine='pyarrow', write_index=False, overwrite=True)
shutil.rmtree(merged_root)
os.rename(staging_folder, merged_root)
print(len(result.columns))


In [None]:
import dask.dataframe as dd
import glob
import os 

import shutil  # local import: needed for the staged folder swap below

sample_folder = 'new_sample_one/'
merged_folder = 'merged_sample_one/'
# Defined here so the cell does not depend on another cell having been run.
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

os.makedirs(merged_folder, exist_ok=True)

# glob returns paths in arbitrary order; sort so index N always names the same file.
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))
# Only used for the log line below.
merged_sample_file = os.path.basename(sorted(glob.glob(merged_folder + "*.parquet"))[0])

print(f'left join, left:{merged_sample_file} ,right: {os.path.basename(sample_files[7])} ')


# Read every part file in merged_folder, not just the first one.
df1 = dd.read_parquet(merged_folder)
df2 = dd.read_parquet(sample_files[7])

result = df1.merge(df2, how='left', on=linked_features)

# Write to a staging folder and swap it in afterwards: writing straight into
# merged_folder would overwrite part files the lazy read above still needs.
merged_root = merged_folder.rstrip('/')
staging_folder = merged_root + '_staging'
result.to_parquet(staging_folder, engine='pyarrow', write_index=False, overwrite=True)
shutil.rmtree(merged_root)
os.rename(staging_folder, merged_root)
print(len(result.columns))


In [None]:
import dask.dataframe as dd
import glob
import os 

import shutil  # local import: needed for the staged folder swap below

sample_folder = 'new_sample_one/'
merged_folder = 'merged_sample_one/'
# Defined here so the cell does not depend on another cell having been run.
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

os.makedirs(merged_folder, exist_ok=True)

# glob returns paths in arbitrary order; sort so index N always names the same file.
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))
# Only used for the log line below.
merged_sample_file = os.path.basename(sorted(glob.glob(merged_folder + "*.parquet"))[0])

print(f'left join, left:{merged_sample_file} ,right: {os.path.basename(sample_files[8])} ')


# Read every part file in merged_folder, not just the first one.
df1 = dd.read_parquet(merged_folder)
df2 = dd.read_parquet(sample_files[8])

# This table is joined on the first three key columns only.
result = df1.merge(df2, how='left', on=linked_features[:3])

# Write to a staging folder and swap it in afterwards: writing straight into
# merged_folder would overwrite part files the lazy read above still needs.
merged_root = merged_folder.rstrip('/')
staging_folder = merged_root + '_staging'
result.to_parquet(staging_folder, engine='pyarrow', write_index=False, overwrite=True)
shutil.rmtree(merged_root)
os.rename(staging_folder, merged_root)
print(len(result.columns))


In [None]:
import dask.dataframe as dd
import glob
import os 

import shutil  # local import: needed for the staged folder swap below

sample_folder = 'new_sample_one/'
merged_folder = 'merged_sample_one/'
# Defined here so the cell does not depend on another cell having been run.
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

os.makedirs(merged_folder, exist_ok=True)

# glob returns paths in arbitrary order; sort so index N always names the same file.
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))
# Only used for the log line below.
merged_sample_file = os.path.basename(sorted(glob.glob(merged_folder + "*.parquet"))[0])

print(f'left join, left:{merged_sample_file} ,right: {os.path.basename(sample_files[9])} ')

# Read every part file in merged_folder, not just the first one.
df1 = dd.read_parquet(merged_folder)
df2 = dd.read_parquet(sample_files[9])

# This table is joined on the first three key columns only.
result = df1.merge(df2, how='left', on=linked_features[:3])

# Write to a staging folder and swap it in afterwards: writing straight into
# merged_folder would overwrite part files the lazy read above still needs.
merged_root = merged_folder.rstrip('/')
staging_folder = merged_root + '_staging'
result.to_parquet(staging_folder, engine='pyarrow', write_index=False, overwrite=True)
shutil.rmtree(merged_root)
os.rename(staging_folder, merged_root)
print(len(result.columns))


In [None]:
import dask.dataframe as dd
import glob
import os 

import shutil  # local import: needed for the staged folder swap below

sample_folder = 'new_sample_one/'
merged_folder = 'merged_sample_one/'
# Defined here so the cell does not depend on another cell having been run.
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

os.makedirs(merged_folder, exist_ok=True)

# glob returns paths in arbitrary order; sort so index N always names the same file.
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))
# Only used for the log line below.
merged_sample_file = os.path.basename(sorted(glob.glob(merged_folder + "*.parquet"))[0])

print(f'left join, left:{merged_sample_file} ,right: {os.path.basename(sample_files[10])} ')


# Read every part file in merged_folder, not just the first one.
df1 = dd.read_parquet(merged_folder)
df2 = dd.read_parquet(sample_files[10])

result = df1.merge(df2, how='left', on=linked_features)

# Write to a staging folder and swap it in afterwards: writing straight into
# merged_folder would overwrite part files the lazy read above still needs.
merged_root = merged_folder.rstrip('/')
staging_folder = merged_root + '_staging'
result.to_parquet(staging_folder, engine='pyarrow', write_index=False, overwrite=True)
shutil.rmtree(merged_root)
os.rename(staging_folder, merged_root)
print(len(result.columns))


In [None]:
import dask.dataframe as dd
import glob
import os 

import shutil  # local import: needed for the staged folder swap below

sample_folder = 'new_sample_one/'
merged_folder = 'merged_sample_one/'
# Defined here so the cell does not depend on another cell having been run.
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

os.makedirs(merged_folder, exist_ok=True)

# glob returns paths in arbitrary order; sort so index N always names the same file.
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))
# Only used for the log line below.
merged_sample_file = os.path.basename(sorted(glob.glob(merged_folder + "*.parquet"))[0])

print(f'left join, left:{merged_sample_file} ,right: {os.path.basename(sample_files[11])} ')


# Read every part file in merged_folder, not just the first one.
df1 = dd.read_parquet(merged_folder)
df2 = dd.read_parquet(sample_files[11])

result = df1.merge(df2, how='left', on=linked_features)

# Write to a staging folder and swap it in afterwards: writing straight into
# merged_folder would overwrite part files the lazy read above still needs.
merged_root = merged_folder.rstrip('/')
staging_folder = merged_root + '_staging'
result.to_parquet(staging_folder, engine='pyarrow', write_index=False, overwrite=True)
shutil.rmtree(merged_root)
os.rename(staging_folder, merged_root)
print(len(result.columns))


In [None]:
import dask.dataframe as dd
import glob
import os 

import shutil  # local import: needed for the staged folder swap below

sample_folder = 'new_sample_one/'
merged_folder = 'merged_sample_one/'
# Defined here so the cell does not depend on another cell having been run.
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

os.makedirs(merged_folder, exist_ok=True)

# glob returns paths in arbitrary order; sort so index N always names the same file.
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))
# Only used for the log line below.
merged_sample_file = os.path.basename(sorted(glob.glob(merged_folder + "*.parquet"))[0])

print(f'left join, left:{merged_sample_file} ,right: {os.path.basename(sample_files[12])} ')

# Read every part file in merged_folder, not just the first one.
df1 = dd.read_parquet(merged_folder)
df2 = dd.read_parquet(sample_files[12])

result = df1.merge(df2, how='left', on=linked_features)

# Write to a staging folder and swap it in afterwards: writing straight into
# merged_folder would overwrite part files the lazy read above still needs.
merged_root = merged_folder.rstrip('/')
staging_folder = merged_root + '_staging'
result.to_parquet(staging_folder, engine='pyarrow', write_index=False, overwrite=True)
shutil.rmtree(merged_root)
os.rename(staging_folder, merged_root)
print(len(result.columns))


In [None]:
import dask.dataframe as dd
import glob
import os 

import shutil  # local import: needed for the staged folder swap below

sample_folder = 'new_sample_one/'
merged_folder = 'merged_sample_one/'
# Defined here so the cell does not depend on another cell having been run.
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

os.makedirs(merged_folder, exist_ok=True)

# glob returns paths in arbitrary order; sort so index N always names the same file.
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))
# Only used for the log line below.
merged_sample_file = os.path.basename(sorted(glob.glob(merged_folder + "*.parquet"))[0])

print(f'left join, left:{merged_sample_file} ,right: {os.path.basename(sample_files[13])} ')


# Read every part file in merged_folder, not just the first one.
df1 = dd.read_parquet(merged_folder)
df2 = dd.read_parquet(sample_files[13])

result = df1.merge(df2, how='left', on=linked_features)

# Write to a staging folder and swap it in afterwards: writing straight into
# merged_folder would overwrite part files the lazy read above still needs.
merged_root = merged_folder.rstrip('/')
staging_folder = merged_root + '_staging'
result.to_parquet(staging_folder, engine='pyarrow', write_index=False, overwrite=True)
shutil.rmtree(merged_root)
os.rename(staging_folder, merged_root)
print(len(result.columns))


In [None]:
import dask.dataframe as dd
import glob
import os 

import shutil  # local import: needed for the staged folder swap below

sample_folder = 'new_sample_one/'
merged_folder = 'merged_sample_one/'
# Defined here so the cell does not depend on another cell having been run.
linked_features = ['anon_id', 'pat_enc_csn_id_coded', 'order_proc_id_coded', 'order_time_jittered_utc']

os.makedirs(merged_folder, exist_ok=True)

# glob returns paths in arbitrary order; sort so index N always names the same file.
sample_files = sorted(glob.glob(sample_folder + "*.parquet"))
# Only used for the log line below.
merged_sample_file = os.path.basename(sorted(glob.glob(merged_folder + "*.parquet"))[0])

print(f'left join, left:{merged_sample_file} ,right: {os.path.basename(sample_files[14])} ')


# Read every part file in merged_folder, not just the first one.
df1 = dd.read_parquet(merged_folder)
df2 = dd.read_parquet(sample_files[14])

result = df1.merge(df2, how='left', on=linked_features)

# Write to a staging folder and swap it in afterwards: writing straight into
# merged_folder would overwrite part files the lazy read above still needs.
merged_root = merged_folder.rstrip('/')
staging_folder = merged_root + '_staging'
result.to_parquet(staging_folder, engine='pyarrow', write_index=False, overwrite=True)
shutil.rmtree(merged_root)
os.rename(staging_folder, merged_root)
print(len(result.columns))
