In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

In [2]:
# Folder holding the pickled prescription parts. The processing loop further
# down also writes its <stem>_combined.pkl results into this same folder.
diabetics = Path(r'C:\Users\a.bagherian\Desktop\working\Diabetics')

# Collect only the raw part files, in a stable (sorted) order. Previously
# written *_combined.pkl outputs are skipped so that re-running the notebook
# does not feed its own results back through preprocessing.
part_locations = sorted(
    p for p in diabetics.iterdir()
    if p.is_file() and not p.name.endswith('_combined.pkl')
)

In [3]:
# parts = [pd.read_pickle(location) for location in locations]

## Opening needed tables

In [None]:
# Physician lookup table; preprocessing joins its 'specialty_en' column to the
# prescriptions on the 'physician' key.
physician_path = r'C:\Users\a.bagherian\Desktop\IHIO_share_A\physician.pkl'
physician = pd.read_pickle(physician_path)
physician

In [None]:
# Member profile table; preprocessing joins its 'birthdate' and 'gender'
# columns to the prescriptions on the 'member' key.
profile_path = r'C:\Users\a.bagherian\Desktop\IHIO_share_A\profile.pkl'
profile = pd.read_pickle(profile_path)
profile

In [None]:
# Service lookup table; preprocessing reads its 'atc', 'atc_2' and 'atc_3'
# columns and joins on the 'service' key.
service_path = r'C:\Users\a.bagherian\Desktop\IHIO_share_A\service.pkl'
service = pd.read_pickle(service_path)
service

In [7]:
# Load the ab_service table (its 'calc_ddd' column is handed to preprocessing
# later). The first CSV column is used as the index.
ab_service_path = r'C:\Users\a.bagherian\Desktop\working\ab_service_final.csv'
ab_service = pd.read_csv(ab_service_path, index_col=0)

In [16]:
from statistics import mode
import scipy.stats as st
import time


class _StepTimer:
    """Print a step label on entry and the elapsed seconds on successful exit."""

    def __init__(self, label):
        self.label = label
        self._start = None

    def __enter__(self):
        print(self.label)
        # perf_counter is monotonic, so the reported duration cannot be
        # distorted by system clock adjustments.
        self._start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Report a duration only for steps that finished; never swallow errors.
        if exc_type is None:
            print(time.perf_counter() - self._start)
        return False


def preprocess_part(part_loc, physician, profile, service, ddd):
    """Load one pickled prescription part and enrich it with lookup data.

    Every step prints its label followed by the time it took, in seconds.

    Parameters
    ----------
    part_loc : pathlib.Path
        Location of the pickled prescription DataFrame. The frame must contain
        the columns 'date', 'physician', 'member', 'service', 'quantity',
        'institute', 'claimed' and 'fund'.
    physician : pandas.DataFrame
        Lookup with columns 'physician' and 'specialty_en'.
    profile : pandas.DataFrame
        Lookup with columns 'member', 'birthdate' and 'gender'. It is not
        modified; 'birthdate' is parsed on a copy for every call, so passing
        an already-datetime column keeps that step cheap.
    service : pandas.DataFrame
        Lookup with columns 'service', 'atc', 'atc_2' and 'atc_3'. It is not
        modified.
    ddd : pandas.Series
        Per-service multiplier indexed by service id. 'quantity' times this
        value is interpreted as a number of days. The index must be unique.

    Returns
    -------
    pandas.DataFrame
        The prescriptions with 'specialty_en', 'birthdate', 'gender', 'atc',
        'dm', 'ab' and 'days' added, and 'physician', 'institute', 'claimed',
        'fund' and the temporary 'age' column removed.

    Side effects
    ------------
    Writes '<part stem>_age.csv' (one age per member) using a relative path,
    i.e. into the current working directory.
    """
    print(f'__________{part_loc.stem}:')

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with _StepTimer('Reading prescription'):
        df = pd.read_pickle(part_loc)

    with _StepTimer('Parsing dates'):
        df['date'] = pd.to_datetime(df['date'])

    with _StepTimer('Merging specialty_en to df'):
        df = df.merge(
            physician[['physician', 'specialty_en']],
            how='left',
            on='physician'
        )

    with _StepTimer('Parsing birthdate and merging birthdate and gender to df'):
        # Parse on a column-subset copy so the caller's `profile` is untouched.
        member_info = profile[['member', 'birthdate', 'gender']].assign(
            birthdate=lambda frame: pd.to_datetime(frame['birthdate'])
        )
        df = df.merge(member_info, how='left', on='member')

    with _StepTimer('Calculating age'):
        # Age in years (365.2425-day years) at each prescription date.
        df['age'] = (df['date'] - df['birthdate']) / pd.Timedelta(days=365.2425)
        # One value per member: statistics.mode of that member's row ages.
        ages = df.groupby('member')['age'].apply(mode)
        # Relative path: the CSV lands in the current working directory.
        age_loc = part_loc.stem + '_age.csv'
        ages.to_csv(age_loc)

    with _StepTimer('Merging atc, dm, ab'):
        dm_atc_3 = ['A10A', 'A10B']
        ab_atc_2 = ['J01', 'J02', 'J04', 'J05']
        # Build the flags on a copy instead of adding columns to the caller's
        # `service`; to_numpy() sidesteps any index alignment.
        service_flags = service[['service', 'atc']].assign(
            dm=service['atc_3'].isin(dm_atc_3).to_numpy(),
            ab=service['atc_2'].isin(ab_atc_2).to_numpy(),
        )
        df = df.merge(service_flags, how='left', on='service')

    with _StepTimer('Adding days to df'):
        # Services missing from `ddd` map to NaN and therefore become NaT.
        df['days'] = pd.to_timedelta(
            df['quantity'] * df['service'].map(ddd),
            unit='day'
        )

    with _StepTimer('Dropping unwanted columns'):
        unwanted = ['physician', 'institute', 'claimed', 'fund', 'age']
        df = df.drop(columns=unwanted)

    return df

In [9]:
# df = preprocess_part(part_locations[0], physician, profile, service,
#                      ab_service['calc_ddd'])
# df

In [17]:
# Run every part through preprocessing and pickle each result into the
# diabetics folder as <part stem>_combined.pkl.
calc_ddd = ab_service['calc_ddd']
for part_loc in part_locations:
    df = preprocess_part(part_loc, physician, profile, service, calc_ddd)
    combined_loc = diabetics / f'{part_loc.stem}_combined.pkl'
    df.to_pickle(combined_loc)

__________part1_dm:
Reading prescription
1.9971132278442383
Parsing dates
2.3581340312957764
Merging specialty_en to df
5.995346307754517
Parsing birthdate and merging birthdate and gender to df
84.49186682701111
Calculating age
28.4396390914917
Merging atc, dm, ab
7.613435506820679
Adding days to df
3.088181734085083
Dropping unwanted columns
1.295090675354004
__________part2_dm:
Reading prescription
3.9433648586273193
Parsing dates
3.0051944255828857
Merging specialty_en to df
6.884374141693115
Parsing birthdate and merging birthdate and gender to df
88.84211993217468
Calculating age
29.286685705184937
Merging atc, dm, ab
8.990514993667603
Adding days to df
3.6852123737335205
Dropping unwanted columns
1.4720840454101562
__________part3_dm:
Reading prescription
1.0593352317810059
Parsing dates
0.8030478954315186
Merging specialty_en to df
2.120116710662842
Parsing birthdate and merging birthdate and gender to df
84.4288604259491
Calculating age
9.809564113616943
Merging atc, dm, ab
2.