In [None]:
%matplotlib notebook
import pandas as pd
import pandasql as pds
from pandasql import sqldf
import plotly.graph_objects as go
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 50)
pd.set_option('display.min_rows', 50)
import seaborn as sns
from dateutil.relativedelta import relativedelta

In [None]:
# Load the two source extracts that the cells below work from.
big_lot_table_8 = pd.read_csv(
    filepath_or_buffer=r"C:\Users\MichaelDiFelice\Documents\Sanofi\MM\DATA\Flatiron\Big LoT Table\Big LoT Table.csv",
)
sct_induction = pd.read_csv(
    filepath_or_buffer=r"C:\Users\MichaelDiFelice\Documents\Sanofi\Python\Dashboard\Data\Data Raw\SCT_PATS_WITH_MAINT.csv",
)

def calculate_lookback(lot_df, reference_date):
    """Label first-treatment rows with how long before ``reference_date`` they started.

    Rows are kept when ``ISFIRSTTREATMENT`` is True and ``LINE_ZERO_FLAG`` is 0.
    Two columns are added to the kept rows:

    * ``exact_months_since_start`` - ``relativedelta(reference_date, STARTDATE)``
      collapsed to ``years * 12 + months``.
    * ``exact_lookback`` - the 12-month bucket label for that month count.

    Parameters
    ----------
    lot_df : pandas.DataFrame
        Must contain ``STARTDATE``, ``ISFIRSTTREATMENT`` and ``LINE_ZERO_FLAG``.
        The frame is not modified.
    reference_date : datetime-like
        End point of the month calculation.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the kept rows (``STARTDATE`` parsed to datetime)
        with the two extra columns, so callers can add columns to it without
        triggering SettingWithCopyWarning.
    """
    # Difference between two dates expressed in months (day remainder is ignored).
    def months_difference(start_date, end_date):
        delta = relativedelta(end_date, start_date)
        return delta.years * 12 + delta.months

    # Bucket a month count into 12-month increments (up to 48 months).
    def lookback_periods(months):
        if months < 12:
            return "0-12 months"
        elif months < 24:
            return "12-24 months"
        elif months < 36:
            return "24-36 months"
        elif months < 48:
            return "36-48 months"
        else:
            return "more than 48 months"

    # Build both filters from the same frame and copy once, so the caller's
    # DataFrame is never written to and the result owns its data.
    keep = (lot_df['ISFIRSTTREATMENT'] == True) & (lot_df['LINE_ZERO_FLAG'] == 0)
    data_final = lot_df[keep].copy()

    # NOTE(review): rows with a missing STARTDATE are not handled specially here;
    # confirm the source table has none.
    data_final['STARTDATE'] = pd.to_datetime(data_final['STARTDATE'])

    data_final['exact_months_since_start'] = data_final['STARTDATE'].apply(
        lambda start: months_difference(start, reference_date)
    )
    data_final['exact_lookback'] = data_final['exact_months_since_start'].apply(lookback_periods)

    return data_final



In [None]:
def bundle_regimen(row):
    """Return the bundled regimen label for one treatment row.

    Reads ``row['LINENUMBER']``, ``row['REGIMEN']`` and, for line 1 only,
    ``row['TRANSPLANT_FLAG']``. Each applicable rule table is scanned in
    order and the first rule whose regimen list contains the row's regimen
    wins. Anything that matches no rule (including every line other than
    1 and 2) is labelled "Other".
    """
    # Line 1, TRANSPLANT_FLAG == 1
    line_one_transplant = (
        (("DVRd",), "DVRd"),
        (("DVTd",), "DVTd"),
        (("VCd",), "VCd"),
        (("VRd", "VR", "Vd"), "VRd"),
    )
    # Line 1, TRANSPLANT_FLAG == 0
    line_one_no_transplant = (
        (('D mono', 'DKd', 'DKRd', 'DPd', 'DVd', 'DVMp', 'DVTd', 'Other D'), "D-other"),
        (('DRd',), "DRd"),
        (('DVRd',), "DVRd"),
        (('Kd', 'Pd', 'Vd'), 'Other doublet'),
        (('EPd', 'ERd', 'IRd', 'IsaKd', 'IsaPd', 'KPd', 'KRd', 'PCd', 'PVd', 'VCd', 'VMp', 'VTd'), 'Other triplet'),
        (('Rd',), 'Rd'),
        (('VRd',), 'VRd'),
    )
    # Line 2 (transplant status is not consulted)
    line_two = (
        (('D mono', 'DKRd', 'DVMp', 'DVRd', 'DVTd', 'Other D'), "D-other"),
        (("DKd",), "DKd"),
        (("DPd",), "DPd"),
        (("DRd",), "DRd"),
        (("DVd",), "DVd"),
        (("IsaKd",), "IsaKd"),
        (("IsaPd",), "IsaPd"),
        (('Rd',), 'Rd'),
        (('ERd', 'IRd', 'KRd', 'VRd'), 'R-Triplet'),
    )

    # Pick the rule table that applies to this row, if any.
    line_number = row['LINENUMBER']
    if line_number == 1:
        transplant_flag = row['TRANSPLANT_FLAG']
        if transplant_flag == 1:
            rules = line_one_transplant
        elif transplant_flag == 0:
            rules = line_one_no_transplant
        else:
            rules = ()
    elif line_number == 2:
        rules = line_two
    else:
        rules = ()

    for regimens, bucket in rules:
        if row['REGIMEN'] in regimens:
            return bucket

    # Default label when no rule matched
    return "Other"




In [None]:
def define_regimen_order(data, output_path=r"C:\Users\MichaelDiFelice\Documents\Sanofi\Python\Dashboard\Data\Modified Data\modified_sankey.csv"):
    """Build the per-patient regimen transition table and write it to CSV.

    Steps:

    1. Keep one row per (``PATIENTID``, ``LINENUMBER``) and sort by those keys.
    2. For both ``REGIMEN`` and ``BUNDLED_REGIMEN``, attach the patient's next
       value (``"No Advancement"`` when there is none) and previous value
       (``""`` when there is none).
    3. Drop rows whose next recorded line is not ``LINENUMBER + 1`` (e.g.
       line 1 followed by line 3); rows with no next line are kept.
    4. Keep lines 1-4 and append the line number to every regimen label:
       the row's own line for the current label, ``LINENUMBER + 1`` for the
       next label and ``LINENUMBER - 1`` for the previous label (no suffix
       on line 1, which has no previous line).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``PATIENTID``, ``LINENUMBER``, ``REGIMEN``,
        ``BUNDLED_REGIMEN``, ``exact_lookback``, ``LEN_FLAG``, ``CD38_FLAG``,
        ``CD38_EXPOSED_FLAG``, ``TRANSPLANT_FLAG`` and ``LEN_REFRACTORY_FLAG``.
        The frame is not modified.
    output_path : str or None
        CSV destination. Pass ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        The transition table that was written.
    """
    label_columns = (
        ("REGIMEN", "NEXT_REGIMEN", "PREVIOUS_REGIMEN"),
        ("BUNDLED_REGIMEN", "NEXT_BUNDLE", "PREVIOUS_BUNDLE"),
    )

    # One row per patient line, ordered so that shift() walks the lines in sequence.
    lines = (
        data.drop_duplicates(subset=['PATIENTID', 'LINENUMBER'])
        .sort_values(by=["PATIENTID", "LINENUMBER"])
        .copy()
    )

    # Assign the filled result back instead of calling fillna(inplace=True) on a
    # column selection, which does not update the frame under Copy-on-Write.
    for current_col, next_col, previous_col in label_columns:
        per_patient = lines.groupby("PATIENTID")[current_col]
        lines[next_col] = per_patient.shift(-1).fillna("No Advancement")
        lines[previous_col] = per_patient.shift(1).fillna("")

    line_number = lines["LINENUMBER"]
    # The line number that actually follows for the same patient (NaN on the last line).
    following_line = lines.groupby("PATIENTID")["LINENUMBER"].shift(-1)
    no_advancement = lines["NEXT_BUNDLE"] == "No Advancement"

    # NEXT_LINENUMBER repeats the current line when there is no advancement,
    # otherwise it is the consecutive next line.
    lines["NEXT_LINENUMBER"] = line_number.where(no_advancement, line_number + 1)
    lines["START_YEAR"] = lines['exact_lookback']

    # Keep only consecutive transitions, plus rows with no later line.
    is_consecutive = following_line == line_number + 1
    lines = lines[is_consecutive | no_advancement]

    # For our current task, we only need certain columns
    result = lines[["PATIENTID", "LINENUMBER", "REGIMEN","PREVIOUS_REGIMEN", "NEXT_REGIMEN","BUNDLED_REGIMEN",'PREVIOUS_BUNDLE','NEXT_BUNDLE', "LEN_FLAG", "CD38_FLAG", "CD38_EXPOSED_FLAG", "TRANSPLANT_FLAG", "START_YEAR","LEN_REFRACTORY_FLAG","NEXT_LINENUMBER"]]
    # .copy() so the label edits below write to an independent frame.
    result = result[result['LINENUMBER'].isin([1, 2, 3, 4])].copy()

    current_suffix = result['LINENUMBER'].astype(str)
    next_suffix = (result['LINENUMBER'] + 1).astype(str)
    # Line 1 has no previous line, so it gets no suffix. Blanking only the
    # suffix leaves any '0' inside a regimen name untouched.
    previous_suffix = (result['LINENUMBER'] - 1).astype(str).where(result['LINENUMBER'] != 1, "")

    for current_col, next_col, previous_col in label_columns:
        result[current_col] = result[current_col].astype(str) + current_suffix
        result[next_col] = result[next_col].astype(str) + next_suffix
        result[previous_col] = result[previous_col].astype(str) + previous_suffix

    # Write the table to CSV unless the caller opted out.
    if output_path is not None:
        result.to_csv(output_path, index=False)

    return result



In [None]:
# First-treatment rows with their lookback bucket relative to the reference date.
data_final = calculate_lookback(big_lot_table_8, pd.to_datetime('2023-07-31'))

# Add the 'BUNDLED_REGIMEN' column with assign(), which returns a new frame
# rather than writing a column into the filtered frame returned above
# (that in-place write raises SettingWithCopyWarning).
data_final = data_final.assign(BUNDLED_REGIMEN=data_final.apply(bundle_regimen, axis=1))

define_regimen_order(data_final)

In [None]:
##############################   Data Manipulation for Induction Regimen   #####################################

sct_induction['BUNDLED_REGIMEN'] = sct_induction.apply(bundle_regimen, axis=1)
sct_induction = sct_induction[(sct_induction['ISMAINTENANCETHERAPY'] == True) | (sct_induction['ISFIRSTTREATMENT'] == 1)]

reference_date = pd.to_datetime('2023-07-31')

# Parse STARTDATE before sorting: sorting the raw CSV strings is only
# chronological when the dates happen to be ISO formatted.
sorted_df = (
    sct_induction
    .assign(STARTDATE=pd.to_datetime(sct_induction['STARTDATE']))
    .sort_values(by=['PATIENTID', 'STARTDATE'])
)

# For each patient, the REGIMEN of the earliest row where ISFIRSTTREATMENT is 1.
# Patients without such a row are absent from the lookup and map to a missing value.
induction_regimen = (
    sorted_df[sorted_df['ISFIRSTTREATMENT'] == 1]
    .drop_duplicates(subset='PATIENTID', keep='first')
    .set_index('PATIENTID')['REGIMEN']
)
sorted_df['INDUCTION_REGIMEN'] = sorted_df['PATIENTID'].map(induction_regimen)

# For each patient, the REGIMEN of the latest row where ISMAINTENANCETHERAPY is True.
maintenance_regimen = (
    sorted_df[sorted_df['ISMAINTENANCETHERAPY'] == True]
    .drop_duplicates(subset='PATIENTID', keep='last')
    .set_index('PATIENTID')['REGIMEN']
)
sorted_df['MAINTENANCE_REGIMEN'] = sorted_df['PATIENTID'].map(maintenance_regimen)

# For each patient, shift the REGIMEN column up by one to get the following row's REGIMEN
sorted_df['NEXT_LINE_REGIMEN'] = sorted_df.groupby('PATIENTID')['REGIMEN'].shift(-1)

# Work on a copy so the lookback columns are not added to sorted_df
first_treatment_data_copy = sorted_df.copy()

    # Calculate the exact difference in months between two dates
def months_difference(start_date, end_date):
    """Return the gap from ``start_date`` to ``end_date`` in months.

    The gap is taken from ``relativedelta(end_date, start_date)`` as
    ``years * 12 + months``; the leftover days are not counted.
    """
    elapsed = relativedelta(end_date, start_date)
    whole_years, extra_months = elapsed.years, elapsed.months
    return 12 * whole_years + extra_months

    # Calculate the lookback in 12-month increments (up to 48 months)
def lookback_periods(months):
    """Return the 12-month lookback bucket label for a month count.

    The first bucket whose upper bound is strictly greater than ``months``
    is used; anything not below 48 is labelled "more than 48 months".
    """
    buckets = (
        (12, "0-12 months"),
        (24, "12-24 months"),
        (36, "24-36 months"),
        (48, "36-48 months"),
    )
    for upper_bound, label in buckets:
        if months < upper_bound:
            return label
    return "more than 48 months"

    # Calculate the exact months since start for each row
first_treatment_data_copy['exact_months_since_start'] = first_treatment_data_copy['STARTDATE'].apply(lambda x: months_difference(x, reference_date))

# Apply the lookback_periods function to determine the lookback based on the exact month difference
first_treatment_data_copy['exact_lookback'] = first_treatment_data_copy['exact_months_since_start'].apply(lookback_periods)

# .copy() so the START_YEAR assignment below writes to an independent frame
# instead of a filtered slice (avoids SettingWithCopyWarning).
induction_df = first_treatment_data_copy[first_treatment_data_copy['LINE_ZERO_FLAG']==0].copy()

induction_df['START_YEAR'] = induction_df['exact_lookback']

# NOTE(review): "NEXT_REGIMEN" is not created anywhere in this cell (the shift
# above produces "NEXT_LINE_REGIMEN"). Presumably it is a column of the source
# CSV - confirm, otherwise this selection raises KeyError.
induction_df = induction_df[["PATIENTID", "LINENUMBER", "INDUCTION_REGIMEN","MAINTENANCE_REGIMEN", "NEXT_REGIMEN","LEN_FLAG", "CD38_FLAG", "CD38_EXPOSED_FLAG", "TRANSPLANT_FLAG", "START_YEAR","LEN_REFRACTORY_FLAG",]]

induction_df = induction_df[induction_df['LINENUMBER'].isin([1, 2, 3, 4])]

# NOTE(review): the last cell of this notebook writes a differently built frame
# to this same path, so whichever cell runs last determines the file's contents.
induction_df.to_csv(r"C:\Users\MichaelDiFelice\Documents\Sanofi\Python\Dashboard\Data\Modified Data\induction_regimen.csv", index=False)
# Display the first few rows of the dataframe with the new column
induction_df.head(20)




In [117]:
# Reload the table so this cell starts from the raw CSV contents.
big_lot_table_8 = pd.read_csv(r"C:\Users\MichaelDiFelice\Documents\Sanofi\MM\DATA\Flatiron\Big LoT Table\Big LoT Table.csv")

# Keep LINE_ZERO_FLAG == 0 rows whose COMBINEDLINE mentions 'Transplant'.
# - na=False treats a missing COMBINEDLINE as "no match" instead of producing a
#   mask with NaN, which boolean indexing rejects.
# - .copy() gives sct_df its own data so the column assignments below do not
#   write into a filtered slice.
# An additional OR-filter over ISMAINTENANCETHERAPY / ISFIRSTTREATMENT /
# LINE_ZERO_FLAG / TRANSPLANT_FLAG would be redundant: every row that passes
# LINE_ZERO_FLAG == 0 already satisfies it.
mentions_transplant = big_lot_table_8['COMBINEDLINE'].str.contains('Transplant', na=False)
sct_df = big_lot_table_8[mentions_transplant & (big_lot_table_8['LINE_ZERO_FLAG'] == 0)].copy()

# For each patient, get the REGIMEN value where ISFIRSTTREATMENT = 1
# (if a patient has several such rows, the last one in file order wins).
induction_regimen_map = sct_df[sct_df['ISFIRSTTREATMENT'] == 1].set_index('PATIENTID')['REGIMEN'].to_dict()
sct_df['INDUCTION_REGIMEN'] = sct_df['PATIENTID'].map(induction_regimen_map)

# For each patient, get the REGIMEN value where ISMAINTENANCETHERAPY = True
maintenance_regimen_map = sct_df[sct_df['ISMAINTENANCETHERAPY'] == True].set_index('PATIENTID')['REGIMEN'].to_dict()
sct_df['Maintenance_Regimen'] = sct_df['PATIENTID'].map(maintenance_regimen_map)

# (PATIENTID, LINENUMBER) -> REGIMEN of the first-treatment row of that line,
# looked up in the full table so lines outside sct_df are also available.
next_regimen_map = big_lot_table_8[big_lot_table_8['ISFIRSTTREATMENT'] == 1].set_index(['PATIENTID', 'LINENUMBER'])['REGIMEN'].to_dict()

# For every (patient, line) that contains an SCT row, the regimen of line + 1
# (None when no such line exists).
sct_rows = sct_df[sct_df['REGIMEN'] == 'SCT']
sct_next_regimen_map = {
    (patient_id, line_number): next_regimen_map.get((patient_id, line_number + 1))
    for patient_id, line_number in zip(sct_rows['PATIENTID'], sct_rows['LINENUMBER'])
}

# Apply that value to all rows of the same patient line; lines without an SCT
# row get None.
sct_df['NEXT_REGIMEN'] = [
    sct_next_regimen_map.get((patient_id, line_number))
    for patient_id, line_number in zip(sct_df['PATIENTID'], sct_df['LINENUMBER'])
]

reference_date = pd.to_datetime('2023-07-31')

sct_df['STARTDATE'] = pd.to_datetime(sct_df['STARTDATE'])

# months_difference and lookback_periods are the module-level helpers defined
# in the induction-regimen cell above; that cell must have been run first.
sct_df['exact_months_since_start'] = sct_df['STARTDATE'].apply(lambda x: months_difference(x, reference_date))
sct_df['exact_lookback'] = sct_df['exact_months_since_start'].apply(lookback_periods)

sct_df['START_YEAR'] = sct_df['exact_lookback']

# Append integers to the end of each regimen column.
# NOTE(review): astype(str) turns missing values into the text 'nan' / 'None'
# before the suffix is added (e.g. 'None3') - confirm the consumer expects that.
sct_df['INDUCTION_REGIMEN'] = sct_df['INDUCTION_REGIMEN'].astype(str) + "1"
sct_df['Maintenance_Regimen'] = sct_df['Maintenance_Regimen'].astype(str) + "2"
sct_df['NEXT_REGIMEN'] = sct_df['NEXT_REGIMEN'].astype(str) + "3"

sct_df = sct_df[["PATIENTID", "LINENUMBER", "INDUCTION_REGIMEN","Maintenance_Regimen", "NEXT_REGIMEN","LEN_FLAG", "CD38_FLAG", "CD38_EXPOSED_FLAG", "TRANSPLANT_FLAG", "START_YEAR","LEN_REFRACTORY_FLAG",]]
sct_df = sct_df.drop_duplicates(subset=['PATIENTID', 'LINENUMBER'])
# NOTE(review): the induction-regimen cell above writes to this same path with a
# 'MAINTENANCE_REGIMEN' column (different capitalisation); this write replaces it.
sct_df.to_csv(r"C:\Users\MichaelDiFelice\Documents\Sanofi\Python\Dashboard\Data\Modified Data\induction_regimen.csv", index=False)