In [24]:
%matplotlib notebook
import pandas as pd
import pandasql as pds
from pandasql import sqldf
import plotly.graph_objects as go
import matplotlib.pyplot as plt
# Display settings: show at most 50 rows of a DataFrame, and keep showing 50 rows
# (instead of pandas' shorter default) when a longer frame gets truncated.
pd.set_option('display.max_rows', 50)
pd.set_option('display.min_rows', 50)
import seaborn as sns
from dateutil.relativedelta import relativedelta

In [26]:

# Location of the line-of-therapy extract (absolute path on the analyst's machine)
big_lot_table_path = r"C:\Users\MichaelDiFelice\Documents\Sanofi\MM\DATA\Flatiron\Big LoT Table\Big LoT Table.csv"

# Date that treatment start dates are measured against further down in this cell
reference_date = pd.Timestamp('2023-07-31')

# Read the extract and turn the STARTDATE strings into datetimes
big_lot_table_8 = pd.read_csv(big_lot_table_path)
big_lot_table_8['STARTDATE'] = pd.to_datetime(big_lot_table_8['STARTDATE'])

# Keep only the rows whose ISFIRSTTREATMENT value equals True
is_first_treatment = big_lot_table_8['ISFIRSTTREATMENT'].eq(True)
first_treatment_data = big_lot_table_8.loc[is_first_treatment]

# Whole months elapsed between two dates
def months_difference(start_date, end_date):
    """Return the number of whole months from start_date to end_date.

    relativedelta breaks the gap into years, months and smaller units; only the
    years and months components are combined here, so leftover days are ignored.
    """
    gap = relativedelta(end_date, start_date)
    return 12 * gap.years + gap.months

# Bucket a month count into 12-month lookback bands (48 months and above share one band)
def lookback_periods(months):
    """Return the label of the 12-month lookback band that `months` falls into.

    Each band is half-open: a value equal to a band's upper bound belongs to the
    next band. Anything not below 48 is labelled "more than 48 months".
    """
    bands = (
        (12, "0-12 months"),
        (24, "12-24 months"),
        (36, "24-36 months"),
        (48, "36-48 months"),
    )
    for upper_bound, label in bands:
        if months < upper_bound:
            return label
    return "more than 48 months"

# Work on an independent copy so that adding columns does not raise SettingWithCopyWarning
first_treatment_data_copy = first_treatment_data.copy()

# Whole months between each row's STARTDATE and the reference date
first_treatment_data_copy['exact_months_since_start'] = first_treatment_data_copy['STARTDATE'].apply(
    lambda start: months_difference(start, reference_date)
)

# Label each row with the 12-month lookback band of that month count
first_treatment_data_copy['exact_lookback'] = first_treatment_data_copy['exact_months_since_start'].apply(lookback_periods)

# Keep rows whose LINE_ZERO_FLAG is 0. The mask is built from the frame being filtered
# (not from its parent frame), so the selection no longer depends on the two frames
# sharing an identical index. .copy() makes data_final safe to add columns to later.
keep_row = first_treatment_data_copy['LINE_ZERO_FLAG'] == 0
data_final = first_treatment_data_copy[keep_row].copy()
data_final.head()



Unnamed: 0,PATIENTID,LINENAME,LINENUMBER,LINESETTING,ISMAINTENANCETHERAPY,ENHANCEDCOHORT,STARTDATE,ENDDATE,REGIMEN,COMBINEDLINE,...,TRANSPLANT_FLAG,ENHANCED_LINENUMBER,FIRST_LEN_DATE,LEN_FLAG,LINE_ZERO_FLAG,LEN_REFRACTORY_FLAG,CD38_FLAG,LEN_REFRACTORY_BY_NAME_FLAG,exact_months_since_start,exact_lookback
0,F0004757B960D,"Bortezomib,Dexamethasone,Lenalidomide",1,\N,False,MultipleMyeloma,2018-12-31,2023-07-31,VRd,"Bortezomib,Dexamethasone,Lenalidomide",...,0,1,2018-12-31,0,0,0,0,0,55,more than 48 months
1,F000B7198BCE1,"Bortezomib,Dexamethasone,Lenalidomide",1,\N,False,MultipleMyeloma,2018-07-17,2021-10-31,VRd,"Bortezomib,Dexamethasone,Lenalidomide",...,0,1,2018-07-17,0,0,0,0,0,60,more than 48 months
2,F000B7198BCE1,"Dexamethasone,Pomalidomide",2,\N,False,MultipleMyeloma,2021-11-01,2022-06-06,Pd,"Dexamethasone,Pomalidomide",...,0,2,2018-07-17,1,0,0,0,0,20,12-24 months
3,F000C4086FB76,Dexamethasone,1,\N,False,MultipleMyeloma,2020-09-10,2020-09-17,d mono,Dexamethasone,...,0,1,\N,0,0,0,0,0,34,24-36 months
4,F000DC8ABFEB8,"Bortezomib,Dexamethasone,Lenalidomide",1,\N,False,MultipleMyeloma,2019-03-27,2021-01-25,VRd,"Bortezomib,Dexamethasone,Lenalidomide",...,0,1,2019-03-27,0,0,0,0,0,52,more than 48 months


In [14]:

# Keep one row per patient and line number (the first occurrence wins)
data_sorted = data_final.drop_duplicates(subset=['PATIENTID', 'LINENUMBER'])

# Order lines within each patient so that shift() looks at the adjacent line
data_sorted = data_sorted.sort_values(by=["PATIENTID", "LINENUMBER"])

# Regimen on the patient's following / preceding row.
# The filled result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained form is deprecated and has no effect once pandas
# Copy-on-Write is enabled.
regimen_by_patient = data_sorted.groupby("PATIENTID")["REGIMEN"]
data_sorted["NEXT_REGIMEN"] = regimen_by_patient.shift(-1).fillna("No Advancement")
data_sorted["PREVIOUS_REGIMEN"] = regimen_by_patient.shift(1).fillna("")

no_advancement = data_sorted["NEXT_REGIMEN"] == "No Advancement"

# NEXT_LINENUMBER equals LINENUMBER when there is no advancement, otherwise LINENUMBER + 1
# (vectorised; same values as the former row-wise apply)
data_sorted["NEXT_LINENUMBER"] = data_sorted["LINENUMBER"].where(
    no_advancement, data_sorted["LINENUMBER"] + 1
)

data_sorted["START_YEAR"] = data_sorted['exact_lookback']

# Drop rows whose following line is not consecutive (e.g., line 1 followed by line 3),
# keeping 'No Advancement' rows. The check uses the line number actually found on the
# patient's next row; comparing LINENUMBER + 1 with NEXT_LINENUMBER (which is itself
# LINENUMBER + 1) was always true and never removed anything.
following_line = data_sorted.groupby("PATIENTID")["LINENUMBER"].shift(-1)
is_consecutive = following_line == data_sorted["LINENUMBER"] + 1
data_sorted = data_sorted[is_consecutive | no_advancement]

# Only these columns are needed downstream; restrict to lines 1-4.
# .copy() gives an independent frame so the assignments below do not raise
# SettingWithCopyWarning.
data = data_sorted[["PATIENTID", "LINENUMBER", "REGIMEN","PREVIOUS_REGIMEN", "NEXT_REGIMEN", "LEN_FLAG", "CD38_FLAG", "CD38_EXPOSED_FLAG", "TRANSPLANT_FLAG", "START_YEAR","LEN_REFRACTORY_FLAG","NEXT_LINENUMBER"]]
data = data[data['LINENUMBER'].isin([1, 2, 3, 4])].copy()

# Suffix each regimen label with the line number it belongs to
data['REGIMEN'] = data['REGIMEN'].astype(str) + data['LINENUMBER'].astype(str)
data['NEXT_REGIMEN'] = data['NEXT_REGIMEN'].astype(str) + (data['LINENUMBER'] + 1).astype(str)

# The previous-regimen label gets LINENUMBER - 1 as suffix, except on line 1 where no
# suffix is added (there is no line 0 node). Skipping the suffix up front replaces the
# former str.replace('0', ''), which also deleted every "0" inside regimen names.
previous_line_suffix = (data['LINENUMBER'] - 1).astype(str).mask(data['LINENUMBER'] == 1, '')
data['PREVIOUS_REGIMEN'] = data['PREVIOUS_REGIMEN'].astype(str) + previous_line_suffix

# Change the name of previous_regimen and regimen to source and target
data = data.rename(columns={"PREVIOUS_REGIMEN": "Source", "REGIMEN": "Target"})

# Write the data to a CSV in the Modified Data folder
data.to_csv(r"C:\Users\MichaelDiFelice\Documents\Sanofi\Python\Dashboard\Data\Modified Data\modified_sankey.csv", index=False)