In [167]:
%matplotlib notebook
import pandas as pd
import pandasql as pds
from pandasql import sqldf
import plotly.graph_objects as go
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 50)
pd.set_option('display.min_rows', 50)
import seaborn as sns
from dateutil.relativedelta import relativedelta
import random

In [None]:
big_lot_table_8 = pd.read_csv(r"C:\Users\MichaelDiFelice\Documents\Sanofi\MM\DATA\Flatiron\Big LoT Table\Big LoT Table.csv")
sct_induction = pd.read_csv(r"C:\Users\MichaelDiFelice\Documents\Sanofi\Python\Dashboard\Data\Data Raw\SCT_PATS_WITH_MAINT.csv")

def calculate_lookback(lot_df, reference_date):
    # Convert STARTDATE to datetime format
    lot_df['STARTDATE'] = pd.to_datetime(lot_df['STARTDATE'])

    # Filter the rows based on ISFIRSTTREATMENT
    first_treatment_data = lot_df[lot_df['ISFIRSTTREATMENT'] == True]

    # Create a copy of the filtered data to avoid SettingWithCopyWarning
    first_treatment_data_copy = first_treatment_data.copy()

    # Calculate the exact difference in months between two dates
    def months_difference(start_date, end_date):
        delta = relativedelta(end_date, start_date)
        return delta.years * 12 + delta.months

    # Calculate the lookback in 12-month increments (up to 48 months)
    def lookback_periods(months):
        if months < 12:
            return "0-12 months"
        elif months < 24:
            return "12-24 months"
        elif months < 36:
            return "24-36 months"
        elif months < 48:
            return "36-48 months"
        else:
            return "more than 48 months"

    # Calculate the exact months since start for each row
    first_treatment_data_copy['exact_months_since_start'] = first_treatment_data_copy['STARTDATE'].apply(lambda x: months_difference(x, reference_date))

    # Apply the lookback_periods function to determine the lookback based on the exact month difference
    first_treatment_data_copy['exact_lookback'] = first_treatment_data_copy['exact_months_since_start'].apply(lookback_periods)

    data_final = first_treatment_data_copy[first_treatment_data['LINE_ZERO_FLAG']==0]
    
    return data_final



In [212]:
def bundle_regimen(row):
    # Line-based conditions
    if row['LINENUMBER'] == 1:
        # Transplant and regimen conditions within Line 1
        if row['TRANSPLANT_FLAG'] == 1 and row['REGIMEN'] == "DVRd":
            return "DVRd"
        elif row['TRANSPLANT_FLAG'] == 1 and row['REGIMEN'] == "DVTd":
            return "DVTd"
        elif row['TRANSPLANT_FLAG'] == 1 and row['REGIMEN'] == "VCd":
            return "VCd"
        elif row['TRANSPLANT_FLAG'] == 1 and row['REGIMEN'] in ["VRd", "VR", "Vd",]:
            return "VRd"
        elif row['TRANSPLANT_FLAG'] == 0 and row['REGIMEN'] in ['D mono','DKd','DKRd','DPd','DVd','DVMp','DVTd','Other D']:
            return "D-other"
        elif row['TRANSPLANT_FLAG'] == 0 and row['REGIMEN'] == 'DRd':
            return "DRd"
        elif row['TRANSPLANT_FLAG'] == 0 and row['REGIMEN'] == 'DVRd':
            return "DVRd"
        elif row['TRANSPLANT_FLAG'] == 0 and row['REGIMEN'] in ['Kd','Pd','Vd']:
            return 'Other doublet'
        elif row['TRANSPLANT_FLAG'] == 0 and row['REGIMEN'] in ['EPd','ERd','IRd','IsaKd','IsaPd','KPd','KRd','PCd','PVd','VCd','VMp','VTd']:
            return 'Other triplet'
        elif row['TRANSPLANT_FLAG'] == 0 and row['REGIMEN'] == 'Rd':
            return 'Rd'
        elif row['TRANSPLANT_FLAG'] == 0 and row['REGIMEN'] == 'VRd':
            return 'VRd' 

    elif row['LINENUMBER'] == 2:
        # Transplant and regimen conditions within Line 2
        if row['REGIMEN'] in ['D mono','DKRd','DVMp','DVRd','DVTd','Other D']:
            return "D-other"
        elif row['REGIMEN'] == "DKd":
            return "DKd"
        elif row['REGIMEN'] == "DPd":
            return "DPd"
        elif row['REGIMEN'] == "DRd":
            return "DRd"
        elif row['REGIMEN'] == "DVd":
            return "DVd"
        elif row['REGIMEN'] == "IsaKd":
            return "IsaKd"
        elif row['REGIMEN'] == "IsaPd":
            return "IsaPd"
        elif row['REGIMEN'] == 'Rd':
            return 'Rd'
        elif row["REGIMEN"] in ['ERd','IRd','KRd','VRd']:
            return 'R-Triplet'
    elif row['LINENUMBER'] == 3:
        # Transplant and regimen conditions within Line 2
        if row['REGIMEN'] in ['D mono','DKRd','DVMp','DVRd','DVTd','Other D']:
            return "D-other"
        elif row['REGIMEN'] == "DKd":
            return "DKd"
        elif row['REGIMEN'] == "DPd":
            return "DPd"
        elif row['REGIMEN'] == "DRd":
            return "DRd"
        elif row['REGIMEN'] == "DVd":
            return "DVd"
        elif row['REGIMEN'] == "IsaKd":
            return "IsaKd"
        elif row['REGIMEN'] == "IsaPd":
            return "IsaPd"
        elif row['REGIMEN'] == 'Rd':
            return 'Rd'
        elif row["REGIMEN"] in ['ERd','IRd','KRd','VRd']:
            return 'R-Triplet'


    # Default return value if none of the conditions are met
    return "Other"




In [203]:
def define_regimen_order(data):
    data_sorted = data.drop_duplicates(subset=['PATIENTID', 'LINENUMBER'])

    # Sort the data by PATIENTID and LINENUMBER to ensure the order
    data_sorted = data_sorted.sort_values(by=["PATIENTID", "LINENUMBER"])

    # Shift the REGIMEN column to get the next regimen for each patient and line
    data_sorted["NEXT_REGIMEN"] = data_sorted.groupby("PATIENTID")["REGIMEN"].shift(-1)
    data_sorted['PREVIOUS_REGIMEN'] = data_sorted.groupby("PATIENTID")["REGIMEN"].shift(1)
    # Fill NaN values in "NEXT_REGIMEN" with "No Advancement"
    data_sorted["NEXT_REGIMEN"].fillna("No Advancement", inplace=True)
    data_sorted["PREVIOUS_REGIMEN"].fillna("", inplace=True)


    # If NEXT_REGIMEN is 'No Advancement', set NEXT_LINENUMBER to the same as LINENUMBER
    # Otherwise, shift the LINENUMBER column to get the next line number for each patient
    data_sorted["NEXT_LINENUMBER"] = data_sorted.apply(
        lambda row: row["LINENUMBER"] if row["NEXT_REGIMEN"] == "No Advancement" else row["LINENUMBER"] + 1, axis=1
    )

    data_sorted["START_YEAR"] = data_sorted['exact_lookback']

    # Filter out rows where the next line is not consecutive (e.g., line 1 followed by line 3) or 'No Advancement'
    data_sorted = data_sorted[(data_sorted["LINENUMBER"] + 1 == data_sorted["NEXT_LINENUMBER"]) | (data_sorted["NEXT_REGIMEN"] == "No Advancement")]


    # Do the same steps above to bundled regimen
    data_sorted["NEXT_BUNDLE"] = data_sorted.groupby("PATIENTID")["BUNDLED_REGIMEN"].shift(-1)
    data_sorted['PREVIOUS_BUNDLE'] = data_sorted.groupby("PATIENTID")["BUNDLED_REGIMEN"].shift(1)
    # Fill NaN values in "NEXT_REGIMEN" with "No Advancement"
    data_sorted["NEXT_BUNDLE"].fillna("No Advancement", inplace=True)
    data_sorted["PREVIOUS_BUNDLE"].fillna("", inplace=True)

    data_sorted["NEXT_LINENUMBER"] = data_sorted.apply(
        lambda row: row["LINENUMBER"] if row["NEXT_BUNDLE"] == "No Advancement" else row["LINENUMBER"] + 1, axis=1
    )

    data_sorted["START_YEAR"] = data_sorted['exact_lookback']

    # Filter out rows where the next line is not consecutive (e.g., line 1 followed by line 3) or 'No Advancement'
    data_sorted = data_sorted[(data_sorted["LINENUMBER"] + 1 == data_sorted["NEXT_LINENUMBER"]) | (data_sorted["NEXT_BUNDLE"] == "No Advancement")]

    # For our current task, we only need certain columns
    data = data_sorted[["PATIENTID", "LINENUMBER", "REGIMEN","PREVIOUS_REGIMEN", "NEXT_REGIMEN","BUNDLED_REGIMEN",'PREVIOUS_BUNDLE','NEXT_BUNDLE', "LEN_FLAG", "CD38_FLAG", "CD38_EXPOSED_FLAG", "TRANSPLANT_FLAG", "START_YEAR","LEN_REFRACTORY_FLAG","NEXT_LINENUMBER","newline5"]]

    data = data[data['LINENUMBER'].isin([1, 2, 3, 4])]

    # Concatenate LINENUMBER to REGIMEN and NEXT_REGIMEN columns
    data['BUNDLED_REGIMEN'] = data['BUNDLED_REGIMEN'].astype(str) + data['LINENUMBER'].astype(str)
    data['NEXT_BUNDLE'] = data['NEXT_BUNDLE'].astype(str) + (data['LINENUMBER'] + 1).astype(str)
    data['PREVIOUS_BUNDLE'] = data['PREVIOUS_BUNDLE'].astype(str) + (data['LINENUMBER'] - 1).astype(str)
    data['PREVIOUS_BUNDLE'] = data['PREVIOUS_BUNDLE'].str.replace('0', '')

    # Concatenate LINENUMBER to REGIMEN and NEXT_REGIMEN columns
    data['REGIMEN'] = data['REGIMEN'].astype(str) + data['LINENUMBER'].astype(str)
    data['NEXT_REGIMEN'] = data['NEXT_REGIMEN'].astype(str) + (data['LINENUMBER'] + 1).astype(str)
    data['PREVIOUS_REGIMEN'] = data['PREVIOUS_REGIMEN'].astype(str) + (data['LINENUMBER'] - 1).astype(str)
    data['PREVIOUS_REGIMEN'] = data['PREVIOUS_REGIMEN'].str.replace('0', '')

   
    # Turn the data into a cvs in the Modified Data folder    
    data.to_csv(r"C:\Users\MichaelDiFelice\Documents\Sanofi\Python\Dashboard\Data\Modified Data\modified_sankey.csv", index=False)




In [213]:
# def merge_data(new_data,old_data):

#     merge_data = old_data.merge(
#         new_data[['PatientID', 'LineNumber', 'newline5', 'consolidation', 'induction']],
#         left_on=['PATIENTID', 'LINENUMBER'],
#         right_on=['PatientID', 'LineNumber'],
#         how='left'
# )
#     return merge_data

In [214]:

data = pd.read_csv(r"C:\Users\MichaelDiFelice\Downloads\flatiron_newlot_v2.csv")

merge_data = big_lot_table_8.merge(
    data[['PatientID', 'LineNumber', 'newline5', 'consolidation', 'induction']],
    left_on=['PATIENTID', 'LINENUMBER'],
    right_on=['PatientID', 'LineNumber'],
    how='left'
)

data_final = calculate_lookback(merge_data, pd.to_datetime('2023-07-31'))

data_final['LINENUMBER'] = data_final['newline5'].astype(int)

# Apply the function to create the 'bundled_regimen' column
data_final['BUNDLED_REGIMEN'] = data_final.apply(bundle_regimen, axis=1)

define_regimen_order(data_final)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [205]:
big_lot_table_8 = pd.read_csv(r"C:\Users\MichaelDiFelice\Documents\Sanofi\MM\DATA\Flatiron\Big LoT Table\Big LoT Table.csv")

In [210]:
# Step 1: Filter Patients with Line 1 Transplant
transplant_patients = big_lot_table_8[(big_lot_table_8['COMBINEDLINE'].str.contains('Transplant', case=False)) & 
                                    (big_lot_table_8['LINENUMBER'] == 1)]

# Extracting patients' IDs
transplant_patient_ids = transplant_patients['PATIENTID'].unique()

# Step 2: Identify Induction Regimen
induction_data = big_lot_table_8[(big_lot_table_8['PATIENTID'].isin(transplant_patient_ids)) & 
                               (big_lot_table_8['ISFIRSTTREATMENT'] == 1) &
                               (big_lot_table_8['LINENUMBER'] == 1)][['PATIENTID', 'REGIMEN', 'STARTDATE', 'ENDDATE']]

# Step 3: Identify Maintenance Therapy
maintenance_data = big_lot_table_8[(big_lot_table_8['PATIENTID'].isin(transplant_patient_ids)) & 
                                 (big_lot_table_8['ISMAINTENANCETHERAPY'] == True)].sort_values('STARTDATE', ascending=False).drop_duplicates('PATIENTID')

# Step 4: Identify Line 2 Regimen with updated criteria
line2_data = big_lot_table_8[(big_lot_table_8['PATIENTID'].isin(transplant_patient_ids)) & 
                           (big_lot_table_8['LINENUMBER'] == 2) & 
                           (big_lot_table_8['ISFIRSTTREATMENT'] == 1)][['PATIENTID', 'REGIMEN', 'STARTDATE', 'ENDDATE']]

# Merge the data for a complete view
merged_data = induction_data.merge(maintenance_data[['PATIENTID', 'REGIMEN', 'STARTDATE', 'ENDDATE']], on='PATIENTID', how='left', suffixes=('_induction', '_maintenance')).merge(line2_data, on='PATIENTID', how='left', suffixes=('', '_line2'))

# Filter the dataset to the specified date range
filtered_data = merged_data[(merged_data['STARTDATE'] >= '2022-07-31') & (merged_data['STARTDATE'] <= '2023-07-31')]

# Extracting the CD38_FLAG for the patients in our filtered dataset
cd38_flags = big_lot_table_8[big_lot_table_8['PATIENTID'].isin(filtered_data['PATIENTID']) & 
                           (big_lot_table_8['LINENUMBER'] == 2)][['PATIENTID', 'CD38_FLAG']]

# Merging the CD38_FLAG with our filtered dataset
filtered_data_with_cd38 = filtered_data.merge(cd38_flags, on='PATIENTID', how='left')

# Filtering the dataset to include only patients with a CD38 flag in Line 2
cd38_filtered_data_corrected = filtered_data_with_cd38[filtered_data_with_cd38['CD38_FLAG'] == 1]

# Filling NA values in the 'REGIMEN_maintenance' column with 'No Maintenance'
cd38_filtered_data_corrected['REGIMEN_maintenance'].fillna('No Maintenance', inplace=True)

# Drop duplicate rows based on PATIENTID
cd38_filtered_data= cd38_filtered_data_corrected.drop_duplicates(subset=['PATIENTID'])






A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [215]:
import plotly.graph_objects as go
import random

# Helper function to add identifier to regimen names
def add_identifier(regimen, identifier):
    return regimen + "_" + identifier

# Grouping data to get counts for each flow
induction_counts = cd38_filtered_data['REGIMEN_induction'].value_counts()
maintenance_counts = cd38_filtered_data['REGIMEN_maintenance'].value_counts()
line2_counts = cd38_filtered_data['REGIMEN'].value_counts()

# Counting unique patients for each combination of Induction, Maintenance, and Line 2 regimens
combination_counts = cd38_filtered_data.groupby(['REGIMEN_induction', 'REGIMEN_maintenance', 'REGIMEN']).size().reset_index(name='count')

# Initializing the lists for the Sankey diagram
source = []
target = []
value = []
label = []

# Populating the lists for Induction to Maintenance connections
for index, row in combination_counts.iterrows():
    # Induction
    if add_identifier(row['REGIMEN_induction'], "I") not in label:
        label.append(add_identifier(row['REGIMEN_induction'], "I"))
    source.append(label.index(add_identifier(row['REGIMEN_induction'], "I")))
    
    # Maintenance
    if add_identifier(row['REGIMEN_maintenance'], "M") not in label:
        label.append(add_identifier(row['REGIMEN_maintenance'], "M"))
    target.append(label.index(add_identifier(row['REGIMEN_maintenance'], "M")))
    
    # Value
    value.append(row['count'])

# Populating the lists for Maintenance to Line 2 connections
for index, row in combination_counts.iterrows():
    # Maintenance
    if add_identifier(row['REGIMEN_maintenance'], "M") not in label:
        label.append(add_identifier(row['REGIMEN_maintenance'], "M"))
    source.append(label.index(add_identifier(row['REGIMEN_maintenance'], "M")))
    
    # Line 2
    if add_identifier(row['REGIMEN'], "2") not in label:
        label.append(add_identifier(row['REGIMEN'], "2"))
    target.append(label.index(add_identifier(row['REGIMEN'], "2")))
    
    # Value
    value.append(row['count'])

# Removing the identifiers for node labels
label_clean = [l.split("_")[0] for l in label]

# Generate a random color
def generate_random_color():
    return 'rgba({}, {}, {}, 1)'.format(random.randint(0, 255), 
                                        random.randint(0, 255), 
                                        random.randint(0, 255))

# Create a color map with random colors for each unique node label
color_map = {label: generate_random_color() for label in label_clean}

# Determine the colors for nodes based on the color_map
node_colors = [color_map[label] for label in label_clean]

# Use a loop to determine the color of each link based on its source node
link_colors = [node_colors[src] for src in source]

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=label_clean,
        color=node_colors
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        color=link_colors
    )
))

fig.update_layout()
fig