## Formatting and Labelling GDSC2 for Olaparib and Talazoparib

In [4]:
import pandas as pd
import os
import numpy as np
from scipy import stats
import math

In [5]:
def load(file_path):
    """Read a CSV file into a DataFrame, using its first column as the index.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(file_path, index_col=0)

### Olaparib (GDSC2)

In [6]:
# Load the Olaparib IC50 sheet; the first CSV column becomes the index
ola_IC50_df = load('OLAPARIB_IC50.csv')

# z-score of each IC50 value relative to all rows in the sheet
ola_IC50_df['z_score'] = stats.zscore(ola_IC50_df['IC50'])

# natural log (ln) of IC50; math.log raises ValueError on non-positive values
ola_IC50_df['ln_values'] = ola_IC50_df['IC50'].apply(math.log)

# Cut positions for a tertile split: the first ~33% and the last ~33% of the
# ranked rows. The variable names still say "25_percent" so that any later
# cell reading them keeps working, but the fractions used are 0.33 and 0.67.
total_rows = len(ola_IC50_df)
top_25_percent_index = int(total_rows * 0.33)
bottom_25_percent_index = int(total_rows * 0.67)

# Row positions ordered from highest to lowest IC50. The stable sort keeps
# tied rows in file order, so a CSV that is already sorted by IC50 descending
# gets exactly the labels it would get from a purely positional split.
# NaN IC50 values sort last.
# NOTE(review): assumes a higher IC50 means "more resistant" and that the
# labels should follow IC50 rather than raw file order -- confirm.
ola_rank_order = np.argsort(-ola_IC50_df['IC50'].to_numpy(), kind='stable')

# Everything is 'intermediate' unless it falls in the top or bottom tertile.
# Labels are assigned by position, so duplicate index names cannot leak a
# label onto rows outside the intended slice.
ola_labels = np.full(total_rows, 'intermediate', dtype=object)
ola_labels[ola_rank_order[:top_25_percent_index]] = 'more resistant'
ola_labels[ola_rank_order[bottom_25_percent_index:]] = 'less resistant'
ola_IC50_df['resistance_label'] = ola_labels

# Formatting to align with other datasets: drop hyphens and upper-case every
# "s" in the index names. regex=False makes the literal replacement explicit
# instead of relying on the pandas-version-dependent default.
ola_IC50_df.index = ola_IC50_df.index.str.replace("-", "", regex=False)
ola_IC50_df.index = ola_IC50_df.index.str.replace("s", "S", regex=False)

### Talazoparib (GDSC2)

In [7]:
# Load the Talazoparib IC50 sheet; the first CSV column becomes the index
tala_IC50_df = load('TALAZOPARIB_IC50.csv')

# z-score of each IC50 value relative to all rows in the sheet
tala_IC50_df['z_score'] = stats.zscore(tala_IC50_df['IC50'])

# natural log (ln) of IC50; math.log raises ValueError on non-positive values
tala_IC50_df['ln_values'] = tala_IC50_df['IC50'].apply(math.log)

# Cut positions for a tertile split: the first ~33% and the last ~33% of the
# ranked rows. The variable names still say "25_percent" so that any later
# cell reading them keeps working, but the fractions used are 0.33 and 0.67.
total_rows = len(tala_IC50_df)
top_25_percent_index = int(total_rows * 0.33)
bottom_25_percent_index = int(total_rows * 0.67)

# Row positions ordered from highest to lowest IC50. The stable sort keeps
# tied rows in file order, so a CSV that is already sorted by IC50 descending
# gets exactly the labels it would get from a purely positional split.
# NaN IC50 values sort last.
# NOTE(review): assumes a higher IC50 means "more resistant" and that the
# labels should follow IC50 rather than raw file order -- confirm.
tala_rank_order = np.argsort(-tala_IC50_df['IC50'].to_numpy(), kind='stable')

# Everything is 'intermediate' unless it falls in the top or bottom tertile.
# Labels are assigned by position, so duplicate index names cannot leak a
# label onto rows outside the intended slice.
tala_labels = np.full(total_rows, 'intermediate', dtype=object)
tala_labels[tala_rank_order[:top_25_percent_index]] = 'more resistant'
tala_labels[tala_rank_order[bottom_25_percent_index:]] = 'less resistant'
tala_IC50_df['resistance_label'] = tala_labels

# Formatting: drop hyphens and upper-case every "s" in the index names.
# regex=False makes the literal replacement explicit instead of relying on
# the pandas-version-dependent default.
tala_IC50_df.index = tala_IC50_df.index.str.replace("-", "", regex=False)
tala_IC50_df.index = tala_IC50_df.index.str.replace("s", "S", regex=False)

#### Save IC50 dataframes

In [8]:
# Write the labelled Olaparib table to CSV, keeping the index as the first column
outputfile_ola = "OLA_TERTILE_LN.csv"
ola_IC50_df.to_csv(path_or_buf=outputfile_ola, index=True)

In [9]:
# Write the labelled Talazoparib table to CSV, keeping the index as the first column
outputfile_tala = "TALA_TERTILE_LN.csv"
tala_IC50_df.to_csv(path_or_buf=outputfile_tala, index=True)