In [1]:
import os
import boto3
import io
from dotenv import load_dotenv
import pandas as pd
import numpy as np

In [2]:
load_dotenv()

True

In [3]:
# Custom function to reduce memory usage of data

def reduce_mem_usage(df, verbose=True):
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## Importing data ingested from previous script that is stored on the S3 bucket

In [4]:
AWS_S3_BUCKET = os.getenv("AWS_S3_BUCKET")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

In [5]:
s3_client = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

response = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key="rawdata/tcga_raw.csv")

status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    print(f"Successful S3 get_object response. Status - {status}")
    pd_df = reduce_mem_usage(pd.read_csv(response.get("Body")))
    print(pd_df.info())
else:
    print(f"Unsuccessful S3 get_object response. Status - {status}")

Successful S3 get_object response. Status - 200
Mem. usage decreased to  5.10 Mb (2.8% reduction)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12974 entries, 0 to 12973
Data columns (total 53 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Race                    12557 non-null  object 
 1   Deceased_Status         1787 non-null   float16
 2   Age_At_Diagnosis        12322 non-null  float16
 3   Pathologic_Stage        5017 non-null   object 
 4   Primary_Diagnosis       12557 non-null  object 
 5   Prior_Malignancy        5743 non-null   object 
 6   Synchronous_Malignancy  5741 non-null   object 
 7   Disease_Type            12974 non-null  object 
 8   ID                      12974 non-null  object 
 9   Primary_Site            12973 non-null  object 
 10  Submitter_ID            12974 non-null  object 
 11  TP53                    7077 non-null   object 
 12  PIK3CA                  7077 non-null   object

In [6]:
print(pd_df.shape)
pd_df.head()

(12974, 53)


Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Pathologic_Stage,Primary_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Disease_Type,ID,Primary_Site,...,VHL,PBRM1,SETD2,BAP1,MET,KMT2C,PKHD1,ICE1,SMAD4,CDKN2A
0,black or african american,,3504.0,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,29312892-078e-4a35-809c-729f55370967,Hematopoietic and reticuloendothelial systems,...,,,,,,,,,,
1,white,,2328.0,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,2939c0a9-3c47-4019-b9e3-958e84a12bb5,Hematopoietic and reticuloendothelial systems,...,,,,,,,,,,
2,white,,6116.0,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,cecefca5-6308-49f6-b9c2-226235d60613,Hematopoietic and reticuloendothelial systems,...,,,,,,,,,,
3,white,,5856.0,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,56404ff6-a971-4d84-9891-0053b1075ee3,Hematopoietic and reticuloendothelial systems,...,,,,,,,,,,
4,white,,5128.0,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,792187f7-d5c3-497d-9573-f7411f027aa3,Hematopoietic and reticuloendothelial systems,...,,,,,,,,,,


In [7]:
pd_df.replace({np.nan: False}, inplace = True)
pd_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Pathologic_Stage,Primary_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Disease_Type,ID,Primary_Site,...,VHL,PBRM1,SETD2,BAP1,MET,KMT2C,PKHD1,ICE1,SMAD4,CDKN2A
0,black or african american,False,3504.0,False,"Acute myeloid leukemia, NOS",False,False,Myeloid Leukemias,29312892-078e-4a35-809c-729f55370967,Hematopoietic and reticuloendothelial systems,...,False,False,False,False,False,False,False,False,False,False
1,white,False,2328.0,False,"Acute myeloid leukemia, NOS",False,False,Myeloid Leukemias,2939c0a9-3c47-4019-b9e3-958e84a12bb5,Hematopoietic and reticuloendothelial systems,...,False,False,False,False,False,False,False,False,False,False
2,white,False,6116.0,False,"Acute myeloid leukemia, NOS",False,False,Myeloid Leukemias,cecefca5-6308-49f6-b9c2-226235d60613,Hematopoietic and reticuloendothelial systems,...,False,False,False,False,False,False,False,False,False,False
3,white,False,5856.0,False,"Acute myeloid leukemia, NOS",False,False,Myeloid Leukemias,56404ff6-a971-4d84-9891-0053b1075ee3,Hematopoietic and reticuloendothelial systems,...,False,False,False,False,False,False,False,False,False,False
4,white,False,5128.0,False,"Acute myeloid leukemia, NOS",False,False,Myeloid Leukemias,792187f7-d5c3-497d-9573-f7411f027aa3,Hematopoietic and reticuloendothelial systems,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Replacing True and False values in the SSMS columns with 1 and 0
pd_df.replace({False: 0, True: 1}, inplace=True)
pd_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Pathologic_Stage,Primary_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Disease_Type,ID,Primary_Site,...,VHL,PBRM1,SETD2,BAP1,MET,KMT2C,PKHD1,ICE1,SMAD4,CDKN2A
0,black or african american,0.0,3504.0,0,"Acute myeloid leukemia, NOS",0,0,Myeloid Leukemias,29312892-078e-4a35-809c-729f55370967,Hematopoietic and reticuloendothelial systems,...,0,0,0,0,0,0,0,0,0,0
1,white,0.0,2328.0,0,"Acute myeloid leukemia, NOS",0,0,Myeloid Leukemias,2939c0a9-3c47-4019-b9e3-958e84a12bb5,Hematopoietic and reticuloendothelial systems,...,0,0,0,0,0,0,0,0,0,0
2,white,0.0,6116.0,0,"Acute myeloid leukemia, NOS",0,0,Myeloid Leukemias,cecefca5-6308-49f6-b9c2-226235d60613,Hematopoietic and reticuloendothelial systems,...,0,0,0,0,0,0,0,0,0,0
3,white,0.0,5856.0,0,"Acute myeloid leukemia, NOS",0,0,Myeloid Leukemias,56404ff6-a971-4d84-9891-0053b1075ee3,Hematopoietic and reticuloendothelial systems,...,0,0,0,0,0,0,0,0,0,0
4,white,0.0,5128.0,0,"Acute myeloid leukemia, NOS",0,0,Myeloid Leukemias,792187f7-d5c3-497d-9573-f7411f027aa3,Hematopoietic and reticuloendothelial systems,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Dropping ID and Primary_Site columns
cleaning_df = pd_df.drop(['ID', 'Submitter_ID', 'Pathologic_Stage', 'Primary_Diagnosis', 'Disease_Type'], axis=1)
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,CDH1,...,VHL,PBRM1,SETD2,BAP1,MET,KMT2C,PKHD1,ICE1,SMAD4,CDKN2A
0,black or african american,0.0,3504.0,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,white,0.0,2328.0,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,white,0.0,6116.0,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,white,0.0,5856.0,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,white,0.0,5128.0,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Replacing deceased status years with 1
cleaning_df.loc[cleaning_df.Deceased_Status > 1, 'Deceased_Status'] = 1
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,CDH1,...,VHL,PBRM1,SETD2,BAP1,MET,KMT2C,PKHD1,ICE1,SMAD4,CDKN2A
0,black or african american,0.0,3504.0,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,white,0.0,2328.0,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,white,0.0,6116.0,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,white,0.0,5856.0,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,white,0.0,5128.0,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Converting age at diagnosis from days to years
cleaning_df['Age_At_Diagnosis'] = cleaning_df['Age_At_Diagnosis'] / 365
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,CDH1,...,VHL,PBRM1,SETD2,BAP1,MET,KMT2C,PKHD1,ICE1,SMAD4,CDKN2A
0,black or african american,0.0,9.6,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,white,0.0,6.378082,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,white,0.0,16.756164,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,white,0.0,16.043836,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,white,0.0,14.049315,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
cleaning_df.loc[cleaning_df.Age_At_Diagnosis == 0, 'Age_At_Diagnosis'] = np.nan
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,CDH1,...,VHL,PBRM1,SETD2,BAP1,MET,KMT2C,PKHD1,ICE1,SMAD4,CDKN2A
0,black or african american,0.0,9.6,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,white,0.0,6.378082,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,white,0.0,16.756164,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,white,0.0,16.043836,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,white,0.0,14.049315,0,0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Converting prior_malignancy not reported to NaN
cleaning_df['Prior_Malignancy'].replace({"yes": 1, "no": 0, "not reported": np.nan}, inplace=True)
cleaning_df['Synchronous_Malignancy'].replace({"Yes": 1, "No": 0, "Not Reported": np.nan}, inplace=True)
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,CDH1,...,VHL,PBRM1,SETD2,BAP1,MET,KMT2C,PKHD1,ICE1,SMAD4,CDKN2A
0,black or african american,0.0,9.6,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,white,0.0,6.378082,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,white,0.0,16.756164,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,white,0.0,16.043836,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,white,0.0,14.049315,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Converting Race not reported to NaN
cleaning_df['Race'].replace({"not reported": np.nan}, inplace=True)
cleaning_df['Race'].replace({"Unknown": np.nan}, inplace=True)
cleaning_df['Race'].replace({"not allowed to collect": np.nan}, inplace=True)
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,CDH1,...,VHL,PBRM1,SETD2,BAP1,MET,KMT2C,PKHD1,ICE1,SMAD4,CDKN2A
0,black or african american,0.0,9.6,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,white,0.0,6.378082,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,white,0.0,16.756164,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,white,0.0,16.043836,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,white,0.0,14.049315,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
cleaning_df.isna().sum()

Race                      1603
Deceased_Status              0
Age_At_Diagnosis           652
Prior_Malignancy           570
Synchronous_Malignancy     969
Primary_Site                 0
TP53                         0
PIK3CA                       0
TTN                          0
CDH1                         0
GATA3                        0
APC                          0
KRAS                         0
SYNE1                        0
CSMD3                        0
MUC16                        0
RYR2                         0
PTEN                         0
NRAS                         0
MUC5B                        0
TET2                         0
PTPN11                       0
NOTCH1                       0
FBXW7                        0
PHF6                         0
IGHV2-70                     0
IGLV3-1                      0
IGHV2-70D                    0
DNMT3A                       0
NPM1                         0
FLT3                         0
IDH2                         0
RUNX1   

In [16]:
cleaning_df.shape

(12974, 48)

In [17]:
# Dropping rows with null values
cleaning_df = cleaning_df.dropna()

In [18]:
print(cleaning_df.isna().sum())
print(cleaning_df.shape)

Race                      0
Deceased_Status           0
Age_At_Diagnosis          0
Prior_Malignancy          0
Synchronous_Malignancy    0
Primary_Site              0
TP53                      0
PIK3CA                    0
TTN                       0
CDH1                      0
GATA3                     0
APC                       0
KRAS                      0
SYNE1                     0
CSMD3                     0
MUC16                     0
RYR2                      0
PTEN                      0
NRAS                      0
MUC5B                     0
TET2                      0
PTPN11                    0
NOTCH1                    0
FBXW7                     0
PHF6                      0
IGHV2-70                  0
IGLV3-1                   0
IGHV2-70D                 0
DNMT3A                    0
NPM1                      0
FLT3                      0
IDH2                      0
RUNX1                     0
EGFR                      0
IDH1                      0
ATRX                

In [19]:
# Get features dataframe
features_df = cleaning_df.loc[:, cleaning_df.columns != 'Primary_Site']
features_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,TP53,PIK3CA,TTN,CDH1,GATA3,...,VHL,PBRM1,SETD2,BAP1,MET,KMT2C,PKHD1,ICE1,SMAD4,CDKN2A
0,black or african american,0.0,9.6,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,white,0.0,6.378082,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,white,0.0,16.756164,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,white,0.0,16.043836,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,white,0.0,14.049315,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Hotcoding Race, Pathologic_Stage, Primary_Diagnosis, and Disease_Type
hotcoded_df = pd.get_dummies(features_df)
hotcoded_df.head()

Unnamed: 0,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,TP53,PIK3CA,TTN,CDH1,GATA3,APC,...,PKHD1,ICE1,SMAD4,CDKN2A,Race_american indian or alaska native,Race_asian,Race_black or african american,Race_native hawaiian or other pacific islander,Race_other,Race_white
0,0.0,9.6,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,False,False,True,False,False,False
1,0.0,6.378082,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,False,False,False,False,False,True
2,0.0,16.756164,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,False,False,False,False,False,True
3,0.0,16.043836,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,False,False,False,False,False,True
4,0.0,14.049315,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,False,False,False,False,False,True


In [21]:
hotcoded_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10017 entries, 0 to 12973
Data columns (total 52 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Deceased_Status                                 10017 non-null  float64
 1   Age_At_Diagnosis                                10017 non-null  float64
 2   Prior_Malignancy                                10017 non-null  float64
 3   Synchronous_Malignancy                          10017 non-null  float64
 4   TP53                                            10017 non-null  int64  
 5   PIK3CA                                          10017 non-null  int64  
 6   TTN                                             10017 non-null  int64  
 7   CDH1                                            10017 non-null  int64  
 8   GATA3                                           10017 non-null  int64  
 9   APC                                         

In [22]:
target_df = cleaning_df.loc[:, cleaning_df.columns == 'Primary_Site']
target_df.replace({'Colon': 'Colorectal', 'Rectum': 'Colorectal', 'Rectosigmoid junction': 'Colorectal', 'Connective, subcutaneous and other soft tissues': 'Other', 'Other and ill-defined sites': 'Other', 'Uterus, NOS': 'Uterus', 'Corpus uteri': 'Uterus'}, inplace=True)
target_df.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_df.replace({'Colon': 'Colorectal', 'Rectum': 'Colorectal', 'Rectosigmoid junction': 'Colorectal', 'Connective, subcutaneous and other soft tissues': 'Other', 'Other and ill-defined sites': 'Other', 'Uterus, NOS': 'Uterus', 'Corpus uteri': 'Uterus'}, inplace=True)


Primary_Site                                 
Hematopoietic and reticuloendothelial systems    4814
Bronchus and lung                                1086
Kidney                                           1059
Breast                                            934
Uterus                                            697
Brain                                             634
Colorectal                                        343
Pancreas                                          340
Other                                             110
Name: count, dtype: int64

In [23]:
# Hotcoding target variables
target_df.replace({'Hematopoietic and reticuloendothelial systems': 0, 
                   'Bronchus and lung': 1,
                   'Kidney': 2,
                   'Breast': 3,
                   'Uterus': 4,
                   'Brain': 5,
                   'Colorectal': 6,
                   'Pancreas': 7,
                   'Other': 8}, inplace=True)
target_df.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_df.replace({'Hematopoietic and reticuloendothelial systems': 0,


Primary_Site
0               4814
1               1086
2               1059
3                934
4                697
5                634
6                343
7                340
8                110
Name: count, dtype: int64

In [24]:
final_df = pd.concat([target_df, hotcoded_df], axis=1)
print(final_df.shape)
final_df.head()

(10017, 53)


Unnamed: 0,Primary_Site,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,TP53,PIK3CA,TTN,CDH1,GATA3,...,PKHD1,ICE1,SMAD4,CDKN2A,Race_american indian or alaska native,Race_asian,Race_black or african american,Race_native hawaiian or other pacific islander,Race_other,Race_white
0,0,0.0,9.6,0.0,0.0,0,0,0,0,0,...,0,0,0,0,False,False,True,False,False,False
1,0,0.0,6.378082,0.0,0.0,0,0,0,0,0,...,0,0,0,0,False,False,False,False,False,True
2,0,0.0,16.756164,0.0,0.0,0,0,0,0,0,...,0,0,0,0,False,False,False,False,False,True
3,0,0.0,16.043836,0.0,0.0,0,0,0,0,0,...,0,0,0,0,False,False,False,False,False,True
4,0,0.0,14.049315,0.0,0.0,0,0,0,0,0,...,0,0,0,0,False,False,False,False,False,True


In [25]:
final_df.replace({False: 0, True: 1}, inplace=True)
final_df.head()

Unnamed: 0,Primary_Site,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,TP53,PIK3CA,TTN,CDH1,GATA3,...,PKHD1,ICE1,SMAD4,CDKN2A,Race_american indian or alaska native,Race_asian,Race_black or african american,Race_native hawaiian or other pacific islander,Race_other,Race_white
0,0,0.0,9.6,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0.0,6.378082,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0.0,16.756164,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0.0,16.043836,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0.0,14.049315,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [26]:
total_samples = len(final_df)
num_classes = final_df['Primary_Site'].nunique()

weight0 = total_samples / (final_df['Primary_Site'].value_counts()[0] * num_classes)
weight1 = total_samples / (final_df['Primary_Site'].value_counts()[1] * num_classes)
weight2 = total_samples / (final_df['Primary_Site'].value_counts()[2] * num_classes)
weight3 = total_samples / (final_df['Primary_Site'].value_counts()[3] * num_classes)
weight4 = total_samples / (final_df['Primary_Site'].value_counts()[4] * num_classes)
weight5 = total_samples / (final_df['Primary_Site'].value_counts()[5] * num_classes)
weight6 = total_samples / (final_df['Primary_Site'].value_counts()[6] * num_classes)
weight7 = total_samples / (final_df['Primary_Site'].value_counts()[7] * num_classes)
weight8 = total_samples / (final_df['Primary_Site'].value_counts()[8] * num_classes)

conditions = [
    (final_df['Primary_Site'] == 0),
    (final_df['Primary_Site'] == 1),
    (final_df['Primary_Site'] == 2),
    (final_df['Primary_Site'] == 3),
    (final_df['Primary_Site'] == 4),
    (final_df['Primary_Site'] == 5),
    (final_df['Primary_Site'] == 6),
    (final_df['Primary_Site'] == 7),
    (final_df['Primary_Site'] == 8)
]

results = [
    weight0,
    weight1,
    weight2,
    weight3,
    weight4,
    weight5,
    weight6,
    weight7,
    weight8
]

final_df['Weight'] = np.select(conditions, results)

final_df['Weight'].value_counts()

Weight
0.231201     4814
1.024862     1086
1.050992     1059
1.191649      934
1.596844      697
1.755521      634
3.244898      343
3.273529      340
10.118182     110
Name: count, dtype: int64

In [27]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10017 entries, 0 to 12973
Data columns (total 54 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Primary_Site                                    10017 non-null  int64  
 1   Deceased_Status                                 10017 non-null  float64
 2   Age_At_Diagnosis                                10017 non-null  float64
 3   Prior_Malignancy                                10017 non-null  float64
 4   Synchronous_Malignancy                          10017 non-null  float64
 5   TP53                                            10017 non-null  int64  
 6   PIK3CA                                          10017 non-null  int64  
 7   TTN                                             10017 non-null  int64  
 8   CDH1                                            10017 non-null  int64  
 9   GATA3                                       

In [28]:
# Writing the dataframe on S3 as a CSV

with io.StringIO() as csv_buffer:
    final_df.to_csv(csv_buffer, index=False)

    response = s3_client.put_object(
        Bucket=AWS_S3_BUCKET, Key="processed/tcga_processed.csv", Body=csv_buffer.getvalue()
    )

    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")

Successful S3 put_object response. Status - 200
