In [1]:
import os
import boto3
import io
from dotenv import load_dotenv
import pandas as pd
import numpy as np

In [2]:
load_dotenv()

True

In [3]:
# Custom function to reduce memory usage of data

def reduce_mem_usage(df, verbose=True):
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## Importing data ingested from previous script that is stored on the S3 bucket

In [4]:
AWS_S3_BUCKET = os.getenv("AWS_S3_BUCKET")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

In [17]:
s3_client = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

response = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key="rawdata/tcga_raw.csv")

status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    print(f"Successful S3 get_object response. Status - {status}")
    pd_df = reduce_mem_usage(pd.read_csv(response.get("Body")))
    print(pd_df.info())
else:
    print(f"Unsuccessful S3 get_object response. Status - {status}")

Successful S3 get_object response. Status - 200
Mem. usage decreased to  8.45 Mb (1.4% reduction)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15817 entries, 0 to 15816
Data columns (total 71 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Race                    15369 non-null  object 
 1   Deceased_Status         2268 non-null   float32
 2   Age_At_Diagnosis        14832 non-null  float32
 3   Sex                     15369 non-null  object 
 4   Pathologic_Stage        6408 non-null   object 
 5   Primary_Diagnosis       15369 non-null  object 
 6   Prior_Malignancy        8410 non-null   object 
 7   Synchronous_Malignancy  7906 non-null   object 
 8   Disease_Type            15817 non-null  object 
 9   ID                      15817 non-null  object 
 10  Primary_Site            15816 non-null  object 
 11  Submitter_ID            15817 non-null  object 
 12  TP53                    9448 non-null   object

In [18]:
print(pd_df.shape)
pd_df.head()

(15817, 71)


Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Sex,Pathologic_Stage,Primary_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Disease_Type,ID,...,FLG,TG,HRAS,KMT2D,BTG2,B2M,PIM1,IGHG1,SPOP,FOXA1
0,black or african american,,3505.0,male,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,29312892-078e-4a35-809c-729f55370967,...,,,,,,,,,,
1,white,,2327.0,female,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,2939c0a9-3c47-4019-b9e3-958e84a12bb5,...,,,,,,,,,,
2,white,,6115.0,female,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,cecefca5-6308-49f6-b9c2-226235d60613,...,,,,,,,,,,
3,white,,5854.0,male,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,56404ff6-a971-4d84-9891-0053b1075ee3,...,,,,,,,,,,
4,white,,5129.0,male,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,792187f7-d5c3-497d-9573-f7411f027aa3,...,,,,,,,,,,


In [86]:
# Dropping ID and Primary_Site columns
cleaning_df = pd_df.drop(['ID', 'Submitter_ID', 'Pathologic_Stage', 'Primary_Diagnosis', 'Disease_Type'], axis=1)
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Sex,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,...,FLG,TG,HRAS,KMT2D,BTG2,B2M,PIM1,IGHG1,SPOP,FOXA1
0,black or african american,,3505.0,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
1,white,,2327.0,female,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
2,white,,6115.0,female,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
3,white,,5854.0,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
4,white,,5129.0,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,


In [87]:
# Replacing deceased status years with 1
cleaning_df.loc[cleaning_df.Deceased_Status > 1, 'Deceased_Status'] = 1
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Sex,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,...,FLG,TG,HRAS,KMT2D,BTG2,B2M,PIM1,IGHG1,SPOP,FOXA1
0,black or african american,,3505.0,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
1,white,,2327.0,female,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
2,white,,6115.0,female,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
3,white,,5854.0,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
4,white,,5129.0,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,


In [88]:
# Converting age at diagnosis from days to years
cleaning_df['Age_At_Diagnosis'] = cleaning_df['Age_At_Diagnosis'] / 365
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Sex,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,...,FLG,TG,HRAS,KMT2D,BTG2,B2M,PIM1,IGHG1,SPOP,FOXA1
0,black or african american,,9.602739,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
1,white,,6.375342,female,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
2,white,,16.753426,female,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
3,white,,16.038357,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
4,white,,14.052054,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,


In [89]:
# Dropping missing Age rows
cleaning_df = cleaning_df[cleaning_df['Age_At_Diagnosis'].notna()]

In [90]:
# Converting prior_malignancy not reported to NaN
cleaning_df['Prior_Malignancy'].replace({"yes": 1, "no": 0, "not reported": np.nan, "unknown": np.nan}, inplace=True)
cleaning_df['Synchronous_Malignancy'].replace({"Yes": 1, "No": 0, "Not Reported": np.nan}, inplace=True)
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Sex,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,...,FLG,TG,HRAS,KMT2D,BTG2,B2M,PIM1,IGHG1,SPOP,FOXA1
0,black or african american,,9.602739,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
1,white,,6.375342,female,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
2,white,,16.753426,female,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
3,white,,16.038357,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
4,white,,14.052054,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,


In [91]:
# Converting Race not reported to NaN
cleaning_df['Race'].replace({"not reported": np.nan}, inplace=True)
cleaning_df['Race'].replace({"Unknown": np.nan}, inplace=True)
cleaning_df['Race'].replace({"not allowed to collect": np.nan}, inplace=True)

# Converting Native Americans and Native Hawaiians to Other
cleaning_df['Race'].replace({"american indian or alaska native": "other"}, inplace=True)
cleaning_df['Race'].replace({"native hawaiian or other pacific islander": "other"}, inplace=True)

cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Sex,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,...,FLG,TG,HRAS,KMT2D,BTG2,B2M,PIM1,IGHG1,SPOP,FOXA1
0,black or african american,,9.602739,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
1,white,,6.375342,female,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
2,white,,16.753426,female,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
3,white,,16.038357,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
4,white,,14.052054,male,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,


In [92]:
# Dropping missing race rows
cleaning_df = cleaning_df[cleaning_df['Race'].notna()]

In [93]:
# Converting Sex not reported and unknown to NaN to drop
cleaning_df['Sex'].replace({"not reported": np.nan}, inplace=True)
cleaning_df['Sex'].replace({"unknown": "missing"}, inplace=True)
cleaning_df['Sex'].replace({"female": 0, "male": 1}, inplace=True)

cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Sex,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,...,FLG,TG,HRAS,KMT2D,BTG2,B2M,PIM1,IGHG1,SPOP,FOXA1
0,black or african american,,9.602739,1.0,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
1,white,,6.375342,0.0,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
2,white,,16.753426,0.0,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
3,white,,16.038357,1.0,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,
4,white,,14.052054,1.0,,,Hematopoietic and reticuloendothelial systems,,,,...,,,,,,,,,,


In [94]:
# Dropping missing Sex rows
cleaning_df = cleaning_df[cleaning_df['Sex'].notna()]

In [95]:
cleaning_df.isna().sum()

Race                    0
Deceased_Status     11032
Age_At_Diagnosis        0
Sex                     0
Prior_Malignancy     7015
                    ...  
B2M                  5123
PIM1                 5123
IGHG1                5123
SPOP                 5123
FOXA1                5123
Length: 66, dtype: int64

In [96]:
# Replacing the rest of missing values with False to convert to 0 later
cleaning_df.replace({np.nan: False}, inplace = True)
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Sex,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,...,FLG,TG,HRAS,KMT2D,BTG2,B2M,PIM1,IGHG1,SPOP,FOXA1
0,black or african american,False,9.602739,1.0,False,False,Hematopoietic and reticuloendothelial systems,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,white,False,6.375342,0.0,False,False,Hematopoietic and reticuloendothelial systems,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,white,False,16.753426,0.0,False,False,Hematopoietic and reticuloendothelial systems,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,white,False,16.038357,1.0,False,False,Hematopoietic and reticuloendothelial systems,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,white,False,14.052054,1.0,False,False,Hematopoietic and reticuloendothelial systems,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [97]:
# Replacing True and False values in the SSMS columns with 1 and 0
cleaning_df.replace({False: 0, True: 1}, inplace=True)
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Sex,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,...,FLG,TG,HRAS,KMT2D,BTG2,B2M,PIM1,IGHG1,SPOP,FOXA1
0,black or african american,0.0,9.602739,1.0,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,white,0.0,6.375342,0.0,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,white,0.0,16.753426,0.0,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,white,0.0,16.038357,1.0,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,white,0.0,14.052054,1.0,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
cleaning_df.shape

(13183, 66)

In [99]:
print(cleaning_df.isna().sum())
print(cleaning_df.shape)

Race                0
Deceased_Status     0
Age_At_Diagnosis    0
Sex                 0
Prior_Malignancy    0
                   ..
B2M                 0
PIM1                0
IGHG1               0
SPOP                0
FOXA1               0
Length: 66, dtype: int64
(13183, 66)


In [100]:
cleaning_df['Race'].value_counts()

Race
white                        10828
black or african american     1350
asian                          700
other                          305
Name: count, dtype: int64

In [101]:
# Get features dataframe
features_df = cleaning_df.loc[:, cleaning_df.columns != 'Primary_Site']
features_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Sex,Prior_Malignancy,Synchronous_Malignancy,TP53,PIK3CA,TTN,CDH1,...,FLG,TG,HRAS,KMT2D,BTG2,B2M,PIM1,IGHG1,SPOP,FOXA1
0,black or african american,0.0,9.602739,1.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,white,0.0,6.375342,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,white,0.0,16.753426,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,white,0.0,16.038357,1.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,white,0.0,14.052054,1.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
# Hotcoding Race, Pathologic_Stage, Primary_Diagnosis, and Disease_Type
hotcoded_df = pd.get_dummies(features_df)
hotcoded_df.head()

Unnamed: 0,Deceased_Status,Age_At_Diagnosis,Sex,Prior_Malignancy,Synchronous_Malignancy,TP53,PIK3CA,TTN,CDH1,GATA3,...,BTG2,B2M,PIM1,IGHG1,SPOP,FOXA1,Race_asian,Race_black or african american,Race_other,Race_white
0,0.0,9.602739,1.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,False,True,False,False
1,0.0,6.375342,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True
2,0.0,16.753426,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True
3,0.0,16.038357,1.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True
4,0.0,14.052054,1.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True


In [103]:
hotcoded_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13183 entries, 0 to 15816
Data columns (total 68 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Deceased_Status                 13183 non-null  float64
 1   Age_At_Diagnosis                13183 non-null  float32
 2   Sex                             13183 non-null  float64
 3   Prior_Malignancy                13183 non-null  float64
 4   Synchronous_Malignancy          13183 non-null  float64
 5   TP53                            13183 non-null  int64  
 6   PIK3CA                          13183 non-null  int64  
 7   TTN                             13183 non-null  int64  
 8   CDH1                            13183 non-null  int64  
 9   GATA3                           13183 non-null  int64  
 10  APC                             13183 non-null  int64  
 11  KRAS                            13183 non-null  int64  
 12  SYNE1                           13183

In [106]:
target_df = cleaning_df.loc[:, cleaning_df.columns == 'Primary_Site']
target_df.value_counts()

Primary_Site                                                                  
Hematopoietic and reticuloendothelial systems                                     4828
Brain                                                                             1222
Bronchus and lung                                                                 1165
Kidney                                                                            1155
Breast                                                                            1123
Ovary                                                                              548
Prostate gland                                                                     538
Corpus uteri                                                                       514
Skin                                                                               487
Thyroid gland                                                                      416
Pancreas                                           

In [107]:
target_df.replace({'Colon': 'Colorectal', 
                   'Rectum': 'Colorectal', 
                   'Rectosigmoid junction': 'Colorectal', 
                   'Connective, subcutaneous and other soft tissues': 'Other', 
                   'Other and ill-defined sites': 'Other', 
                   'Uterus, NOS': 'Uterus', 'Corpus uteri': 'Uterus', 
                   'Lymph nodes': 'Hematopoietic and reticuloendothelial systems',
                   'Unknown': 'Other', 
                   0: 'Other', 
                   'Other and unspecified female genital organs': 'Other', 
                   'Stomach': 'Other', 
                   'Esophagus': 'Other', 'Retroperitoneum and peritoneum': 'Other', 
                   'Bones, joints and articular cartilage of other and unspecified sites': 'Other', 
                   'Small intestine': 'Other', 
                   'Heart, mediastinum, and pleura': 'Other', 
                   'Other and ill-defined sites within respiratory system and intrathoracic organs': 'Other', 
                   'Other and unspecified major salivary glands': 'Other', 
                   'Other and ill-defined digestive organs': 'Other', 
                   'Testis': 'Other', 
                   'Gallbladder': 'Other'}, inplace=True)
target_df.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_df.replace({'Colon': 'Colorectal',


Primary_Site                                 
Hematopoietic and reticuloendothelial systems    4859
Brain                                            1222
Bronchus and lung                                1165
Kidney                                           1155
Breast                                           1123
Uterus                                            743
Ovary                                             548
Prostate gland                                    538
Skin                                              487
Thyroid gland                                     416
Colorectal                                        405
Pancreas                                          359
Other                                             163
Name: count, dtype: int64

In [108]:
# Hotcoding target variables
target_df.replace({'Hematopoietic and reticuloendothelial systems': 0, 
                   'Brain': 1,
                   'Bronchus and lung': 2,
                   'Kidney': 3,
                   'Breast': 4,
                   'Uterus': 5,
                   'Ovary': 6,
                   'Prostate gland': 7,
                   'Skin': 8, 
                   'Thyroid gland': 9, 
                   'Colorectal': 10,
                   'Pancreas': 11,
                   'Other': 12}, inplace=True)
target_df.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_df.replace({'Hematopoietic and reticuloendothelial systems': 0,


Primary_Site
0               4859
1               1222
2               1165
3               1155
4               1123
5                743
6                548
7                538
8                487
9                416
10               405
11               359
12               163
Name: count, dtype: int64

In [109]:
final_df = pd.concat([target_df, hotcoded_df], axis=1)
print(final_df.shape)
final_df.head()

(13183, 69)


Unnamed: 0,Primary_Site,Deceased_Status,Age_At_Diagnosis,Sex,Prior_Malignancy,Synchronous_Malignancy,TP53,PIK3CA,TTN,CDH1,...,BTG2,B2M,PIM1,IGHG1,SPOP,FOXA1,Race_asian,Race_black or african american,Race_other,Race_white
0,0,0.0,9.602739,1.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,False,True,False,False
1,0,0.0,6.375342,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True
2,0,0.0,16.753426,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True
3,0,0.0,16.038357,1.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True
4,0,0.0,14.052054,1.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True


In [110]:
final_df = final_df[final_df.Primary_Site != 12]
final_df.shape

(13020, 69)

In [111]:
final_df.replace({False: 0, True: 1}, inplace=True)
final_df.head()

Unnamed: 0,Primary_Site,Deceased_Status,Age_At_Diagnosis,Sex,Prior_Malignancy,Synchronous_Malignancy,TP53,PIK3CA,TTN,CDH1,...,BTG2,B2M,PIM1,IGHG1,SPOP,FOXA1,Race_asian,Race_black or african american,Race_other,Race_white
0,0,0.0,9.602739,1.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0.0,6.375342,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0.0,16.753426,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0.0,16.038357,1.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0.0,14.052054,1.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [112]:
total_samples = len(final_df)
num_classes = final_df['Primary_Site'].nunique()

weight0 = total_samples / (final_df['Primary_Site'].value_counts()[0] * num_classes)
weight1 = total_samples / (final_df['Primary_Site'].value_counts()[1] * num_classes)
weight2 = total_samples / (final_df['Primary_Site'].value_counts()[2] * num_classes)
weight3 = total_samples / (final_df['Primary_Site'].value_counts()[3] * num_classes)
weight4 = total_samples / (final_df['Primary_Site'].value_counts()[4] * num_classes)
weight5 = total_samples / (final_df['Primary_Site'].value_counts()[5] * num_classes)
weight6 = total_samples / (final_df['Primary_Site'].value_counts()[6] * num_classes)
weight7 = total_samples / (final_df['Primary_Site'].value_counts()[7] * num_classes)
weight8 = total_samples / (final_df['Primary_Site'].value_counts()[8] * num_classes)
weight9 = total_samples / (final_df['Primary_Site'].value_counts()[9] * num_classes)

conditions = [
    (final_df['Primary_Site'] == 0),
    (final_df['Primary_Site'] == 1),
    (final_df['Primary_Site'] == 2),
    (final_df['Primary_Site'] == 3),
    (final_df['Primary_Site'] == 4),
    (final_df['Primary_Site'] == 5),
    (final_df['Primary_Site'] == 6),
    (final_df['Primary_Site'] == 7),
    (final_df['Primary_Site'] == 8),
    (final_df['Primary_Site'] == 9),
]

results = [
    weight0,
    weight1,
    weight2,
    weight3,
    weight4,
    weight5,
    weight6,
    weight7,
    weight8,
    weight9,
]

final_df['Weight'] = np.select(conditions, results)

final_df['Weight'].value_counts()

Weight
0.223297    4859
0.887889    1222
0.931330    1165
0.939394    1155
0.966162    1123
0.000000     764
1.460296     743
1.979927     548
2.016729     538
2.227926     487
2.608173     416
Name: count, dtype: int64

In [113]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13020 entries, 0 to 15816
Data columns (total 70 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Primary_Site                    13020 non-null  int64  
 1   Deceased_Status                 13020 non-null  float64
 2   Age_At_Diagnosis                13020 non-null  float32
 3   Sex                             13020 non-null  float64
 4   Prior_Malignancy                13020 non-null  float64
 5   Synchronous_Malignancy          13020 non-null  float64
 6   TP53                            13020 non-null  int64  
 7   PIK3CA                          13020 non-null  int64  
 8   TTN                             13020 non-null  int64  
 9   CDH1                            13020 non-null  int64  
 10  GATA3                           13020 non-null  int64  
 11  APC                             13020 non-null  int64  
 12  KRAS                            13020

In [114]:
final_df = reduce_mem_usage(final_df)
final_df['Deceased_Status'] = final_df['Deceased_Status'].astype('int8')
final_df['Sex'] = final_df['Sex'].astype('int8')
final_df['Prior_Malignancy'] = final_df['Prior_Malignancy'].astype('int8')
final_df['Synchronous_Malignancy'] = final_df['Synchronous_Malignancy'].astype('int8')
final_df.info()

Mem. usage decreased to  1.19 Mb (83.0% reduction)
<class 'pandas.core.frame.DataFrame'>
Index: 13020 entries, 0 to 15816
Data columns (total 70 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Primary_Site                    13020 non-null  int8   
 1   Deceased_Status                 13020 non-null  int8   
 2   Age_At_Diagnosis                13020 non-null  float32
 3   Sex                             13020 non-null  int8   
 4   Prior_Malignancy                13020 non-null  int8   
 5   Synchronous_Malignancy          13020 non-null  int8   
 6   TP53                            13020 non-null  int8   
 7   PIK3CA                          13020 non-null  int8   
 8   TTN                             13020 non-null  int8   
 9   CDH1                            13020 non-null  int8   
 10  GATA3                           13020 non-null  int8   
 11  APC                             13020 non-null 

In [115]:
# Writing the dataframe on S3 as a CSV

with io.StringIO() as csv_buffer:
    final_df.to_csv(csv_buffer, index=False)

    response = s3_client.put_object(
        Bucket=AWS_S3_BUCKET, Key="processed/tcga_processed.csv", Body=csv_buffer.getvalue()
    )

    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")

Successful S3 put_object response. Status - 200


In [116]:
# Creating training and testing subfolders and datasets for the S3 bucket
from sklearn.model_selection import train_test_split

In [117]:
train_df, test_df = train_test_split(final_df, test_size=0.1, random_state=42)
print(train_df.shape)
print(test_df.shape)
train_df.head()

(11718, 70)
(1302, 70)


Unnamed: 0,Primary_Site,Deceased_Status,Age_At_Diagnosis,Sex,Prior_Malignancy,Synchronous_Malignancy,TP53,PIK3CA,TTN,CDH1,...,B2M,PIM1,IGHG1,SPOP,FOXA1,Race_asian,Race_black or african american,Race_other,Race_white,Weight
9736,3,0,66.76712,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0.939394
8583,0,0,18.39452,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.223297
5452,7,0,57.972603,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2.016729
2477,2,0,84.049316,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0.93133
7703,9,1,78.347946,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2.608173


In [118]:
# Writing the train dataframe on S3 as a CSV
train_df = train_df.drop('Weight', axis=1)
with io.StringIO() as csv_buffer:
    train_df.to_csv(csv_buffer, index=False, header=None)

    response = s3_client.put_object(
        Bucket=AWS_S3_BUCKET, Key="train/data.csv", Body=csv_buffer.getvalue()
    )

    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")

Successful S3 put_object response. Status - 200


In [119]:
# Writing the test dataframe on S3 as a CSV
test_df = test_df.drop('Weight', axis=1)
with io.StringIO() as csv_buffer:
    test_df.to_csv(csv_buffer, index=False, header=None)

    response = s3_client.put_object(
        Bucket=AWS_S3_BUCKET, Key="validation/data.csv", Body=csv_buffer.getvalue()
    )

    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")

Successful S3 put_object response. Status - 200


In [120]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11718 entries, 9736 to 8908
Data columns (total 69 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Primary_Site                    11718 non-null  int8   
 1   Deceased_Status                 11718 non-null  int8   
 2   Age_At_Diagnosis                11718 non-null  float32
 3   Sex                             11718 non-null  int8   
 4   Prior_Malignancy                11718 non-null  int8   
 5   Synchronous_Malignancy          11718 non-null  int8   
 6   TP53                            11718 non-null  int8   
 7   PIK3CA                          11718 non-null  int8   
 8   TTN                             11718 non-null  int8   
 9   CDH1                            11718 non-null  int8   
 10  GATA3                           11718 non-null  int8   
 11  APC                             11718 non-null  int8   
 12  KRAS                            117

In [121]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1302 entries, 984 to 5559
Data columns (total 69 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Primary_Site                    1302 non-null   int8   
 1   Deceased_Status                 1302 non-null   int8   
 2   Age_At_Diagnosis                1302 non-null   float32
 3   Sex                             1302 non-null   int8   
 4   Prior_Malignancy                1302 non-null   int8   
 5   Synchronous_Malignancy          1302 non-null   int8   
 6   TP53                            1302 non-null   int8   
 7   PIK3CA                          1302 non-null   int8   
 8   TTN                             1302 non-null   int8   
 9   CDH1                            1302 non-null   int8   
 10  GATA3                           1302 non-null   int8   
 11  APC                             1302 non-null   int8   
 12  KRAS                            1302 