In [1]:
import os
import boto3
import io
from dotenv import load_dotenv
import pandas as pd
import numpy as np

In [2]:
load_dotenv()

True

In [3]:
# Custom function to reduce memory usage of data

def reduce_mem_usage(df, verbose=True):
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## Importing data ingested from previous script that is stored on the S3 bucket

In [4]:
AWS_S3_BUCKET = os.getenv("AWS_S3_BUCKET")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

In [105]:
s3_client = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

response = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key="rawdata/tcga_raw.csv")

status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    print(f"Successful S3 get_object response. Status - {status}")
    pd_df = reduce_mem_usage(pd.read_csv(response.get("Body")))
    print(pd_df.info())
else:
    print(f"Unsuccessful S3 get_object response. Status - {status}")

Successful S3 get_object response. Status - 200
Mem. usage decreased to  4.26 Mb (2.5% reduction)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14304 entries, 0 to 14303
Data columns (total 40 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Race                    13877 non-null  object 
 1   Deceased_Status         1959 non-null   float32
 2   Age_At_Diagnosis        13581 non-null  float32
 3   Pathologic_Stage        6066 non-null   object 
 4   Primary_Diagnosis       13877 non-null  object 
 5   Prior_Malignancy        6993 non-null   object 
 6   Synchronous_Malignancy  6819 non-null   object 
 7   Disease_Type            14304 non-null  object 
 8   ID                      14304 non-null  object 
 9   Primary_Site            14303 non-null  object 
 10  Submitter_ID            14304 non-null  object 
 11  TP53                    7786 non-null   object 
 12  PIK3CA                  7786 non-null   object

In [106]:
print(pd_df.shape)
pd_df.head()

(14304, 40)


Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Pathologic_Stage,Primary_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Disease_Type,ID,Primary_Site,...,VHL,PBRM1,MET,SMAD4,BRAF,LRP2,FREM2,KMT2D,BTG2,B2M
0,black or african american,,3505.0,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,29312892-078e-4a35-809c-729f55370967,Hematopoietic and reticuloendothelial systems,...,,,,,,,,,,
1,white,,2327.0,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,2939c0a9-3c47-4019-b9e3-958e84a12bb5,Hematopoietic and reticuloendothelial systems,...,,,,,,,,,,
2,white,,6115.0,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,cecefca5-6308-49f6-b9c2-226235d60613,Hematopoietic and reticuloendothelial systems,...,,,,,,,,,,
3,white,,5854.0,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,56404ff6-a971-4d84-9891-0053b1075ee3,Hematopoietic and reticuloendothelial systems,...,,,,,,,,,,
4,white,,5129.0,,"Acute myeloid leukemia, NOS",,,Myeloid Leukemias,792187f7-d5c3-497d-9573-f7411f027aa3,Hematopoietic and reticuloendothelial systems,...,,,,,,,,,,


In [107]:
# Dropping ID and Primary_Site columns
cleaning_df = pd_df.drop(['ID', 'Submitter_ID', 'Pathologic_Stage', 'Primary_Diagnosis', 'Disease_Type'], axis=1)
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,APC,...,VHL,PBRM1,MET,SMAD4,BRAF,LRP2,FREM2,KMT2D,BTG2,B2M
0,black or african american,,3505.0,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
1,white,,2327.0,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
2,white,,6115.0,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
3,white,,5854.0,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
4,white,,5129.0,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,


In [108]:
# Replacing deceased status years with 1
cleaning_df.loc[cleaning_df.Deceased_Status > 1, 'Deceased_Status'] = 1
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,APC,...,VHL,PBRM1,MET,SMAD4,BRAF,LRP2,FREM2,KMT2D,BTG2,B2M
0,black or african american,,3505.0,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
1,white,,2327.0,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
2,white,,6115.0,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
3,white,,5854.0,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
4,white,,5129.0,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,


In [109]:
# Converting age at diagnosis from days to years
cleaning_df['Age_At_Diagnosis'] = cleaning_df['Age_At_Diagnosis'] / 365
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,APC,...,VHL,PBRM1,MET,SMAD4,BRAF,LRP2,FREM2,KMT2D,BTG2,B2M
0,black or african american,,9.602739,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
1,white,,6.375342,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
2,white,,16.753426,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
3,white,,16.038357,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
4,white,,14.052054,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,


In [110]:
# Converting prior_malignancy not reported to NaN
cleaning_df['Prior_Malignancy'].replace({"yes": 1, "no": 0, "not reported": np.nan, "unknown": np.nan}, inplace=True)
cleaning_df['Synchronous_Malignancy'].replace({"Yes": 1, "No": 0, "Not Reported": np.nan}, inplace=True)
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,APC,...,VHL,PBRM1,MET,SMAD4,BRAF,LRP2,FREM2,KMT2D,BTG2,B2M
0,black or african american,,9.602739,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
1,white,,6.375342,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
2,white,,16.753426,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
3,white,,16.038357,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
4,white,,14.052054,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,


In [111]:
# Converting Race not reported to NaN
cleaning_df['Race'].replace({"not reported": np.nan}, inplace=True)
cleaning_df['Race'].replace({"Unknown": np.nan}, inplace=True)
cleaning_df['Race'].replace({"not allowed to collect": np.nan}, inplace=True)

# Converting Native Americans and Native Hawaiians to Other
cleaning_df['Race'].replace({"american indian or alaska native": "other"}, inplace=True)
cleaning_df['Race'].replace({"native hawaiian or other pacific islander": "other"}, inplace=True)

cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,APC,...,VHL,PBRM1,MET,SMAD4,BRAF,LRP2,FREM2,KMT2D,BTG2,B2M
0,black or african american,,9.602739,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
1,white,,6.375342,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
2,white,,16.753426,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
3,white,,16.038357,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,
4,white,,14.052054,,,Hematopoietic and reticuloendothelial systems,,,,,...,,,,,,,,,,


In [112]:
cleaning_df.isna().sum()

Race                       2228
Deceased_Status           12345
Age_At_Diagnosis            723
Prior_Malignancy           7910
Synchronous_Malignancy     8510
Primary_Site                  1
TP53                       6518
PIK3CA                     6518
TTN                        6518
APC                        6518
CSMD3                      6518
MUC16                      6518
KRAS                       6518
NRAS                       6518
MUC5B                      6518
NOTCH1                     6518
IGHV2-70                   6518
IGLV3-1                    6518
DNMT3A                     6518
NPM1                       6518
FLT3                       6518
PTEN                       6518
IDH1                       6518
ATRX                       6518
ARID1A                     6518
VHL                        6518
PBRM1                      6518
MET                        6518
SMAD4                      6518
BRAF                       6518
LRP2                       6518
FREM2   

In [113]:
# Replacing missing values with np.nan
cleaning_df.replace({np.nan: False}, inplace = True)
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,APC,...,VHL,PBRM1,MET,SMAD4,BRAF,LRP2,FREM2,KMT2D,BTG2,B2M
0,black or african american,False,9.602739,False,False,Hematopoietic and reticuloendothelial systems,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,white,False,6.375342,False,False,Hematopoietic and reticuloendothelial systems,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,white,False,16.753426,False,False,Hematopoietic and reticuloendothelial systems,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,white,False,16.038357,False,False,Hematopoietic and reticuloendothelial systems,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,white,False,14.052054,False,False,Hematopoietic and reticuloendothelial systems,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [114]:
# Replacing True and False values in the SSMS columns with 1 and 0
cleaning_df.replace({False: 0, True: 1}, inplace=True)
cleaning_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,Primary_Site,TP53,PIK3CA,TTN,APC,...,VHL,PBRM1,MET,SMAD4,BRAF,LRP2,FREM2,KMT2D,BTG2,B2M
0,black or african american,0.0,9.602739,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,white,0.0,6.375342,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,white,0.0,16.753426,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,white,0.0,16.038357,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,white,0.0,14.052054,0.0,0.0,Hematopoietic and reticuloendothelial systems,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [115]:
cleaning_df.shape

(14304, 35)

In [116]:
print(cleaning_df.isna().sum())
print(cleaning_df.shape)

Race                      0
Deceased_Status           0
Age_At_Diagnosis          0
Prior_Malignancy          0
Synchronous_Malignancy    0
Primary_Site              0
TP53                      0
PIK3CA                    0
TTN                       0
APC                       0
CSMD3                     0
MUC16                     0
KRAS                      0
NRAS                      0
MUC5B                     0
NOTCH1                    0
IGHV2-70                  0
IGLV3-1                   0
DNMT3A                    0
NPM1                      0
FLT3                      0
PTEN                      0
IDH1                      0
ATRX                      0
ARID1A                    0
VHL                       0
PBRM1                     0
MET                       0
SMAD4                     0
BRAF                      0
LRP2                      0
FREM2                     0
KMT2D                     0
BTG2                      0
B2M                       0
dtype: int64
(14304,

In [117]:
cleaning_df = cleaning_df.drop(cleaning_df[cleaning_df['Race'] == 0].index)

In [118]:
cleaning_df['Race'].value_counts()

Race
white                        9857
black or african american    1248
asian                         670
other                         301
Name: count, dtype: int64

In [119]:
cleaning_df.loc[cleaning_df.Age_At_Diagnosis == 0, 'Age_At_Diagnosis'] = np.nan

In [120]:
print(cleaning_df.isna().sum())
print(cleaning_df.shape)

Race                       0
Deceased_Status            0
Age_At_Diagnosis          96
Prior_Malignancy           0
Synchronous_Malignancy     0
Primary_Site               0
TP53                       0
PIK3CA                     0
TTN                        0
APC                        0
CSMD3                      0
MUC16                      0
KRAS                       0
NRAS                       0
MUC5B                      0
NOTCH1                     0
IGHV2-70                   0
IGLV3-1                    0
DNMT3A                     0
NPM1                       0
FLT3                       0
PTEN                       0
IDH1                       0
ATRX                       0
ARID1A                     0
VHL                        0
PBRM1                      0
MET                        0
SMAD4                      0
BRAF                       0
LRP2                       0
FREM2                      0
KMT2D                      0
BTG2                       0
B2M           

In [121]:
# Get features dataframe
features_df = cleaning_df.loc[:, cleaning_df.columns != 'Primary_Site']
features_df.head()

Unnamed: 0,Race,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,TP53,PIK3CA,TTN,APC,CSMD3,...,VHL,PBRM1,MET,SMAD4,BRAF,LRP2,FREM2,KMT2D,BTG2,B2M
0,black or african american,0.0,9.602739,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,white,0.0,6.375342,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,white,0.0,16.753426,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,white,0.0,16.038357,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,white,0.0,14.052054,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [122]:
# Hotcoding Race, Pathologic_Stage, Primary_Diagnosis, and Disease_Type
hotcoded_df = pd.get_dummies(features_df)
hotcoded_df.head()

Unnamed: 0,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,TP53,PIK3CA,TTN,APC,CSMD3,MUC16,...,BRAF,LRP2,FREM2,KMT2D,BTG2,B2M,Race_asian,Race_black or african american,Race_other,Race_white
0,0.0,9.602739,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,False,True,False,False
1,0.0,6.375342,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True
2,0.0,16.753426,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True
3,0.0,16.038357,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True
4,0.0,14.052054,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True


In [123]:
hotcoded_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12076 entries, 0 to 14303
Data columns (total 37 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Deceased_Status                 12076 non-null  float64
 1   Age_At_Diagnosis                11980 non-null  float64
 2   Prior_Malignancy                12076 non-null  float64
 3   Synchronous_Malignancy          12076 non-null  float64
 4   TP53                            12076 non-null  int64  
 5   PIK3CA                          12076 non-null  int64  
 6   TTN                             12076 non-null  int64  
 7   APC                             12076 non-null  int64  
 8   CSMD3                           12076 non-null  int64  
 9   MUC16                           12076 non-null  int64  
 10  KRAS                            12076 non-null  int64  
 11  NRAS                            12076 non-null  int64  
 12  MUC5B                           12076

In [129]:
target_df = cleaning_df.loc[:, cleaning_df.columns == 'Primary_Site']
target_df.value_counts()

Primary_Site                                                                  
Hematopoietic and reticuloendothelial systems                                     4832
Brain                                                                             1233
Bronchus and lung                                                                 1184
Kidney                                                                            1174
Breast                                                                            1022
Corpus uteri                                                                       516
Skin                                                                               495
Thyroid gland                                                                      416
Pancreas                                                                           366
Colon                                                                              307
Uterus, NOS                                        

In [130]:
target_df.replace({'Colon': 'Colorectal', 
                   'Rectum': 'Colorectal', 
                   'Rectosigmoid junction': 'Colorectal', 
                   'Connective, subcutaneous and other soft tissues': 'Other', 
                   'Other and ill-defined sites': 'Other', 
                   'Uterus, NOS': 'Uterus', 'Corpus uteri': 'Uterus', 
                   'Lymph nodes': 'Hematopoietic and reticuloendothelial systems', 
                   'Ovary': 'Other',
                   'Unknown': 'Other', 
                   0: 'Other', 
                   'Other and unspecified female genital organs': 'Other', 
                   'Stomach': 'Other', 
                   'Esophagus': 'Other', 'Retroperitoneum and peritoneum': 'Other', 
                   'Bones, joints and articular cartilage of other and unspecified sites': 'Other', 
                   'Small intestine': 'Other', 
                   'Heart, mediastinum, and pleura': 'Other', 
                   'Other and ill-defined sites within respiratory system and intrathoracic organs': 'Other', 
                   'Other and unspecified major salivary glands': 'Other', 
                   'Other and ill-defined digestive organs': 'Other', 
                   'Testis': 'Other', 
                   'Gallbladder': 'Other'}, inplace=True)
target_df.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_df.replace({'Colon': 'Colorectal',


Primary_Site                                 
Hematopoietic and reticuloendothelial systems    4862
Brain                                            1233
Bronchus and lung                                1184
Kidney                                           1174
Breast                                           1022
Uterus                                            752
Skin                                              495
Thyroid gland                                     416
Colorectal                                        408
Pancreas                                          366
Other                                             164
Name: count, dtype: int64

In [131]:
# Hotcoding target variables
target_df.replace({'Hematopoietic and reticuloendothelial systems': 0, 
                   'Brain': 1,
                   'Bronchus and lung': 2,
                   'Kidney': 3,
                   'Breast': 4,
                   'Uterus': 5, 
                   'Skin': 6, 
                   'Thyroid gland': 7, 
                   'Colorectal': 8,
                   'Pancreas': 9,
                   'Other': 10}, inplace=True)
target_df.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_df.replace({'Hematopoietic and reticuloendothelial systems': 0,


Primary_Site
0               4862
1               1233
2               1184
3               1174
4               1022
5                752
6                495
7                416
8                408
9                366
10               164
Name: count, dtype: int64

In [132]:
final_df = pd.concat([target_df, hotcoded_df], axis=1)
print(final_df.shape)
final_df.head()

(12076, 38)


Unnamed: 0,Primary_Site,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,TP53,PIK3CA,TTN,APC,CSMD3,...,BRAF,LRP2,FREM2,KMT2D,BTG2,B2M,Race_asian,Race_black or african american,Race_other,Race_white
0,0,0.0,9.602739,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,False,True,False,False
1,0,0.0,6.375342,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True
2,0,0.0,16.753426,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True
3,0,0.0,16.038357,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True
4,0,0.0,14.052054,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,False,False,False,True


In [133]:
final_df = final_df[final_df.Primary_Site != 10]
final_df.shape

(11912, 38)

In [134]:
final_df.replace({False: 0, True: 1}, inplace=True)
final_df.head()

Unnamed: 0,Primary_Site,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,TP53,PIK3CA,TTN,APC,CSMD3,...,BRAF,LRP2,FREM2,KMT2D,BTG2,B2M,Race_asian,Race_black or african american,Race_other,Race_white
0,0,0.0,9.602739,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0.0,6.375342,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0.0,16.753426,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0.0,16.038357,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0.0,14.052054,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [135]:
total_samples = len(final_df)
num_classes = final_df['Primary_Site'].nunique()

weight0 = total_samples / (final_df['Primary_Site'].value_counts()[0] * num_classes)
weight1 = total_samples / (final_df['Primary_Site'].value_counts()[1] * num_classes)
weight2 = total_samples / (final_df['Primary_Site'].value_counts()[2] * num_classes)
weight3 = total_samples / (final_df['Primary_Site'].value_counts()[3] * num_classes)
weight4 = total_samples / (final_df['Primary_Site'].value_counts()[4] * num_classes)
weight5 = total_samples / (final_df['Primary_Site'].value_counts()[5] * num_classes)
weight6 = total_samples / (final_df['Primary_Site'].value_counts()[6] * num_classes)
weight7 = total_samples / (final_df['Primary_Site'].value_counts()[7] * num_classes)
weight8 = total_samples / (final_df['Primary_Site'].value_counts()[8] * num_classes)
weight9 = total_samples / (final_df['Primary_Site'].value_counts()[9] * num_classes)

conditions = [
    (final_df['Primary_Site'] == 0),
    (final_df['Primary_Site'] == 1),
    (final_df['Primary_Site'] == 2),
    (final_df['Primary_Site'] == 3),
    (final_df['Primary_Site'] == 4),
    (final_df['Primary_Site'] == 5),
    (final_df['Primary_Site'] == 6),
    (final_df['Primary_Site'] == 7),
    (final_df['Primary_Site'] == 8),
    (final_df['Primary_Site'] == 9),
]

results = [
    weight0,
    weight1,
    weight2,
    weight3,
    weight4,
    weight5,
    weight6,
    weight7,
    weight8,
    weight9,
]

final_df['Weight'] = np.select(conditions, results)

final_df['Weight'].value_counts()

Weight
0.245002    4862
0.966099    1233
1.006081    1184
1.014651    1174
1.165558    1022
1.584043     752
2.406465     495
2.863462     416
2.919608     408
3.254645     366
Name: count, dtype: int64

In [136]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11912 entries, 0 to 14303
Data columns (total 39 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Primary_Site                    11912 non-null  int64  
 1   Deceased_Status                 11912 non-null  float64
 2   Age_At_Diagnosis                11817 non-null  float64
 3   Prior_Malignancy                11912 non-null  float64
 4   Synchronous_Malignancy          11912 non-null  float64
 5   TP53                            11912 non-null  int64  
 6   PIK3CA                          11912 non-null  int64  
 7   TTN                             11912 non-null  int64  
 8   APC                             11912 non-null  int64  
 9   CSMD3                           11912 non-null  int64  
 10  MUC16                           11912 non-null  int64  
 11  KRAS                            11912 non-null  int64  
 12  NRAS                            11912

In [137]:
final_df = reduce_mem_usage(final_df)
final_df['Deceased_Status'] = final_df['Deceased_Status'].astype('int8')
final_df['Prior_Malignancy'] = final_df['Prior_Malignancy'].astype('int8')
final_df['Synchronous_Malignancy'] = final_df['Synchronous_Malignancy'].astype('int8')
final_df.info()

Mem. usage decreased to  0.70 Mb (80.6% reduction)
<class 'pandas.core.frame.DataFrame'>
Index: 11912 entries, 0 to 14303
Data columns (total 39 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Primary_Site                    11912 non-null  int8   
 1   Deceased_Status                 11912 non-null  int8   
 2   Age_At_Diagnosis                11817 non-null  float32
 3   Prior_Malignancy                11912 non-null  int8   
 4   Synchronous_Malignancy          11912 non-null  int8   
 5   TP53                            11912 non-null  int8   
 6   PIK3CA                          11912 non-null  int8   
 7   TTN                             11912 non-null  int8   
 8   APC                             11912 non-null  int8   
 9   CSMD3                           11912 non-null  int8   
 10  MUC16                           11912 non-null  int8   
 11  KRAS                            11912 non-null 

In [138]:
# Writing the dataframe on S3 as a CSV

with io.StringIO() as csv_buffer:
    final_df.to_csv(csv_buffer, index=False)

    response = s3_client.put_object(
        Bucket=AWS_S3_BUCKET, Key="processed/tcga_processed.csv", Body=csv_buffer.getvalue()
    )

    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")

Successful S3 put_object response. Status - 200


In [99]:
# Creating training and testing subfolders and datasets for the S3 bucket
from sklearn.model_selection import train_test_split

In [139]:
train_df, test_df = train_test_split(final_df, test_size=0.1, random_state=42)
print(train_df.shape)
print(test_df.shape)
train_df.head()

(10720, 39)
(1192, 39)


Unnamed: 0,Primary_Site,Deceased_Status,Age_At_Diagnosis,Prior_Malignancy,Synchronous_Malignancy,TP53,PIK3CA,TTN,APC,CSMD3,...,LRP2,FREM2,KMT2D,BTG2,B2M,Race_asian,Race_black or african american,Race_other,Race_white,Weight
5334,2,1,76.449318,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,1,1.006081
14106,0,0,20.676712,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.245002
8320,0,0,2.161644,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.245002
4512,5,0,64.454796,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1.584043
11173,4,1,57.663013,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1.165558


In [140]:
# Writing the train dataframe on S3 as a CSV
train_df = train_df.drop('Weight', axis=1)
with io.StringIO() as csv_buffer:
    train_df.to_csv(csv_buffer, index=False, header=None)

    response = s3_client.put_object(
        Bucket=AWS_S3_BUCKET, Key="train/data.csv", Body=csv_buffer.getvalue()
    )

    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")

Successful S3 put_object response. Status - 200


In [141]:
# Writing the test dataframe on S3 as a CSV
test_df = test_df.drop('Weight', axis=1)
with io.StringIO() as csv_buffer:
    test_df.to_csv(csv_buffer, index=False, header=None)

    response = s3_client.put_object(
        Bucket=AWS_S3_BUCKET, Key="validation/data.csv", Body=csv_buffer.getvalue()
    )

    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")

Successful S3 put_object response. Status - 200


In [142]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10720 entries, 5334 to 8850
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Primary_Site                    10720 non-null  int8   
 1   Deceased_Status                 10720 non-null  int8   
 2   Age_At_Diagnosis                10635 non-null  float32
 3   Prior_Malignancy                10720 non-null  int8   
 4   Synchronous_Malignancy          10720 non-null  int8   
 5   TP53                            10720 non-null  int8   
 6   PIK3CA                          10720 non-null  int8   
 7   TTN                             10720 non-null  int8   
 8   APC                             10720 non-null  int8   
 9   CSMD3                           10720 non-null  int8   
 10  MUC16                           10720 non-null  int8   
 11  KRAS                            10720 non-null  int8   
 12  NRAS                            107

In [143]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1192 entries, 1581 to 11643
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Primary_Site                    1192 non-null   int8   
 1   Deceased_Status                 1192 non-null   int8   
 2   Age_At_Diagnosis                1182 non-null   float32
 3   Prior_Malignancy                1192 non-null   int8   
 4   Synchronous_Malignancy          1192 non-null   int8   
 5   TP53                            1192 non-null   int8   
 6   PIK3CA                          1192 non-null   int8   
 7   TTN                             1192 non-null   int8   
 8   APC                             1192 non-null   int8   
 9   CSMD3                           1192 non-null   int8   
 10  MUC16                           1192 non-null   int8   
 11  KRAS                            1192 non-null   int8   
 12  NRAS                            119