### Python Review Project - Part 2

In [232]:
import pandas as pd
import numpy as np

#### Aman Sharma

###### Importing CCRCC 

In [233]:
import cptac
cptac.download(dataset="Ccrcc")
ccrcc = cptac.Ccrcc()

                                          

###### Getting the necessary data + Stage 1/3 patients

In [234]:
protein_data = ccrcc.get_proteomics()
protein_data.columns = protein_data.columns.get_level_values(0)
rna_data = ccrcc.get_transcriptomics()
rna_data.columns = rna_data.columns.get_level_values(0)
clinical_data = ccrcc.get_clinical()

In [235]:
# Builds a mask that will be helpful for filtering stage 1 and 3 patients
# it'll store the patient ids that fall under this category

stage1and3 = clinical_data[clinical_data['tumor_stage_pathological'].isin(['Stage I', 'Stage III'])]
patient_ids_13 = stage1and3.index

###### Protein data 

In [236]:
protein_data_13 = protein_data.loc[patient_ids_13]

In [237]:
# Gathers stage 1 and stage 3 patients respectively for patient data

protein_data_stage1 = protein_data_13.loc[clinical_data['tumor_stage_pathological'] == 'Stage I']
protein_data_stage3 = protein_data_13.loc[clinical_data['tumor_stage_pathological'] == 'Stage III']

In [238]:
# Gathers the means for stage 1 and stage 3 patients respectively

protein_data_stage1_means = protein_data_stage1.mean()
protein_data_stage3_means = protein_data_stage3.mean()

In [239]:
# Gathers differentiation and prints out the top 5

protein_differential = (protein_data_stage1_means - protein_data_stage3_means).abs()
top5_proteins = protein_differential.nlargest(5).index.tolist()

print("Top 5 differentially expressed proteins: ", top5_proteins)

Top 5 differentially expressed proteins:  ['LDB3', 'BTBD7', 'GDF6', 'COX4I2', 'SNCB']


###### RNA data

In [240]:
# Gathers the filtered rna data and log scales the rna data

rna_data_13 = rna_data.loc[patient_ids_13]
rna_data_13 = np.log2(rna_data_13 + 1)

In [241]:
rna_data_stage1 = rna_data_13.loc[clinical_data['tumor_stage_pathological'] == 'Stage I']
rna_data_stage3 = rna_data_13.loc[clinical_data['tumor_stage_pathological'] == 'Stage III']

In [242]:
rna_data_stage1_means = rna_data_stage1.mean()
rna_data_stage3_means = rna_data_stage3.mean()

In [243]:
rna_differential = (rna_data_stage1_means - rna_data_stage3_means).abs()
top5_rna = rna_differential.nlargest(5).index.tolist()

print("Top 4 differentially expressed RNA: ", top5_rna)

Top 4 differentially expressed RNA:  ['SAA1', 'FGG', 'IL20RB', 'FGA', 'DPEP1']


###### Creation of final dataframe + stage column

In [244]:
# Mask creation and getting columns from the protein_data and rna_data
# The columns we get are the top 5 protein/rna values
stage1and3 = clinical_data[clinical_data['tumor_stage_pathological'].isin(['Stage I', 'Stage III'])]

protein_data_final = protein_data[top5_proteins]
rna_data_final = rna_data[top5_rna]

# Concatenation of protein and rna data along with imputation using median for NaN values
combined_features = pd.concat([protein_data_final, rna_data_final], axis = 1)
combined_features.fillna(combined_features.median(), inplace = True)
# Adding the final_data columns and the stage column
final_data = combined_features.join(clinical_data['tumor_stage_pathological'])
final_data.rename(columns = {'tumor_stage_pathological': 'Stage'}, inplace = True)

# I know we are supposed to use lists, but I personally wanted everything to be together
# for organization's case.

###### Encoding and scaling of features

In [245]:
final_data

Unnamed: 0_level_0,LDB3,LDB3,LDB3,BTBD7,GDF6,COX4I2,SNCB,SAA1,FGG,IL20RB,FGA,DPEP1,Stage
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
C3L-00004,-0.077437,-0.077437,-0.374673,-1.823736,-0.051074,-0.079757,0.250234,1.950050,0.648374,6.434866,2.808154,0.423428,Stage III
C3L-00010,-0.077437,-0.077437,-0.802613,-1.823736,-0.051074,-0.079757,0.250234,1.468143,0.033347,1.314524,0.124209,22.874116,Stage I
C3L-00011,-0.077437,-0.077437,-1.215860,-1.823736,-0.051074,-0.079757,0.250234,12.636296,150.929169,40.215020,49.302256,0.257918,Stage IV
C3L-00026,-0.077437,-0.077437,-0.002303,-1.823736,-0.051074,-0.079757,0.250234,0.271053,0.184699,2.203213,0.209662,24.854416,Stage I
C3L-00079,-0.077437,-0.077437,-1.696459,-1.978903,1.831929,-0.079757,0.250234,7.418851,0.165071,1.357986,0.065876,1.297931,Stage III
...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646.N,-0.077437,0.803529,0.897684,-1.823736,-0.051074,-0.079757,0.250234,0.401329,0.064823,0.449951,0.232824,87.758048,
C3N-01648.N,-0.077437,-0.077437,1.201632,-1.823736,-0.051074,-0.079757,0.250234,1.882143,0.386614,0.642226,0.514296,268.328861,
C3N-01649.N,-0.077437,-0.077437,0.181077,-1.823736,-0.051074,-0.079757,0.250234,3.726070,0.039061,0.227620,0.983810,200.675947,
C3N-01651.N,-0.077437,-0.077437,0.376318,-1.823736,-0.051074,-0.362112,0.250234,1.636172,0.630656,0.444703,2.023031,390.729888,


In [246]:
from sklearn.preprocessing import StandardScaler

numerical_features = final_data.columns.drop('Stage')
scaler = StandardScaler()
final_data[numerical_features] = scaler.fit_transform(final_data[numerical_features])

if 'Stage' in final_data.columns:
    most_common_stage = final_data['Stage'].mode()[0]
    final_data['Stage'].fillna(most_common_stage, inplace=True)

final_data.dropna(subset=['Stage'], inplace=True)

#final_data = pd.get_dummies(final_data, columns = ['Stage'])

###### Splitting data

In [247]:
from sklearn.model_selection import train_test_split

X = final_data.drop('Stage', axis = 1)
y = final_data['Stage']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 42)

###### ML Models

In [248]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import warnings

warnings.filterwarnings("ignore")

models = {
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "MLPClassifier": MLPClassifier(max_iter = 1000), 
    "GaussianNB": GaussianNB()
}

accuracies = {model: [] for model in models}

for _ in range(10):
    for model_name, model in models.items():
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies[model_name].append(accuracy)

mean_accuracies = {model: np.mean(accuracies[model]) for model in models}

for model, accuracy in mean_accuracies.items():
    print(f"{model}: {accuracy}")

KNeighborsClassifier: 0.7050847457627119
DecisionTreeClassifier: 0.6372881355932203
MLPClassifier: 0.6610169491525424
GaussianNB: 0.43728813559322033


#### Based on the results, the KNN model performed the best