In [103]:
# Initialize constants
import os
import pandas as pd
import numpy as np
clinical = pd.read_csv('clinical.csv')
somatic = pd.read_csv('somatic.csv')

In [104]:
# 1. Number of rows and columns for CLINICAL dataset
print("clinical.csv:\n")

col = clinical.shape[1]
print("columns =", col, "\n")

row = clinical.shape[0]
print("rows =", row)      

clinical.csv:

columns = 7 

rows = 5


In [105]:
# 2. Number of patients.
patients = clinical.patient
print(patients, "\n")
print("patients =", len(clinical.patient), "\n")

print("Unqiue patients =", clinical['patient'].nunique())

0    P2938
1    P2829
2    P2885
3    P2731
4    P2740
Name: patient, dtype: object 

patients = 5 

Unqiue patients = 5


In [106]:
# 3. Unique races and number of patients of a race
clinical.loc[:, ["race"]] # : iterates through all rows, "race" pulls only race column

Unnamed: 0,race
0,white
1,white
2,black or african american
3,black or african american
4,white


In [107]:
# sort by race, shows number of patients per race
race = clinical.groupby("race").size()

print(race)

race
black or african american    2
white                        3
dtype: int64


In [108]:
print("Unique 'races':")
clinical['race'].nunique()

Unique 'races':


2

In [109]:
# 4. Unique diagnosis and number of patients with a specific diagnosis
clinical.loc[:, ['diagnosis']]

Unnamed: 0,diagnosis
0,Breast Invasive Carcinoma
1,Breast Invasive Carcinoma
2,Breast Invasive Carcinoma
3,Normal
4,Normal


In [110]:
# sort by diagnosis, shows number of patients per diagnosis
dx = clinical.groupby('diagnosis').size()

print(dx)

diagnosis
Breast Invasive Carcinoma    3
Normal                       2
dtype: int64


In [111]:
# shows unique diagnoses by getting the shape of the above groupby list
print("Unique 'diagnoses':")

dx.shape[0]

Unique 'diagnoses':


2

In [112]:
# list all patients by calling column "patient"
print("All patients in clinical.csv:")

clinical.patient

All patients in clinical.csv:


0    P2938
1    P2829
2    P2885
3    P2731
4    P2740
Name: patient, dtype: object

In [113]:
# 5. Add an “age” column
age = [30, 49, 78, 66, 18] # arbitrary values for age
clinical['age'] = age # add column to clinical.csv from list

clinical

Unnamed: 0,patient,year_of_birth,age_at_diagnosis,race,tumor_stage,diagnosis,subtype,age
0,P2938,1959,20880,white,stage iii,Breast Invasive Carcinoma,LumA,30
1,P2829,1953,22693,white,stage iib,Breast Invasive Carcinoma,LumA,49
2,P2885,1967,18689,black or african american,stage i,Breast Invasive Carcinoma,Basal,78
3,P2731,1960,17275,black or african american,,Normal,,66
4,P2740,1972,16282,white,,Normal,,18


In [114]:
# 6. Convert “age_at_diagnosis” from days to years and update the values of this column

clinical["age_at_diagnosis"] = (clinical["age_at_diagnosis"] / 365.25) 
# code after the '=' divides each value in column 'age_at_diagnosis' by 365.25 (0.25 accounts for leap year)
# code before the '=' updates the column with the new calculated values
print("\nNew clinical.csv with updated age_at_diagnosis values:")
clinical


New clinical.csv with updated age_at_diagnosis values:


Unnamed: 0,patient,year_of_birth,age_at_diagnosis,race,tumor_stage,diagnosis,subtype,age
0,P2938,1959,57.166324,white,stage iii,Breast Invasive Carcinoma,LumA,30
1,P2829,1953,62.130048,white,stage iib,Breast Invasive Carcinoma,LumA,49
2,P2885,1967,51.167693,black or african american,stage i,Breast Invasive Carcinoma,Basal,78
3,P2731,1960,47.296372,black or african american,,Normal,,66
4,P2740,1972,44.577687,white,,Normal,,18


In [115]:
# 7. Average age_at_diagnosis of patients with cancers

# First, isolate the two columns of reference
clinical.loc[0:, ['age_at_diagnosis', 'diagnosis']] # Beginning at row index [0], onwards

Unnamed: 0,age_at_diagnosis,diagnosis
0,57.166324,Breast Invasive Carcinoma
1,62.130048,Breast Invasive Carcinoma
2,51.167693,Breast Invasive Carcinoma
3,47.296372,Normal
4,44.577687,Normal


In [116]:
cancer = 'Breast Invasive Carcinoma' # Initialize variable for our (only) cancer diagnosis

# Initialize a new variable, average_age_cancer
# For each patient with the "cancer" diagnosis established above, save their age
average_age_cancer = clinical.loc[clinical['diagnosis'] == cancer, 'age_at_diagnosis'] 

print("\nAverage age at diagnosis for patients with Breast Invasive Carcinoma:")
average_age_cancer


Average age at diagnosis for patients with Breast Invasive Carcinoma:


0    57.166324
1    62.130048
2    51.167693
Name: age_at_diagnosis, dtype: float64

In [117]:
avg1 = sum(average_age_cancer) / len(average_age_cancer)
avg1

56.82135523613963

In [118]:
# 8. Average age_at_diagnosis of patients without cancers
clinical.loc[0:, ['age_at_diagnosis', 'diagnosis']]

Unnamed: 0,age_at_diagnosis,diagnosis
0,57.166324,Breast Invasive Carcinoma
1,62.130048,Breast Invasive Carcinoma
2,51.167693,Breast Invasive Carcinoma
3,47.296372,Normal
4,44.577687,Normal


In [119]:
normal = 'Normal'
average_age_norm = clinical.loc[clinical['diagnosis'] == normal, 'age_at_diagnosis']
average_age_norm

3    47.296372
4    44.577687
Name: age_at_diagnosis, dtype: float64

In [120]:
avg1 = sum(average_age_norm) / len(average_age_norm)
avg1

45.937029431895965

In [121]:
somatic = pd.DataFrame({
"patient" : ["P2938", "P2938", "P2829", "P2885", "P2885", "P2938", "P2829", "P2938", "P2885", 
                "P2938", "P2938", "P2885", "P2829", "P2938"],
"gene": ["CUL1", "DNMT3A", "EIF5B", "EIF5B", "FOXP4", "HDGF", "HSPA13", "PIK3CB", "RBM44", 
                 "RBM44", "ROBO2", "TP53", "TP53", "TP53"],
"variant_class": ["Missense", "Frame_Shift_Del", "Missense", "Nonsense", "Frame_Shift", "Silent", 
                    "Missense", "Missense", "Missense", "Frame_Shift", "Missense", "Missense", "Missense", "Missense"],
"ref_bp":["G", "T", "T", "C", "-", "C", "T", "C", "G", "-","G","G","T","T"],
"alt_bp":["T", "-", "A", "A", "A", "T", "C", "T", "T","T","A","A","C", "C"],
"prot_pos":[241, 315, 40, 40, 306, 188, 462, 68, 203, 204, 33, 193, 280, 179],
"ref_aa":["E", "M", "S", "S", "H", "E", "N", "M", "G", "L", "G","H","R","H"],
"alt_aa":["D", "X", "T", "*", "QX", "", "S", "I", "V","LX","R","Y","G", "R"]
})
somatic

Unnamed: 0,patient,gene,variant_class,ref_bp,alt_bp,prot_pos,ref_aa,alt_aa
0,P2938,CUL1,Missense,G,T,241,E,D
1,P2938,DNMT3A,Frame_Shift_Del,T,-,315,M,X
2,P2829,EIF5B,Missense,T,A,40,S,T
3,P2885,EIF5B,Nonsense,C,A,40,S,*
4,P2885,FOXP4,Frame_Shift,-,A,306,H,QX
5,P2938,HDGF,Silent,C,T,188,E,
6,P2829,HSPA13,Missense,T,C,462,N,S
7,P2938,PIK3CB,Missense,C,T,68,M,I
8,P2885,RBM44,Missense,G,T,203,G,V
9,P2938,RBM44,Frame_Shift,-,T,204,L,LX


In [122]:
# 1. Number of rows and columns.
print("rows =", somatic.shape[0])
print("columns =", somatic.shape[1])

rows = 14
columns = 8


In [123]:
# 2. Number of patients.
print(f"\nNumber of unique patients = {somatic['patient'].nunique()}")


Number of unique patients = 3


In [124]:
# Print unique patients and values/instances for each

somatic.groupby('patient').size()

patient
P2829    3
P2885    4
P2938    7
dtype: int64

In [125]:
# 3. Number of unique genes.
print(f"Number of unique genes: {somatic['gene'].nunique()}")

Number of unique genes: 10


In [126]:
# Print unique genes and number of values for each
geneUniq = somatic.groupby('gene').size()
print(geneUniq)

gene
CUL1      1
DNMT3A    1
EIF5B     2
FOXP4     1
HDGF      1
HSPA13    1
PIK3CB    1
RBM44     2
ROBO2     1
TP53      3
dtype: int64


In [127]:
# 4. Which gene is mutated the most frequently?
# Method one: .iloc[]
mostUniq = geneUniq.iloc[-1:]
print(f"Most mutated gene:\n {mostUniq}")

Most mutated gene:
 gene
TP53    3
dtype: int64


In [128]:
# Method 2: .mode()
somatic['gene'].mode()

0    TP53
Name: gene, dtype: object

In [129]:
# 5. Number of unique variant classes.
print("Uniqe variant classes:", somatic['variant_class'].nunique())

Uniqe variant classes: 5


In [130]:
# show table of unique variant classes with associated counts
somatic.groupby('variant_class').count()

Unnamed: 0_level_0,patient,gene,ref_bp,alt_bp,prot_pos,ref_aa,alt_aa
variant_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Frame_Shift,2,2,2,2,2,2,2
Frame_Shift_Del,1,1,1,1,1,1,1
Missense,9,9,9,9,9,9,9
Nonsense,1,1,1,1,1,1,1
Silent,1,1,1,1,1,1,1


In [131]:
# Straight from workbook, no unique contribution, still insightful
# shows table of unique variant class, sorted by count
varUniq = somatic[['variant_class', 'patient']].groupby('variant_class').count().sort_values('patient', ascending=False)
varUniq

Unnamed: 0_level_0,patient
variant_class,Unnamed: 1_level_1
Missense,9
Frame_Shift,2
Frame_Shift_Del,1
Nonsense,1
Silent,1


In [132]:
# 6. Which variant class is the most frequently observed?
# Method 1: .max()
print(varUniq.max(), "\n")

# Find instances by .loc()
print(varUniq.loc['Missense'])

patient    9
dtype: int64 

patient    9
Name: Missense, dtype: int64


In [133]:
# Method 2: .iloc()
varUniq.iloc[0,:]

patient    9
Name: Missense, dtype: int64

In [134]:
# 7. Find patients who have more than three somatic mutations. Return the IDs as a set.

# Group by 'patient' and count the number of mutations per patient
somatic_by_patient = somatic.groupby('patient').count()

# Filter patients with more than three somatic mutations
patients_with_more_than_three_mutations = somatic_by_patient[somatic_by_patient['gene'] > 3] # terrible variable name, just awful

# Use the built-in function __builtins__.list to convert the index to a list if 'list' has been overridden
list_of_patient_ids = __builtins__.list(patients_with_more_than_three_mutations.index)

# Alternatively, you can use a list comprehension as a workaround
list_of_patient_ids = [id for id in patients_with_more_than_three_mutations.index]

# Proceed to convert the list to a set to ensure uniqueness if needed
set_of_patient_ids = set(list_of_patient_ids)

print("Patients who have more than three somatic mutations:\n", set_of_patient_ids)

Patients who have more than three somatic mutations:
 {'P2885', 'P2938'}


In [135]:
# 8. Find patients who have no silent mutations using a set.

silent = somatic[somatic['variant_class'] == 'Silent']

silent_patient = set(silent['patient']) # great novel, by the way

all_patient = set(somatic['patient'])
#print(silent_patient)
#print(all_patient)
print(all_patient.difference(silent_patient))

{'P2885', 'P2829'}


In [136]:
# Perform an outer join of “clinical” and “somatic” data on patient IDs.
clinical_somatic = clinical.join(somatic.set_index('patient'), on='patient', how='outer')
clinical_somatic

Unnamed: 0,patient,year_of_birth,age_at_diagnosis,race,tumor_stage,diagnosis,subtype,age,gene,variant_class,ref_bp,alt_bp,prot_pos,ref_aa,alt_aa
0,P2938,1959,57.166324,white,stage iii,Breast Invasive Carcinoma,LumA,30,CUL1,Missense,G,T,241.0,E,D
0,P2938,1959,57.166324,white,stage iii,Breast Invasive Carcinoma,LumA,30,DNMT3A,Frame_Shift_Del,T,-,315.0,M,X
0,P2938,1959,57.166324,white,stage iii,Breast Invasive Carcinoma,LumA,30,HDGF,Silent,C,T,188.0,E,
0,P2938,1959,57.166324,white,stage iii,Breast Invasive Carcinoma,LumA,30,PIK3CB,Missense,C,T,68.0,M,I
0,P2938,1959,57.166324,white,stage iii,Breast Invasive Carcinoma,LumA,30,RBM44,Frame_Shift,-,T,204.0,L,LX
0,P2938,1959,57.166324,white,stage iii,Breast Invasive Carcinoma,LumA,30,ROBO2,Missense,G,A,33.0,G,R
0,P2938,1959,57.166324,white,stage iii,Breast Invasive Carcinoma,LumA,30,TP53,Missense,T,C,179.0,H,R
1,P2829,1953,62.130048,white,stage iib,Breast Invasive Carcinoma,LumA,49,EIF5B,Missense,T,A,40.0,S,T
1,P2829,1953,62.130048,white,stage iib,Breast Invasive Carcinoma,LumA,49,HSPA13,Missense,T,C,462.0,N,S
1,P2829,1953,62.130048,white,stage iib,Breast Invasive Carcinoma,LumA,49,TP53,Missense,T,C,280.0,R,G


In [137]:
# 1. Number of rows of columns.
print("Columns: ",  clinical_somatic.shape[0])
print("Rows:  \t", clinical_somatic.shape[1])

Columns:  16
Rows:  	 15


In [138]:
mutation_by_diagnosis = clinical_somatic[['patient', 'diagnosis', 'gene']].groupby(['diagnosis', 'patient']).count()
mutation_by_diagnosis

Unnamed: 0_level_0,Unnamed: 1_level_0,gene
diagnosis,patient,Unnamed: 2_level_1
Breast Invasive Carcinoma,P2829,3
Breast Invasive Carcinoma,P2885,4
Breast Invasive Carcinoma,P2938,7
Normal,P2731,0
Normal,P2740,0


In [144]:
# 2. Average number of somatic mutations for patients with cancers.
print("Average number of somatic mutations for patients with cancers:\n", 
      mutation_by_diagnosis.loc['Breast Invasive Carcinoma'].mean())

Average number of somatic mutations for patients with cancers:
 gene    4.666667
dtype: float64


In [145]:
# 3. Average number of somatic mutations for patients without cancers.
print("Average number of somatic mutations for patients without cancers:\n", 
      mutation_by_diagnosis.loc['Normal'].mean())

Average number of somatic mutations for patients without cancers:
 gene    0.0
dtype: float64
