In [1]:
#@title Mount Google Drive (You don't need to run this if you are running notebooks on your laptop)

# Mounting Drive only works inside Google Colab. Anywhere else the
# `google.colab` package is missing and the import raises ImportError, which
# would abort "Restart & Run All" on a laptop. Guard the import so the cell is
# a no-op outside Colab; IN_COLAB records which branch was taken.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print("google.colab is not available; skipping Google Drive mount.")
else:
    IN_COLAB = True

    # The following command will prompt a URL for you to click and obtain the
    # authorization code
    drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Directory that holds the notebook's data files.
# On Colab this is a folder inside the mounted Google Drive; when working on a
# laptop, replace it with the directory the files were downloaded to.

from pathlib import Path

DATA: Path = Path("/content/drive/My Drive/Colab Notebooks")

In [10]:
import os

import pandas as pd
from scipy.stats import pearsonr

# Drive folder that holds the input workbook and receives the exported CSV.
# NOTE: this path is spelled out here rather than taken from DATA, so it has to
# be edited separately when the notebook is run outside Colab.
drive_path = '/content/drive/My Drive/Colab Notebooks'

# Load the Excel file into a pandas DataFrame
file_path = os.path.join(drive_path, 'Amino Acid Mutation.xlsx')
df = pd.read_excel(file_path)
os.makedirs(drive_path, exist_ok=True)

# Columns of `df` that are correlated against each rate column below
mutation_columns = [
    'A222V', 'A570D', 'A67V', 'A701V', 'D1118H', 'D215G', 'D405N', 'D614G', 'D796Y', 'D80A',
    'D950N', 'E154K', 'E484A', 'E484K', 'E484Q', 'F490S', 'F496V', 'F888L', 'FR157-158', 'G142D',
    'G339D', 'G446S', 'G496S', 'G75V', 'H655Y', 'H665Y', 'HV69', 'K417N', 'K417T', 'L18F', 'L452Q',
    'L452R', 'L981F', 'LAL242', 'N440K', 'N501Y', 'N679K', 'N764K', 'N856K', 'N969K', 'P681H', 'P681R',
    'Q1071H', 'Q493R', 'Q498R', 'Q52R', 'Q677H', 'Q954H', 'R246I', 'R408S', 'RSYLTPG246', 'S371L', 'S373P',
    'S375F', 'S477N', 'S982A', 'T1027I', 'T19R', 'T376A', 'T478K', 'T547K', 'T716I', 'T76I', 'T859N', 'T95I',
    'V70F', 'W258L', 'Y144', 'Y145H', 'Y505H'
]

rate_columns = ['Positive Rate', 'Hosp Rate', 'Death Rate', 'Anti Rate']

# Create a DataFrame to store p-values (rows: mutations, columns: rates).
# dtype=float keeps the matrix numeric; without it the empty frame is created
# with object dtype and stays that way after the cells are filled in.
p_values_df = pd.DataFrame(index=mutation_columns, columns=rate_columns, dtype=float)

# Calculate the Pearson-correlation p-value for each (mutation, rate) pair
for mutation in mutation_columns:
    for rate in rate_columns:
        r_value, p_value = pearsonr(df[mutation], df[rate])
        p_values_df.at[mutation, rate] = p_value

# Write the index as well: it is the only place the mutation names live, so
# dropping it would leave a file of unlabeled rows.
p_values_df.to_csv(os.path.join(drive_path, 'new_matrix.csv'), index_label='Mutation')

# Display the p-values DataFrame
print("P-values for the relationship between mutation columns and rate columns:")
print(p_values_df)

P-values for the relationship between mutation columns and rate columns:
       Positive Rate Hosp Rate Death Rate Anti Rate
A222V       0.238802  0.223303   0.314215  0.736835
A570D       0.442237  0.159954   0.020326  0.000097
A67V        0.000017  0.000101   0.000043  0.188129
A701V       0.308079  0.258331   0.138942  0.168048
D1118H      0.442237  0.159954   0.020326  0.000097
...              ...       ...        ...       ...
V70F        0.167227   0.12578   0.169169  0.838428
W258L       0.167227   0.12578   0.169169  0.838428
Y144        0.052223  0.011851   0.000272  0.000201
Y145H        0.77508  0.858482   0.990389  0.385911
Y505H       0.708888  0.838932   0.996886  0.124641

[70 rows x 4 columns]


In [14]:
import os

import pandas as pd
from scipy.stats import pearsonr

# Drive folder that holds the input CSV and receives the exported CSV.
# NOTE: this path is spelled out here rather than taken from DATA, so it has to
# be edited separately when the notebook is run outside Colab.
drive_path = '/content/drive/My Drive/Colab Notebooks'

# Load the CSV file into a pandas DataFrame (the file is read as latin1).
# This rebinds `df`, replacing whatever frame an earlier cell stored there.
file_path = os.path.join(drive_path, 'Variant_Tracking_Results.csv')
df = pd.read_csv(file_path, encoding='latin1')
os.makedirs(drive_path, exist_ok=True)

# Columns of `df` that are correlated against each rate column below
variant_columns = ['Alpha (B.1.1.7)', 'Beta (B.1.351)', 'Gamma (P.1)', 'Zeta (P.2)', 'Eta (B.1.525)',
                   'Kappa (B.1.617.1)', 'Delta (B.1.617)', 'Delta (AY.1)', 'Delta (AY.4)', 'Lambda (C.37)',
                   'Mu (B.1.621)', 'Omicron (BA.1)', 'Omicron (BA.2)', 'Omicron (BA.3/BA.4)', 'Other Omicron']

rate_columns = ['Positive Rate', 'Hosp Rate', 'Death Rate', 'Anti Rate']

# Create a DataFrame to store p-values (rows: variants, columns: rates).
# dtype=float keeps the matrix numeric; without it the empty frame is created
# with object dtype and stays that way after the cells are filled in.
p_values_df2 = pd.DataFrame(index=variant_columns, columns=rate_columns, dtype=float)

# Calculate the Pearson-correlation p-value for each (variant, rate) pair
for variant in variant_columns:
    for rate in rate_columns:
        r_value, p_value = pearsonr(df[variant], df[rate])
        p_values_df2.at[variant, rate] = p_value

# Write the index as well: it is the only place the variant names live, so
# dropping it would leave a file of unlabeled rows.
p_values_df2.to_csv(os.path.join(drive_path, 'new_matrix2.csv'), index_label='Variant')

# Display the p-values DataFrame
print("P-values for the relationship between variant columns and rate columns:")
print(p_values_df2)

P-values for the relationship between mutation columns and rate columns:
                    Positive Rate Hosp Rate Death Rate Anti Rate
Alpha (B.1.1.7)          0.728925  0.423104   0.097522  0.003906
Beta (B.1.351)           0.661228  0.390408   0.133032  0.018808
Gamma (P.1)              0.704051  0.881636   0.950025  0.388385
Zeta (P.2)               0.484204  0.383071   0.225937   0.28201
Eta (B.1.525)            0.614851  0.506424   0.328009  0.384789
Kappa (B.1.617.1)        0.621839  0.516311   0.340702  0.404569
Delta (B.1.617)          0.377344  0.298014   0.205045  0.218417
Delta (AY.1)             0.692516   0.58464   0.369026   0.46372
Delta (AY.4)             0.904612  0.888053   0.829596  0.971856
Lambda (C.37)            0.640194  0.559369   0.901511  0.396315
Mu (B.1.621)             0.678461  0.623321   0.757229  0.532961
Omicron (BA.1)           0.406048  0.340529   0.790605  0.376251
Omicron (BA.2)           0.347753  0.284856   0.869452  0.415668
Omicron (BA.3/BA.