In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import csv 

In [2]:
# Sanity check 1: each wild-type sequence must be exactly as long as its mutant.
df = pd.read_csv("mutation_with_sequences.csv")
print(df.head())

# Vectorised length computation. Unlike calling len() inside an iterrows()
# loop, .str.len() returns NaN for a missing sequence instead of raising
# TypeError, so one bad row cannot abort the whole check.
wild_len = df["wild_seq"].str.len()
mut_len = df["mut_seq"].str.len()

# NaN never compares equal, so rows with a missing sequence are flagged too.
is_mismatch = wild_len.ne(mut_len)
wild_bad = wild_len[is_mismatch]
mut_bad = mut_len[is_mismatch]

# Same structure as before: a list of (index, wild_len, mut_len) tuples.
# .tolist() converts to plain Python scalars so the tuples print cleanly.
# Note: if a column contains missing values its lengths are floats (NaN-capable).
mismatch_rows = list(zip(wild_bad.index.tolist(), wild_bad.tolist(), mut_bad.tolist()))

# Report results
if not mismatch_rows:
    print("All wild_seq and mut_seq pairs are of equal L length.")
else:
    print("❌ Found mismatches in the following rows (index, wild_len, mut_len):")
    for item in mismatch_rows:
        print(item)

       Source   Gene                ENST Gene Code           ENST.1  \
0  cBioPortal  BRCA1   ENST00000357654.9    P38398  ENST00000357654   
1  cBioPortal  BRCA2   ENST00000380152.8    P51587  ENST00000380152   
2  cBioPortal   CDH1  ENST00000261769.10    P12830  ENST00000261769   
3  cBioPortal   CDH1  ENST00000261769.10    P12830  ENST00000261769   
4  cBioPortal   CDH1  ENST00000261769.10    P12830  ENST00000261769   

     Gene Name Mutation    Type  \
0  BRCA1_HUMAN   G1788V  Driver   
1  BRCA2_HUMAN   R2336C  Driver   
2  CADH1_HUMAN    D288N  Driver   
3  CADH1_HUMAN    D254Y  Driver   
4  CADH1_HUMAN    R732Q  Driver   

                                            wild_seq  \
0  MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF...   
1  MPIGSKERPTFFEIFKTRCNKADLGPISLNWFEELSSEAPPYNSEP...   
2  MGPWSRSLSALLLLLQVSSWLCQEPEPCHPGFDAESYTFTVPRRHL...   
3  MGPWSRSLSALLLLLQVSSWLCQEPEPCHPGFDAESYTFTVPRRHL...   
4  MGPWSRSLSALLLLLQVSSWLCQEPEPCHPGFDAESYTFTVPRRHL...   

                         

In [3]:
# Sanity check 2: do all wild-type sequences share a single length?

# Length of each wild_seq, using the vectorised string accessor rather than
# .apply(len). A missing sequence would yield NaN here instead of raising.
wild_lengths = df["wild_seq"].str.len()

# Distinct lengths, in order of first appearance (NumPy array)
unique_lengths = wild_lengths.unique()

# Report results
if len(unique_lengths) == 1:
    print(f"✅ All wild_seq entries have the same length: {unique_lengths[0]}")
else:
    # .tolist() converts NumPy scalars to plain Python numbers so the list
    # prints as [189, 198, ...] instead of [np.int64(189), np.int64(198), ...]
    # (the verbose scalar repr introduced in NumPy 2).
    print(f"❌ Found multiple wild_seq lengths: {sorted(unique_lengths.tolist())}")
    print("Number of sequences for each length:")
    print(wild_lengths.value_counts().sort_index())



❌ Found multiple wild_seq lengths: [np.int64(189), np.int64(198), np.int64(393), np.int64(399), np.int64(403), np.int64(413), np.int64(443), np.int64(472), np.int64(510), np.int64(552), np.int64(595), np.int64(661), np.int64(707), np.int64(724), np.int64(727), np.int64(821), np.int64(882), np.int64(1053), np.int64(1068), np.int64(1182), np.int64(1210), np.int64(1255), np.int64(1304), np.int64(1342), np.int64(1401), np.int64(1464), np.int64(1512), np.int64(1516), np.int64(1541), np.int64(1544), np.int64(1863), np.int64(1884), np.int64(1960), np.int64(2073), np.int64(2090), np.int64(2115), np.int64(2285), np.int64(2332), np.int64(2440), np.int64(2442), np.int64(2471), np.int64(2490), np.int64(2549), np.int64(3056), np.int64(3418), np.int64(3664), np.int64(4599), np.int64(4911), np.int64(4983)]
Number of sequences for each length:
wild_seq
189      43
198       2
393     212
399      36
403     130
413       7
443       3
472      36
510       1
552       5
595      24
661       1
707    