<a href="https://colab.research.google.com/github/begumbasovali/-Predicting-Drug-Resistance-in-Mutated-EGFR/blob/main/Bioinformatic_DataPreprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Read the two semicolon-delimited input tables: per-model mutation
# annotations and per-model/per-drug response values.
# NOTE(review): in the displayed head, RevelScore and AMPathogenicity hold
# values such as 279.0 and 3052.0. If these are meant to be 0-1 scores, the
# decimal point may have been lost before this file was written - confirm
# against the upstream data.
mut = pd.read_csv("last_Mutation_data.csv", delimiter=";")
resp = pd.read_csv("last_response_data.csv", delimiter=";")

# Quick sanity report: table dimensions first, then the column names.
print("Mutation shape:", mut.shape)
print("Response shape:", resp.shape)

print("\nMutation columns:", mut.columns.tolist())
print("Response columns:", resp.columns.tolist())

# Preview the first mutation rows.
mut.head()


Mutation shape: (29, 15)
Response shape: (36, 4)

Mutation columns: ['ModelID', 'VariantType', 'VariantInfo', 'ProteinChange', 'HugoSymbol', 'Exon', 'VepImpact', 'Sift', 'Polyphen', 'LikelyLoF', 'RevelScore', 'ProveanPrediction', 'AMClass', 'AMPathogenicity', 'Hotspot']
Response columns: ['Ids', 'Drug Name', 'ModelID', 'Response']


Unnamed: 0,ModelID,VariantType,VariantInfo,ProteinChange,HugoSymbol,Exon,VepImpact,Sift,Polyphen,LikelyLoF,RevelScore,ProveanPrediction,AMClass,AMPathogenicity,Hotspot
0,ACH-000996,SNV,missense_variant,p.K28T,EGFR,1/28,MODERATE,deleterious_low_confidence(0.01),possibly_damaging(0.68),YANLIŞ,279.0,Neutral,likely_benign,3052.0,YANLIŞ
1,ACH-000784,deletion,frameshift_variant,p.Y69MfsTer11,EGFR,2/28,HIGH,,,DOĞRU,,,,,YANLIŞ
2,ACH-000978,SNV,missense_variant,p.N234D,EGFR,6/28,MODERATE,deleterious_low_confidence(0.02),benign(0.02),YANLIŞ,229.0,Neutral,likely_benign,2657.0,YANLIŞ
3,ACH-000955,SNV,missense_variant,p.V292M,EGFR,7/28,MODERATE,deleterious_low_confidence(0),probably_damaging(0.99),YANLIŞ,548.0,Damaging,ambiguous,4597.0,YANLIŞ
4,ACH-000445,SNV,missense_variant,p.V292L,EGFR,7/28,MODERATE,deleterious_low_confidence(0.02),benign(0.377),YANLIŞ,379.0,Neutral,ambiguous,487.0,YANLIŞ


In [5]:
# Build a cleaned response table; the raw `resp` frame is left untouched.
#  - rename columns so they are easier to reference,
#  - trim whitespace around ModelID,
#  - turn decimal commas into dots and parse the values as floats
#    (anything unparseable becomes NaN because of errors="coerce").
resp_clean = (
    resp
    .rename(columns={"Drug Name": "DrugName", "Response": "ResponseValue"})
    .assign(
        ModelID=lambda frame: frame["ModelID"].astype(str).str.strip(),
        ResponseValue=lambda frame: pd.to_numeric(
            frame["ResponseValue"].astype(str).str.replace(",", ".", regex=False),
            errors="coerce",
        ),
    )
)

# Missing-value count per column, then a preview of the cleaned table.
print(resp_clean.isna().sum())
resp_clean.head()


Ids              0
DrugName         0
ModelID          0
ResponseValue    4
dtype: int64


Unnamed: 0,Ids,DrugName,ModelID,ResponseValue
0,BRD:BRD-K42805893-001-04-9,OSIMERTINIB,ACH-000996,0.05103
1,BRD:BRD-K64052750-001-17-5,GEFITINIB,ACH-000996,-0.030484
2,BRD:BRD-K66175015-001-09-0,AFATINIB,ACH-000996,-0.106863
3,BRD:BRD-K70401845-003-09-6,ERLOTINIB,ACH-000996,0.861794
4,BRD:BRD-K42805893-001-04-9,OSIMERTINIB,ACH-000784,-0.407999


In [6]:
# Build a cleaned mutation table; the raw `mut` frame is left untouched.
mut_clean = mut.copy()

# Normalize the join key. The response table trims ModelID before the later
# merge on this column, so the mutation table must be trimmed the same way;
# otherwise stray whitespace silently produces unmatched rows.
# (A missing ModelID becomes the string "nan" here and is turned back into
# NaN by the frame-wide replace below.)
mut_clean["ModelID"] = mut_clean["ModelID"].astype(str).str.strip()

# ProteinChange: trim whitespace and strip the leading "p." prefix.
mut_clean["ProteinChange"] = mut_clean["ProteinChange"].astype(str).str.strip()
mut_clean["ProteinChange_clean"] = mut_clean["ProteinChange"].str.replace("^p\\.", "", regex=True).str.strip()

# astype(str) turns missing values into the literal text "nan"; map those
# placeholders (and empty strings / "None") back to real NaN, then drop rows
# that have no usable mutation.
mut_clean = mut_clean.replace({"": np.nan, "nan": np.nan, "None": np.nan})
mut_clean = mut_clean.dropna(subset=["ProteinChange_clean"]).copy()

# Keep a single row when the same ModelID carries the same mutation twice.
mut_clean = mut_clean.drop_duplicates(subset=["ModelID", "ProteinChange_clean"])

mut_clean.head()


Unnamed: 0,ModelID,VariantType,VariantInfo,ProteinChange,HugoSymbol,Exon,VepImpact,Sift,Polyphen,LikelyLoF,RevelScore,ProveanPrediction,AMClass,AMPathogenicity,Hotspot,ProteinChange_clean
0,ACH-000996,SNV,missense_variant,p.K28T,EGFR,1/28,MODERATE,deleterious_low_confidence(0.01),possibly_damaging(0.68),YANLIŞ,279.0,Neutral,likely_benign,3052.0,YANLIŞ,K28T
1,ACH-000784,deletion,frameshift_variant,p.Y69MfsTer11,EGFR,2/28,HIGH,,,DOĞRU,,,,,YANLIŞ,Y69MfsTer11
2,ACH-000978,SNV,missense_variant,p.N234D,EGFR,6/28,MODERATE,deleterious_low_confidence(0.02),benign(0.02),YANLIŞ,229.0,Neutral,likely_benign,2657.0,YANLIŞ,N234D
3,ACH-000955,SNV,missense_variant,p.V292M,EGFR,7/28,MODERATE,deleterious_low_confidence(0),probably_damaging(0.99),YANLIŞ,548.0,Damaging,ambiguous,4597.0,YANLIŞ,V292M
4,ACH-000445,SNV,missense_variant,p.V292L,EGFR,7/28,MODERATE,deleterious_low_confidence(0.02),benign(0.377),YANLIŞ,379.0,Neutral,ambiguous,487.0,YANLIŞ,V292L


In [9]:
def tr_bool_to_int(x):
    """Convert a Turkish/English boolean-like value to 0 or 1.

    Parameters
    ----------
    x : any scalar
        Text such as "DOĞRU"/"YANLIŞ", "true", "yes", "1", or a bool/number.

    Returns
    -------
    int
        1 when ``x`` is a recognized "true" token or is numerically equal
        to 1 (e.g. ``1``, ``1.0``, ``"1.0"``); 0 for everything else,
        including missing values.
    """
    if pd.isna(x):
        return 0
    s = str(x).strip().lower()
    if s in ("doğru", "dogru", "true", "t", "1", "yes", "y"):
        return 1
    # A 0/1 column that contains missing values is read as float, so a true
    # flag arrives as 1.0 and stringifies to "1.0", which is not in the token
    # list. Fall back to a numeric comparison so such values are not
    # silently mapped to 0.
    try:
        return int(float(s) == 1.0)
    except ValueError:
        return 0

# Derive integer 0/1 flag columns ("Hotspot_i", "LikelyLoF_i") from the
# original text columns. The conversion is skipped for a column that is
# already gone, so re-running this cell does not raise KeyError after the
# originals were dropped on a previous run.
for _flag_col in ("Hotspot", "LikelyLoF"):
    if _flag_col in mut_clean.columns:
        mut_clean[_flag_col + "_i"] = mut_clean[_flag_col].apply(tr_bool_to_int)

# Drop the original text columns to avoid confusion with the 0/1 versions;
# errors="ignore" keeps the drop safe when they were already removed.
mut_clean = mut_clean.drop(columns=["Hotspot", "LikelyLoF"], errors="ignore")

# Check the result.
mut_clean[["ModelID","ProteinChange_clean","Hotspot_i","LikelyLoF_i"]].head(10)


Unnamed: 0,ModelID,ProteinChange_clean,Hotspot_i,LikelyLoF_i
0,ACH-000996,K28T,0,0
1,ACH-000784,Y69MfsTer11,0,1
2,ACH-000978,N234D,0,0
3,ACH-000955,V292M,0,0
4,ACH-000445,V292L,0,0
5,ACH-000805,S306L,0,0
6,ACH-000888,C311F,0,0
7,ACH-000963,Q408R,0,0
8,ACH-000985,Q486E,0,0
9,ACH-000994,R531Ter,0,1


In [10]:
def vep_impact_score(x):
    """Map a VEP impact label to an ordinal score.

    "HIGH" -> 3, "MODERATE" -> 2, "LOW" -> 1 (case-insensitive, surrounding
    whitespace ignored); missing values and any other label -> 0.
    """
    if pd.isna(x):
        return 0
    label = str(x).strip().upper()
    if label == "HIGH":
        return 3
    if label == "MODERATE":
        return 2
    if label == "LOW":
        return 1
    return 0

def sift_score(x):
    """Score a SIFT annotation string.

    Returns 2 when the text contains "deleterious" (case-insensitive, so
    low-confidence variants of the label count too); returns 0 for
    "tolerated", for any other text, and for missing values.
    """
    if pd.isna(x):
        return 0
    return 2 if "deleterious" in str(x).lower() else 0

def polyphen_score(x):
    """Score a PolyPhen annotation string.

    "probably_damaging" -> 2, "possibly_damaging" -> 1 (substring match,
    case-insensitive, checked in that order); "benign", any other text and
    missing values -> 0.
    """
    if pd.isna(x):
        return 0
    text = str(x).lower()
    for marker, score in (("probably_damaging", 2), ("possibly_damaging", 1)):
        if marker in text:
            return score
    return 0

def proteinchange_hint(pc):
    """Heuristic severity hint derived from the protein-change text.

    The text is lower-cased and searched for three markers whose points
    are summed:
      * "fs"           -> +3
      * "*" or "ter"   -> +2
      * "del"          -> +1
    Missing values score 0.

    NOTE(review): because matching is case-insensitive, uppercase residue
    letters that happen to spell a marker (e.g. an inserted "TER") would
    also match - confirm whether that can occur in the input data.
    """
    if pd.isna(pc):
        return 0
    text = str(pc).lower()
    has_frameshift = "fs" in text
    has_stop = ("*" in text) or ("ter" in text)
    has_deletion = "del" in text
    # Booleans multiply as 0/1, so this adds only the matched weights.
    return 3 * has_frameshift + 2 * has_stop + 1 * has_deletion

# Attach the four per-annotation score columns, each computed element-wise
# from its source column by the matching scoring function.
mut_clean = mut_clean.assign(
    VepImpact_s=mut_clean["VepImpact"].apply(vep_impact_score),
    SIFT_s=mut_clean["Sift"].apply(sift_score),
    Polyphen_s=mut_clean["Polyphen"].apply(polyphen_score),
    PC_hint_s=mut_clean["ProteinChange_clean"].apply(proteinchange_hint),
)


In [11]:
# RepScore: weighted sum of the flag and score columns, used to rank the
# mutations of each model (weights: hotspot 10, likely LoF 8, VEP impact 3,
# SIFT 2, PolyPhen 2, protein-change hint 2).
mut_clean["RepScore"] = (
    mut_clean["Hotspot_i"] * 10
    + mut_clean["LikelyLoF_i"] * 8
    + mut_clean["VepImpact_s"] * 3
    + mut_clean["SIFT_s"] * 2
    + mut_clean["Polyphen_s"] * 2
    + mut_clean["PC_hint_s"] * 2
)

# Check: within each ModelID the highest-scoring mutations should come first
# (ties are ordered alphabetically by mutation name).
mut_clean.sort_values(
    by=["ModelID", "RepScore", "ProteinChange_clean"],
    ascending=[True, False, True],
).loc[
    :, ["ModelID", "ProteinChange_clean", "VepImpact", "Hotspot_i", "LikelyLoF_i", "RepScore"]
].head(25)


Unnamed: 0,ModelID,ProteinChange_clean,VepImpact,Hotspot_i,LikelyLoF_i,RepScore
16,ACH-000012,E746_A750del,MODERATE,1,0,18
15,ACH-000030,E746_A750del,MODERATE,1,0,18
14,ACH-000035,E746_A750del,MODERATE,1,0,18
11,ACH-000041,M600V,MODERATE,0,0,6
25,ACH-000278,H1124Q,MODERATE,0,0,6
4,ACH-000445,V292L,MODERATE,0,0,10
10,ACH-000479,G598V,MODERATE,1,0,22
19,ACH-000587,L858R,MODERATE,1,0,24
18,ACH-000587,T790M,MODERATE,1,0,24
28,ACH-000666,Q1159H,MODERATE,0,0,10


In [12]:
# One row per ModelID: its distinct mutations, sorted alphabetically and
# joined with ";" into a single "MutationList" string.
mutation_list = (
    mut_clean
    .groupby("ModelID")["ProteinChange_clean"]
    .apply(lambda names: ";".join(sorted({str(name) for name in names})))
    .reset_index(name="MutationList")
)

mutation_list.head(10)


Unnamed: 0,ModelID,MutationList
0,ACH-000012,E746_A750del
1,ACH-000030,E746_A750del
2,ACH-000035,E746_A750del
3,ACH-000041,M600V
4,ACH-000278,H1124Q
5,ACH-000445,V292L
6,ACH-000479,G598V
7,ACH-000587,L858R;T790M
8,ACH-000666,Q1159H
9,ACH-000784,Y69MfsTer11


In [13]:
# Order the mutations of each model: highest RepScore first, ties broken
# alphabetically by mutation name.
mut_sorted = mut_clean.sort_values(
    by=["ModelID", "RepScore", "ProteinChange_clean"],
    ascending=[True, False, True],
)

# After sorting, the first row of each ModelID group is taken as that
# model's representative mutation.
rep = mut_sorted.groupby("ModelID").head(1).copy()

# Keep the annotation columns of the representative mutation and give the
# derived columns their final names.
rep_summary = rep.loc[
    :,
    [
        "ModelID",
        "ProteinChange_clean",
        "VariantType",
        "VariantInfo",
        "Exon",
        "VepImpact",
        "Sift",
        "Polyphen",
        "RevelScore",
        "ProveanPrediction",
        "AMClass",
        "AMPathogenicity",
        "Hotspot_i",
        "LikelyLoF_i",
        "RepScore",
    ],
].rename(
    columns={
        "ProteinChange_clean": "RepresentativeMutation",
        "Hotspot_i": "Hotspot",
        "LikelyLoF_i": "LikelyLoF",
    }
)

rep_summary.head(10)


Unnamed: 0,ModelID,RepresentativeMutation,VariantType,VariantInfo,Exon,VepImpact,Sift,Polyphen,RevelScore,ProveanPrediction,AMClass,AMPathogenicity,Hotspot,LikelyLoF,RepScore
16,ACH-000012,E746_A750del,deletion,inframe_deletion,19/28,MODERATE,,,,,,,1,0,18
15,ACH-000030,E746_A750del,deletion,inframe_deletion,19/28,MODERATE,,,,,,,1,0,18
14,ACH-000035,E746_A750del,deletion,inframe_deletion,19/28,MODERATE,,,,,,,1,0,18
11,ACH-000041,M600V,SNV,missense_variant,15/28,MODERATE,tolerated_low_confidence(0.49),benign(0.089),18.0,Neutral,likely_benign,921.0,0,0,6
25,ACH-000278,H1124Q,SNV,missense_variant,28/28,MODERATE,tolerated_low_confidence(0.41),benign(0),63.0,Neutral,likely_benign,704.0,0,0,6
4,ACH-000445,V292L,SNV,missense_variant,7/28,MODERATE,deleterious_low_confidence(0.02),benign(0.377),379.0,Neutral,ambiguous,487.0,0,0,10
10,ACH-000479,G598V,SNV,missense_variant,15/28,MODERATE,deleterious_low_confidence(0),possibly_damaging(0.766),61.0,Damaging,likely_pathogenic,838.0,1,0,22
19,ACH-000587,L858R,SNV,missense_variant,21/28,MODERATE,deleterious_low_confidence(0),probably_damaging(0.997),961.0,Damaging,likely_pathogenic,9968.0,1,0,24
28,ACH-000666,Q1159H,SNV,missense_variant,28/28,MODERATE,deleterious_low_confidence(0.04),benign(0.352),15.0,Neutral,likely_benign,1374.0,0,0,10
1,ACH-000784,Y69MfsTer11,deletion,frameshift_variant,2/28,HIGH,,,,,,,0,1,27


In [14]:
# Per-model summary: the full mutation list joined (left join on ModelID)
# with the representative mutation's annotations.
mut_summary = pd.merge(mutation_list, rep_summary, how="left", on="ModelID")

print("mut_summary shape:", mut_summary.shape)
mut_summary.head(10)


mut_summary shape: (24, 16)


Unnamed: 0,ModelID,MutationList,RepresentativeMutation,VariantType,VariantInfo,Exon,VepImpact,Sift,Polyphen,RevelScore,ProveanPrediction,AMClass,AMPathogenicity,Hotspot,LikelyLoF,RepScore
0,ACH-000012,E746_A750del,E746_A750del,deletion,inframe_deletion,19/28,MODERATE,,,,,,,1,0,18
1,ACH-000030,E746_A750del,E746_A750del,deletion,inframe_deletion,19/28,MODERATE,,,,,,,1,0,18
2,ACH-000035,E746_A750del,E746_A750del,deletion,inframe_deletion,19/28,MODERATE,,,,,,,1,0,18
3,ACH-000041,M600V,M600V,SNV,missense_variant,15/28,MODERATE,tolerated_low_confidence(0.49),benign(0.089),18.0,Neutral,likely_benign,921.0,0,0,6
4,ACH-000278,H1124Q,H1124Q,SNV,missense_variant,28/28,MODERATE,tolerated_low_confidence(0.41),benign(0),63.0,Neutral,likely_benign,704.0,0,0,6
5,ACH-000445,V292L,V292L,SNV,missense_variant,7/28,MODERATE,deleterious_low_confidence(0.02),benign(0.377),379.0,Neutral,ambiguous,487.0,0,0,10
6,ACH-000479,G598V,G598V,SNV,missense_variant,15/28,MODERATE,deleterious_low_confidence(0),possibly_damaging(0.766),61.0,Damaging,likely_pathogenic,838.0,1,0,22
7,ACH-000587,L858R;T790M,L858R,SNV,missense_variant,21/28,MODERATE,deleterious_low_confidence(0),probably_damaging(0.997),961.0,Damaging,likely_pathogenic,9968.0,1,0,24
8,ACH-000666,Q1159H,Q1159H,SNV,missense_variant,28/28,MODERATE,deleterious_low_confidence(0.04),benign(0.352),15.0,Neutral,likely_benign,1374.0,0,0,10
9,ACH-000784,Y69MfsTer11,Y69MfsTer11,deletion,frameshift_variant,2/28,HIGH,,,,,,,0,1,27


In [15]:
# Master table: every response row, enriched (left join on ModelID) with
# the mutation summary of its model.
master_v1 = pd.merge(resp_clean, mut_summary, how="left", on="ModelID")

print("master_v1 shape:", master_v1.shape)
master_v1.head(10)


master_v1 shape: (36, 19)


Unnamed: 0,Ids,DrugName,ModelID,ResponseValue,MutationList,RepresentativeMutation,VariantType,VariantInfo,Exon,VepImpact,Sift,Polyphen,RevelScore,ProveanPrediction,AMClass,AMPathogenicity,Hotspot,LikelyLoF,RepScore
0,BRD:BRD-K42805893-001-04-9,OSIMERTINIB,ACH-000996,0.05103,K28T,K28T,SNV,missense_variant,1/28,MODERATE,deleterious_low_confidence(0.01),possibly_damaging(0.68),279.0,Neutral,likely_benign,3052.0,0,0,12
1,BRD:BRD-K64052750-001-17-5,GEFITINIB,ACH-000996,-0.030484,K28T,K28T,SNV,missense_variant,1/28,MODERATE,deleterious_low_confidence(0.01),possibly_damaging(0.68),279.0,Neutral,likely_benign,3052.0,0,0,12
2,BRD:BRD-K66175015-001-09-0,AFATINIB,ACH-000996,-0.106863,K28T,K28T,SNV,missense_variant,1/28,MODERATE,deleterious_low_confidence(0.01),possibly_damaging(0.68),279.0,Neutral,likely_benign,3052.0,0,0,12
3,BRD:BRD-K70401845-003-09-6,ERLOTINIB,ACH-000996,0.861794,K28T,K28T,SNV,missense_variant,1/28,MODERATE,deleterious_low_confidence(0.01),possibly_damaging(0.68),279.0,Neutral,likely_benign,3052.0,0,0,12
4,BRD:BRD-K42805893-001-04-9,OSIMERTINIB,ACH-000784,-0.407999,Y69MfsTer11,Y69MfsTer11,deletion,frameshift_variant,2/28,HIGH,,,,,,,0,1,27
5,BRD:BRD-K64052750-001-17-5,GEFITINIB,ACH-000784,0.779613,Y69MfsTer11,Y69MfsTer11,deletion,frameshift_variant,2/28,HIGH,,,,,,,0,1,27
6,BRD:BRD-K66175015-001-09-0,AFATINIB,ACH-000784,-0.313073,Y69MfsTer11,Y69MfsTer11,deletion,frameshift_variant,2/28,HIGH,,,,,,,0,1,27
7,BRD:BRD-K70401845-003-09-6,ERLOTINIB,ACH-000784,0.346152,Y69MfsTer11,Y69MfsTer11,deletion,frameshift_variant,2/28,HIGH,,,,,,,0,1,27
8,BRD:BRD-K42805893-001-04-9,OSIMERTINIB,ACH-000978,0.080921,N234D,N234D,SNV,missense_variant,6/28,MODERATE,deleterious_low_confidence(0.02),benign(0.02),229.0,Neutral,likely_benign,2657.0,0,0,10
9,BRD:BRD-K64052750-001-17-5,GEFITINIB,ACH-000978,-0.533544,N234D,N234D,SNV,missense_variant,6/28,MODERATE,deleterious_low_confidence(0.02),benign(0.02),229.0,Neutral,likely_benign,2657.0,0,0,10


In [16]:
# Share of response rows that found no representative mutation in the join.
print("Missing RepresentativeMutation ratio:", master_v1["RepresentativeMutation"].isna().mean())

# List the unmatched rows (distinct ModelID / drug / response combinations).
master_v1.loc[
    master_v1["RepresentativeMutation"].isna(),
    ["ModelID", "DrugName", "ResponseValue"],
].drop_duplicates().head(20)


Missing RepresentativeMutation ratio: 0.0


Unnamed: 0,ModelID,DrugName,ResponseValue


In [17]:
# Spot check: number of distinct mutations recorded for model ACH-000996.
mut_clean.loc[mut_clean["ModelID"] == "ACH-000996", "ProteinChange_clean"].nunique()


1

In [18]:
# Spot check: the response rows of model ACH-000996 in the master table.
master_v1.loc[master_v1["ModelID"] == "ACH-000996", ["ModelID", "DrugName", "ResponseValue"]]


Unnamed: 0,ModelID,DrugName,ResponseValue
0,ACH-000996,OSIMERTINIB,0.05103
1,ACH-000996,GEFITINIB,-0.030484
2,ACH-000996,AFATINIB,-0.106863
3,ACH-000996,ERLOTINIB,0.861794


In [19]:
# Number of distinct drugs per model, largest first (top 10).
(
    master_v1
    .groupby("ModelID")["DrugName"]
    .nunique()
    .sort_values(ascending=False)
    .head(10)
)


Unnamed: 0_level_0,DrugName
ModelID,Unnamed: 1_level_1
ACH-000445,4
ACH-000784,4
ACH-000805,4
ACH-000888,4
ACH-000955,4
ACH-000963,4
ACH-000978,4
ACH-000985,4
ACH-000996,4


In [20]:
# Write both result tables to CSV without the index column, confirming
# each file once it has been written.
mut_summary.to_csv(path_or_buf="A_mutation_summary.csv", index=False)
print("Saved: A_mutation_summary.csv")

master_v1.to_csv(path_or_buf="master_v1_response_plus_mutations.csv", index=False)
print("Saved: master_v1_response_plus_mutations.csv")


Saved: A_mutation_summary.csv
Saved: master_v1_response_plus_mutations.csv
