__This script is mainly for pre-processing column "Variation"__

## Input preparation

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
## Choose which dataset to clean: "training" or "test".
## The input path and the output path are looked up together, so they cannot
## get out of sync the way hand-commented lines can.
DATASET = "training"

DATASET_FILES = {
    "training": ("training_variants", "training_variants_cleaned.csv"),
    "test": ("test_variants", "test_variants_cleaned.csv"),
}
input_file, final_file = DATASET_FILES[DATASET]

training_set = pd.read_csv(input_file)

In [4]:
## Work on an explicit copy: later cells add and overwrite columns on this frame,
## and assigning into a plain column selection raises pandas' SettingWithCopyWarning.
training_variation = training_set[["ID","Variation"]].copy()

Unnamed: 0,ID,Variation
0,0,Truncating Mutations
1,1,W802*
2,2,Q249E
3,3,N454D
4,4,L399V
...,...,...
95,95,S387Y
96,96,TGFBR1*6A
97,97,R537P
98,98,D404G


## Categorize types of variation information

- __Standard variation__ in form a.a+position+a.a. If there is * instead of a.a, then consider it as frameshift mutation.
- SNP: replaced.
- __fusion__: chromosomal translocation/ interstitial deletion/ chromosomal inversion. Taking overall average of substitution table. Adding 1 column to notice a fusion mutation.
- __fs__ : frameshift caused by an indel (deletion/insertion) that changes the reading frame. The number of inserted or deleted bases varies, so take the average of the substitution scores for deletion and insertion. Adding 1 column to flag a frameshift mutation.
- __truncating mutation__: means shortening reading frame --> a type of frameshift mutation
- __overexpression__: gene expression is higher than the normal level. Adding 1 column to flag an overexpression mutation.
- __splice__: indel/substitution mutation at splice site of pre-mature mRNA. Adding 1 column to notice a splice mutation.

![variation_categor.png](variation_categor.png)

In [4]:
## Load the BLOSUM62 substitution matrix
## A stop codon "*" paired with any amino acid is scored -4
## NOTE(review): recent Biopython releases no longer ship Bio.SubsMat
## (Bio.Align.substitution_matrices replaces it) -- confirm the pinned version.
from Bio.SubsMat import MatrixInfo
blosum = MatrixInfo.blosum62

## Average over every off-diagonal entry (pairs of two different residues),
## plus 20 extra entries of -4 for the (amino acid, "*") pairs
off_diagonal = [blosum[pair] for pair in blosum if pair[0] != pair[1]]
count = len(off_diagonal)
avg_score = round((sum(off_diagonal) + 20*(-4)) / (count + 20), 0)
print(avg_score)
blosum[('W', 'F')]

-2.0




1

## Known amino acid change

Create column __Score__

In [5]:
## Keep the untouched name; "Variation" itself is normalised below
training_variation["Variation_old"] = training_variation["Variation"].copy()

## Clean variation name in standard form but with "del" instead of "*"
## (raw strings: "\w" and "\d" are regex escapes, not Python string escapes)
del_pattern = re.compile(r"^[*\w]\d+del$", re.IGNORECASE)
training_variation["Del"] = training_variation["Variation"].str.match(del_pattern)
is_del = training_variation["Del"] == True
training_variation.loc[is_del, "Variation"] = (
    training_variation.loc[is_del, "Variation"]
    .str.replace('del$', '*', regex=True, flags=re.IGNORECASE)
)


## Detect variation with standard name: one character, digits, one character
stand_pattern = re.compile(r"^[*\w]\d+[\w*]$", re.IGNORECASE)
training_variation["Standard"] = training_variation["Variation"].str.match(stand_pattern)
is_standard = training_variation["Standard"] == True
standard_names = training_variation.loc[is_standard, "Variation"]
training_variation.loc[is_standard, "Standard-last"] = standard_names.str.slice(start=-1)
training_variation.loc[is_standard, "Standard-first"] = standard_names.str.slice(stop=1)
## Tuple order is (last character, first character)
training_variation["Standard-tuple"] = list(zip(training_variation["Standard-last"]
                                                , training_variation["Standard-first"]))

def map_blosum(x, y, matrix=None):
    """Return the substitution score for the residue pair (x, y).

    Parameters
    ----------
    x, y : str
        Single-character residue codes. A digit or "*" in either slot is
        scored as a stop/deletion.
    matrix : dict, optional
        Mapping from (residue, residue) tuples to scores. Defaults to the
        notebook-level ``blosum`` matrix.

    Returns
    -------
    The score stored in the matrix for the pair, looked up in either key
    order, or -4 when either residue is "*" or a digit.

    Raises
    ------
    KeyError
        If the pair is absent from the matrix in both orders.
    """
    # A digit in a residue slot is handled like "*"
    if x.isnumeric():
        x = "*"
    if y.isnumeric():
        y = "*"
    if x == "*" or y == "*":
        return -4

    # Resolve the default lazily so the "*" shortcut never needs the matrix
    if matrix is None:
        matrix = blosum

    # The detection regexes are case-insensitive, so lowercase residue codes
    # can reach this point; the lookups below use the uppercased codes.
    x, y = x.upper(), y.upper()
    try:
        return matrix[(x, y)]
    except KeyError:
        # Retry with the pair reversed before giving up
        return matrix[(y, x)]

## Score every standard-form variation with map_blosum
standard_rows = training_variation['Standard'].eq(True)
standard_pairs = training_variation.loc[standard_rows, "Standard-tuple"]
training_variation.loc[standard_rows, "Score"] = standard_pairs.apply(lambda pair: map_blosum(*pair))

## Known_del: a standard-form variation with "*" as its first or last character
training_variation.loc[standard_rows, "Known_del"] = (
    training_variation.loc[standard_rows, "Standard-tuple"].apply(lambda pair: "*" in pair)
)



## Delete unused columns
helper_columns = ["Del", "Standard-tuple", "Standard-last", "Standard-first"]
training_variation.drop(columns=helper_columns, inplace=True)

training_variation.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,ID,Variation,Variation_old,Standard,Score,Known_del
0,0,Truncating Mutations,Truncating Mutations,False,,
1,1,W802*,W802*,True,-4.0,True
2,2,Q249E,Q249E,True,2.0,False
3,3,N454D,N454D,True,1.0,False
4,4,L399V,L399V,True,1.0,False


## Unknown amino acid change

### Fusion

In [6]:
## Flag every name that mentions "fusion"/"fusions" (case-insensitive)
fusion_pattern = re.compile("^(.*)(Fusion|Fusions)(.*)$", re.IGNORECASE)
fusion_hits = training_variation["Variation"].str.match(fusion_pattern)
training_variation["Fusion"] = fusion_hits
## -2 equals the rounded average substitution score printed by the BLOSUM62 cell
training_variation.loc[training_variation["Fusion"].eq(True), "Score"] = -2

### Splice

In [7]:
## Flag every name that mentions "splice" (case-insensitive)
splice_pattern = re.compile("^(.*)splice(.*)$", re.IGNORECASE)
splice_hits = training_variation["Variation"].str.match(splice_pattern)
training_variation["Splice"] = splice_hits
## -2 equals the rounded average substitution score printed by the BLOSUM62 cell
training_variation.loc[training_variation["Splice"].eq(True), "Score"] = -2

### Overexpression/underexpression

In [8]:
## Keywords that mark an expression-level change (case-insensitive)
expression_regex = "^(.*)(expression|amplif|copy number|copies number)(.*)$"
expression_pattern = re.compile(expression_regex, re.IGNORECASE)
expression_hits = training_variation["Variation"].str.match(expression_pattern)
training_variation["Expression"] = expression_hits

In [9]:
## Names ending in "<digit>dup" (case-insensitive) are duplications.
## Raw string: "\d" is a regex escape, not a Python string escape.
duplicate_pattern = re.compile(r"(.*)\ddup$", re.IGNORECASE)
training_variation["Duplicate"] = training_variation["Variation"].str.match(duplicate_pattern)
## Duplications are counted as expression changes
training_variation.loc[training_variation["Duplicate"] == True, "Expression"] = True

In [10]:
## Expression changes (including duplications flagged above) get the score -4
expression_rows = training_variation["Expression"].eq(True)
training_variation.loc[expression_rows, "Score"] = -4

### Frameshift

In [11]:
## Explicit frameshift annotations, e.g. "...fs" optionally followed by "*" and digits.
## Stored under its own name so the expression pattern defined earlier is not
## overwritten; raw string because "\w" and "\d" are regex escapes.
## NOTE(review): "[\w+]" and "[\d+]" are character classes (one word char or "+",
## one digit or "+"), not repetitions -- confirm this is the intended pattern.
frameshift_pattern = re.compile(r"^\w+\d+[\w+]fs[*]*[\d+]*$", re.IGNORECASE)
training_variation["Frameshift"] = training_variation["Variation"].str.match(frameshift_pattern)
training_variation.loc[training_variation["Frameshift"]==True, "Score"] = -4

#### Unknown truncating

In [12]:
## Flag names that mention a truncation (case-insensitive) and score them -4
truncate_pattern = re.compile("^(.*)(truncat|trunc)(.*)$", re.IGNORECASE)
truncate_hits = training_variation["Variation"].str.match(truncate_pattern)
training_variation["Truncate"] = truncate_hits
training_variation.loc[training_variation["Truncate"].eq(True), "Score"] = -4

#### Unknown deletion

In [13]:
## Flag names that contain a deletion keyword (case-insensitive) and score them -4.
## "del" is matched anywhere in the name, so it also matches inside other words
## (e.g. "KDELR2-ROS1 Fusion"); the clean-up cells further down handle those rows.
deletion_pattern = re.compile("^(.*)(delete|del|deltion)(.*)$", re.IGNORECASE)
deletion_hits = training_variation["Variation"].str.match(deletion_pattern)
training_variation["Unknown_deletion"] = deletion_hits
training_variation.loc[training_variation["Unknown_deletion"].eq(True), "Score"] = -4

#### Unknown insertion

In [14]:
## Flag names that contain an insertion keyword (case-insensitive) and score them -4
insertion_pattern = re.compile("^(.*)(insert|ins)(.*)$", re.IGNORECASE)
insertion_hits = training_variation["Variation"].str.match(insertion_pattern)
training_variation["Unknown_insertion"] = insertion_hits
training_variation.loc[training_variation["Unknown_insertion"].eq(True), "Score"] = -4

#### Final setup for frameshift

In [15]:
## A row counts as a frameshift when any of these flags is True
## (Known_del is missing for non-standard rows; missing compares as not True)
frameshift_sources = ["Truncate", "Unknown_deletion", "Unknown_insertion", "Known_del"]
implies_frameshift = training_variation[frameshift_sources].eq(True).any(axis=1)
training_variation.loc[implies_frameshift, "Frameshift"] = True

In [16]:
## List the columns present after all categorisation steps
training_variation.columns

Index(['ID', 'Variation', 'Variation_old', 'Standard', 'Score', 'Known_del',
       'Fusion', 'Splice', 'Expression', 'Duplicate', 'Frameshift', 'Truncate',
       'Unknown_deletion', 'Unknown_insertion'],
      dtype='object')

## Other mutations

Assign the average substitution score (-2) to mutations that fall into none of the categories above and still have no score.

In [17]:
## Rows that belong to no category and still have no score get -2,
## the rounded average substitution score printed by the BLOSUM62 cell
category_flags = ["Standard", "Fusion", "Splice", "Expression", "Frameshift"]
in_no_category = training_variation[category_flags].eq(False).all(axis=1)
still_unscored = training_variation["Score"].isnull()
training_variation.loc[in_no_category & still_unscored, "Score"] = -2

## Check

### Combination of mutation types

In [18]:
## Show each distinct combination of category flags that occurs in the data
flag_columns = [
    'Standard', 'Known_del',
    'Fusion', 'Splice', 'Expression', 'Frameshift', 'Truncate',
    'Unknown_deletion', 'Unknown_insertion',
]
training_variation[flag_columns].drop_duplicates()

Unnamed: 0,Standard,Known_del,Fusion,Splice,Expression,Frameshift,Truncate,Unknown_deletion,Unknown_insertion
0,False,,False,False,False,True,True,False,False
1,True,True,False,False,False,True,False,False,False
2,True,False,False,False,False,False,False,False,False
7,False,,False,False,False,True,False,True,False
31,False,,False,False,False,False,False,False,False
33,False,,False,False,True,False,False,False,False
72,False,,False,False,False,True,False,False,False
138,False,,False,False,False,True,False,True,True
146,False,,False,False,False,True,False,False,True
164,False,,True,False,False,False,False,False,False


In [19]:
## Inspect rows flagged as both Fusion and Frameshift
fusion_and_frameshift = training_variation["Fusion"].eq(True) & training_variation["Frameshift"].eq(True)
training_variation.loc[fusion_and_frameshift]

Unnamed: 0,ID,Variation,Variation_old,Standard,Score,Known_del,Fusion,Splice,Expression,Duplicate,Frameshift,Truncate,Unknown_deletion,Unknown_insertion
326,326,KDELR2-ROS1 Fusion,KDELR2-ROS1 Fusion,False,-4.0,,True,False,False,False,True,False,True,False
3223,3223,Delta-NTRK1 Fusion,Delta-NTRK1 Fusion,False,-4.0,,True,False,False,False,True,False,True,False


#### Clean unexpected combination

- Fusion == Frameshift : then choose Fusion (score=-2)
- Splice == Frameshift : then choose Splice (score=-2)
- Fusion == Expression : then choose Expression
- Splice == Expression : then choose Expression

In [20]:
## Fusion == Frameshift : then choose Fusion (score=-2)
## Select the conflicting rows once, before either column is modified
fusion_frameshift_conflict = (
    training_variation["Fusion"].eq(True) & training_variation["Frameshift"].eq(True)
)
## Keep the Fusion interpretation: fusion score, Frameshift flag cleared
training_variation.loc[fusion_frameshift_conflict, "Score"] = -2
training_variation.loc[fusion_frameshift_conflict, "Frameshift"] = False

In [21]:
## Splice == Frameshift : then choose Splice (score=-2)
## Select the conflicting rows once, before either column is modified
splice_frameshift_conflict = (
    training_variation["Splice"].eq(True) & training_variation["Frameshift"].eq(True)
)
## Keep the Splice interpretation: splice score, Frameshift flag cleared
training_variation.loc[splice_frameshift_conflict, "Score"] = -2
training_variation.loc[splice_frameshift_conflict, "Frameshift"] = False

In [22]:
## Fusion == Expression : then choose Expression
## Only the Fusion flag is cleared; Score is left as earlier cells assigned it
fusion_expression_conflict = (
    training_variation["Fusion"].eq(True) & training_variation["Expression"].eq(True)
)
training_variation.loc[fusion_expression_conflict, "Fusion"] = False

In [23]:
## Splice == Expression : then choose Expression
## Only the Splice flag is cleared; Score is left as earlier cells assigned it
splice_expression_conflict = (
    training_variation["Splice"].eq(True) & training_variation["Expression"].eq(True)
)
training_variation.loc[splice_expression_conflict, "Splice"] = False

### Null score

In [24]:
## Rows that still have no score: dump them to check.csv and display them
unscored_rows = training_variation.loc[training_variation["Score"].isnull()]
unscored_rows.to_csv("check.csv")
unscored_rows

Unnamed: 0,ID,Variation,Variation_old,Standard,Score,Known_del,Fusion,Splice,Expression,Duplicate,Frameshift,Truncate,Unknown_deletion,Unknown_insertion


### Data description

In [25]:
## Summary statistics for every column (numeric and non-numeric)
training_variation.describe(include="all")

Unnamed: 0,ID,Variation,Variation_old,Standard,Score,Known_del,Fusion,Splice,Expression,Duplicate,Frameshift,Truncate,Unknown_deletion,Unknown_insertion
count,3321.0,3321,3321,3321,3321.0,2675,3321,3321,3321,3321,3321,3321,3321,3321
unique,,2996,2996,2,,2,2,2,2,2,2,2,2,2
top,,Truncating Mutations,Truncating Mutations,True,,False,False,False,False,False,False,False,False,False
freq,,93,93,2675,,2593,3139,3309,3231,3309,2904,3222,3144,3246
mean,1660.0,,,,-1.376995,,,,,,,,,
std,958.834449,,,,1.812108,,,,,,,,,
min,0.0,,,,-4.0,,,,,,,,,
25%,830.0,,,,-3.0,,,,,,,,,
50%,1660.0,,,,-2.0,,,,,,,,,
75%,2490.0,,,,0.0,,,,,,,,,


## Final training data

In [26]:
## Export the original variation name, its score and the four category flags
export_columns = ["ID", "Variation_old", "Score", "Fusion", "Splice", "Expression", "Frameshift"]
training_variation[export_columns].to_csv(final_file, index=False)