In [5]:
import os, math
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Load the variant tables with read_csv's default (comma) separator.
train_variants_df = pd.read_csv("./data/training_variants")
test_variants_df = pd.read_csv("./data/test_variants")

# The text files are split on a literal "||". The separator is a regular
# expression, so it is written as a raw string: in a normal string literal
# "\|" is an invalid escape sequence and triggers a warning on modern Python.
# A regex separator requires the Python parsing engine. The first line of each
# file is skipped and the column names are supplied explicitly.
text_read_options = dict(sep=r"\|\|", engine="python", skiprows=1, names=["ID", "Text"])
train_text_df = pd.read_csv("./data/training_text", **text_read_options)
test_text_df = pd.read_csv("./data/test_text", **text_read_options)

# Report (rows, columns) for every table, labels padded to a common width.
for label, frame in [
    ("Train Variant", train_variants_df),
    ("Train Text", train_text_df),
    ("Test Variant", test_variants_df),
    ("Test Text", test_text_df),
]:
    print(label.ljust(15), frame.shape)

Train Variant   (3321, 4)
Train Text      (3321, 2)
Test Variant    (5668, 3)
Test Text       (5668, 2)


In [7]:
# Number of distinct values in each column of the training variants table.
# dropna=False keeps a missing value (if any) counted as one distinct value,
# matching len(Series.unique()).
for column_name in train_variants_df.columns:
    print(column_name, train_variants_df[column_name].nunique(dropna=False))

ID 3321
Gene 264
Variation 2996
Class 9


In [8]:
# Number of distinct values in each column of the test variants table.
for column_name, column_values in test_variants_df.items():
    print(column_name, len(column_values.unique()))

ID 5668
Gene 1397
Variation 5628


In [9]:
# Number of training rows per gene.
gene_group = train_variants_df.groupby('Gene').size()

# Ten most frequent genes, then ten least frequent genes.
most_frequent_genes = gene_group.sort_values(ascending=False).head(10)
least_frequent_genes = gene_group.sort_values(ascending=True).head(10)
print(most_frequent_genes)
print(least_frequent_genes)

Gene
BRCA1     264
TP53      163
EGFR      141
PTEN      126
BRCA2     125
KIT        99
BRAF       93
ERBB2      69
ALK        69
PDGFRA     60
dtype: int64
Gene
KLF4      1
FGF19     1
FANCC     1
FAM58A    1
PAK1      1
ERRFI1    1
PAX8      1
PIK3R3    1
PMS1      1
PPM1D     1
dtype: int64


In [10]:
# Whitespace-separated word count of each text.
# A missing text (NaN) is counted as 0 words instead of raising AttributeError
# on float.split(). NOTE(review): depending on the pandas version, a literal
# "null" in the file may be parsed as NaN — confirm against the data.
train_text_df['Text_count'] = train_text_df['Text'].fillna('').str.split().str.len()

The next two cells show that every gene–variation combination is unique,
which means that if we know both the gene and the variation, we know the class in the training set.

In [11]:
# Build a combined key: gene name immediately followed by variation name
# (no separator), stored in a new 'G-V' column.
train_gene_variation = train_variants_df['Gene'].str.cat(train_variants_df['Variation'])
train_variants_df['G-V'] = train_gene_variation

# Shape of the array of distinct keys; compare it with the row count of the table.
train_variants_df['G-V'].unique().shape

(3321,)

In [12]:
# Same combined gene+variation key for the test table.
test_gene_variation = test_variants_df['Gene'] + test_variants_df['Variation']
test_variants_df['G-V'] = test_gene_variation

# Shape of the array of distinct keys in the test table.
test_gene_variation.unique().shape

(5668,)

In [13]:
# Join variants and texts on the shared 'ID' column; the inner join keeps only
# IDs present in both tables.
train_full = pd.merge(train_variants_df, train_text_df, how='inner', on='ID')

This shows that different genes can have the same variation

In [14]:
# Distinct variation names versus total rows: a smaller first number means
# some variation names occur in more than one row.
print(train_full['Variation'].unique().shape)
print(train_full['Variation'].shape)

# After reset_index, the 'ID' column of v_count_group holds the number of
# non-null IDs for each variation name.
v_count_group = train_full.groupby('Variation')['ID'].count().reset_index()
repeating_var = v_count_group[v_count_group['ID']>1]['Variation'].tolist()

# Rows whose variation name occurs more than once. isin() is a vectorized
# membership test, replacing a per-row Python scan of the list.
train_full[train_full['Variation'].isin(repeating_var)].sort_values(['Variation', 'Class', 'Gene'], ascending=[False,False,False])

(2996,)
(3321,)


Unnamed: 0,ID,Gene,Variation,Class,G-V,Text,Text_count
3126,3126,KRAS,Y64A,7,KRASY64A,Growth factor receptors activate Ras by recrui...,8525
1296,1296,HRAS,Y64A,2,HRASY64A,Ras is a key signaling molecule in living cell...,4295
2819,2819,BRCA2,Y42C,5,BRCA2Y42C,Abstract Classification of rare missense var...,20090
1818,1818,RHOA,Y42C,4,RHOAY42C,The RhoA GTPase regulates diverse cellular pro...,21754
1915,1915,SMO,V321M,7,SMOV321M,Basal activities of adenylate cyclase and gua...,13914
1629,1629,MAP2K4,V321M,4,MAP2K4V321M,MAP2K4 encodes a dual-specificity kinase (mito...,5694
1094,1094,MAP3K1,Truncating Mutations,6,MAP3K1Truncating Mutations,Passenger mutation rates are highly elevated i...,8420
2090,2090,KMT2B,Truncating Mutations,6,KMT2BTruncating Mutations,Epigenetic regulation lies at the heart of cel...,19591
117,117,LATS1,Truncating Mutations,4,LATS1Truncating Mutations,The lats gene has been identified as a tumour ...,14173
2121,2121,CCND1,Truncating Mutations,2,CCND1Truncating Mutations,A series of our previous studies demonstrated ...,14406


Let's look at some of the popular variations: Amplification, Truncating Mutations, Deletion. These three variations are actually exceptions to the variation notation.

The notation for variations usually takes the form Aletter+number+Bletter (for example, Y64A):
Aletter stands for the original amino acid, number stands for the position of the mutation, and
Bletter stands for the amino acid that replaces it.

In [15]:
# Per-class counts of non-null values in each column, for rows whose variation is 'Amplification'.
train_full.loc[train_full['Variation'] == 'Amplification'].groupby('Class').count()

Unnamed: 0_level_0,ID,Gene,Variation,G-V,Text,Text_count
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,21,21,21,21,21,21
6,10,10,10,10,10,10
7,40,40,40,40,40,40


Truncating Mutations appear to be mostly class 1 (see the counts in the next cell).

In [16]:
# Per-class counts of non-null values in each column, for rows whose variation is 'Truncating Mutations'.
train_full.loc[train_full['Variation'].eq('Truncating Mutations')].groupby('Class').count()

Unnamed: 0_level_0,ID,Gene,Variation,G-V,Text,Text_count
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,89,89,89,89,89,89
2,1,1,1,1,1,1
4,1,1,1,1,1,1
6,2,2,2,2,2,2


In [17]:
# Per-class counts of non-null values in each column, for rows whose variation is 'Deletion'.
is_deletion = train_full['Variation'] == 'Deletion'
train_full[is_deletion].groupby('Class').count()

Unnamed: 0_level_0,ID,Gene,Variation,G-V,Text,Text_count
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,57,57,57,57,57,57
4,16,16,16,16,16,16
6,1,1,1,1,1,1


I tried to find the variations in the texts and found that even when the exact text of the variation doesn't appear,
we can sometimes find a very close match, for example:

1) The number that stands for the location is shifted by 3 (interesting;
this might be related to the fact that 3 units in the DNA sequence correspond to one amino acid, though I'm not sure).

2) For Deletion, Truncating Mutations and Amplification, a similar word (e.g. "truncated") usually appears in the text.

3) The one-letter amino-acid code may appear as the three-letter code for the same amino acid (e.g. "Leu858" alongside "L858R").

4)....

Given these findings, I think that the gene and the variation are the keywords a biologist would use to search for a related paper in order to classify the gene-variation combination.

Or given a paper, one just finds all the gene-variation combinations with a class label that can be determined from the content of the paper.
                                                                                     

In [18]:
# Split row positions of train_full into two lists:
#   index    - the variation name occurs verbatim in the row's text
#   index_no - it does not
index = []
index_no = []
for i, (var, text) in enumerate(zip(train_full['Variation'], train_full['Text'])):
    # A missing value (NaN is a float) cannot be substring-tested; such rows
    # are recorded as "not found" instead of raising TypeError.
    if isinstance(var, str) and isinstance(text, str) and var in text:
        index.append(i)
    else:
        index_no.append(i)

In [19]:
# Rows (selected by position) whose variation name was not found verbatim in the text.
train_full.iloc[index_no]

Unnamed: 0,ID,Gene,Variation,Class,G-V,Text,Text_count
0,0,FAM58A,Truncating Mutations,1,FAM58ATruncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,6089
7,7,CBL,Deletion,1,CBLDeletion,CBL is a negative regulator of activated recep...,14683
16,16,CBL,Truncating Mutations,1,CBLTruncating Mutations,To determine if residual cylindrical refractiv...,8118
19,19,CBL,Y371S,4,CBLY371S,Acquired uniparental disomy (aUPD) is a common...,3018
41,41,DICER1,Truncating Mutations,1,DICER1Truncating Mutations,Mesenchymal cell populations contribute to mic...,17929
44,44,PTPRT,T1365M,4,PTPRTT1365M,"Tyrosine phosphorylation, regulated by protein...",1933
46,46,PTPRT,T844M,4,PTPRTT844M,Introduction Preceding hematologic disorders ...,1307
48,48,PTPRT,Promoter Hypermethylation,4,PTPRTPromoter Hypermethylation,Metastasis is the leading cause of death for g...,8484
49,49,PTPRT,R1343L,4,PTPRTR1343L,"Tyrosine phosphorylation, regulated by protein...",1933
61,61,PTPRT,R1209W,4,PTPRTR1209W,"Tyrosine phosphorylation, regulated by protein...",1933


In [22]:
import re

# One or more letters followed by one or more digits. Written as a raw string
# so that "\d" reaches the regex engine without an invalid-escape warning.
LETTER_DIGIT_PATTERN = re.compile(r'[A-Za-z]+[\d]+')
N_ROWS_TO_INSPECT = 50  # how many unmatched rows to print
SNIPPET_LEN = 6         # characters printed from the start of each match

for i in index_no[:N_ROWS_TO_INSPECT]:
    # Look the text up once per row rather than once per match.
    text = train_full['Text'].iloc[i]
    if not isinstance(text, str):
        # Missing text: nothing to scan, but still print the header and variation.
        text = ''
    print('variation in text:')
    for match in LETTER_DIGIT_PATTERN.finditer(text):
        start = match.start()
        # Fixed-width window from the match start, so a long match is cut off
        # and a short one is followed by the characters after it.
        print(text[start:start + SNIPPET_LEN])
    print('variation: ', train_full['Variation'].iloc[i], '\n')

variation in text:
CDK10 
CDK10 
ETS2 (
E26 on
CDK10 
ETS2 a
CDK10,
CDK10 
FAM58A
CDK10.
CDK10 
CDK10/
ETS2 i
ETS2 d
ETS2 p
ETS2, 
CDK10 
CDK10 
CDK10 
ETS2 (
E26 on
CDK10 
ETS2, 
MCF7 c
CDK10 
FAM58A
CDK10/
ETS2 i
CDK10 
CDK10/
ETS2 d
ETS2 e
Y2H) s
CDK10 
FAM58A
Y2H ma
CDK10 
CDK10 
P1) ex
D1, p2
p21 (C
CIP1),
Cdi1 (
CDK1 a
CDK3 a
Y2H si
CDK10 
CDK10 
S1A). 
CDK10 
FAM58A
S1A).F
CDK10 
Y2H as
CDK10 
P1. (B
Y2H as
B42 tr
pEG202
pJG4-5
B42, r
Y2H in
Y2H as
CDK10 
ETS2, 
CDK10 
CDK10 
V5-6Hi
HEK293
CDK10 
HEK293
CDK10 
V5-6Hi
CDK10 
MCF7 c
MCF7 t
CDK10 
CDK10.
CDK10 
CDK10 
S2B). 
S1D.Ne
CDK10 
CDK10 
D181A 
V5-6Hi
CDK10 
HEK293
V5-6Hi
CDK10 
CDK10 
V5-6Hi
CDK10 
CDK10 
CDK10 
V5-6Hi
S1B). 
CDK10.
P2 iso
S1C). 
MCF7 c
CDK10 
CDK10 
CDK10 
CDK10/
CDK10 
CDK10 
H1 as 
H1 pho
CDK10 
CDK10 
CDK10(
ETS2, 
CDK10 
CDK10/
ETS2 b
CDK10/
CDK10 
CDK10(
CDK10 
H1. Ly
CDK10(
H1, wh
ETS2 a
CDK10 
ETS2-d
CDK10 
S2) an
CDK10 
S2B). 
CDK10.
ERK1 a
ERK2 p
S3B), 
CDK10 
CDK10 
MCF7 c
S3C). 
CDK10,
ETS2.F
C

GFRA1 
ZNF382
S5C). 
GFRA1,
ZNF382
GSE273
S2). P
BMP3, 
BNIP3,
CDKN2A
HOXD10
TFPI2,
ZNF382
KCNH1,
PSMD10
BNIP3,
KCNH1,
ZNF382
TBX5 a
ELK1 m
TBX5 o
ELK1 w
GFRA1 
GFRA1 
GFRA1 
GFRA1 
ZNF382
GFRA1,
ZNF382
GFRA1 
ZNF382
GFRA1 
ZNF382
GFRA1 
ZNF382
GFRA1,
ZNF382
GFRA1,
ZNF382
STAT3 
STAT3 
STAT3 
STAT3 
STAT3,
pSTAT3
pSTAT3
STAT3 
STAT3 
STAT3)
Y705).
STAT3 
STAT3 
STAT3 
pSTAT3
pSTAT3
pSTAT3
STAT3 
pSTAT3
pSTAT3
STAT3 
STAT3 
HumanM
STAT3 
pSTAT3
pSTAT3
STAT3 
pSTAT3
STAT3 
pSTAT3
pSTAT3
STAT3 
Cal27)
Cal27 
pSTAT3
Cal27 
pSTAT3
pSTAT3
STAT3 
pSTAT3
STAT3 
STAT3 
EC50 v
STAT3 
STAT3 
SH2 do
STAT3 
STAT3 
JAK2 i
LY2784
pSTAT3
pSTAT3
STAT3 
STAT3 
STAT3 
STAT3 
mm3). 
STAT3 
STAT3 
STAT3 
STAT3 
pSTAT3
pSTAT3
STAT3 
STAT3 
STAT3 
STAT3 
PTPRZ1
PTPRZ1
V600E)
PTPRZ1
PTPRZ1
V600E)
variation:  Promoter Hypermethylation 

variation in text:
PTPN3,
PTPN13
PTPN14
PTPN3,
PTPN13
PTPN14
S1 and
S1 and
PTPN13
PTPN14
PTPN3.
B41, b
FN3, f
A5/PTP
S1). T
D1 and
D2) we
D1 mut
Q987K 
N1128I
D2 mut
R1212W
R13

TSC2GA
Asp65 
Asn164
TSC2GA
TSC2-m
TSC2GA
Tyr35-
mTORC1
TSC2GA
Tyr35 
mTORC1
Tyr35 
TSC2GA
mTORC1
Y35A s
Y35A m
variation:  Y35H 

variation in text:
cdc2 i
CDC28 
G2/M t
cdc2-r
cdc2, 
cdk2, 
cdk3 c
cdc28 
cdc2 a
G2/M t
cdk2 k
cdk3 m
cdk2 a
cdk3 a
D2, or
D3 sho
D2 or 
D3 can
cdc2-r
cdc2, 
cdk2, 
cdk3. 
cdk4 (
J3) an
cdk5 k
cdc2 a
cdk2, 
cdk5 k
cdk5 a
cdk4 i
cdk4 k
Sf9 ce
cdk4 p
cdk4 i
cdk4 p
cdk4. 
cdk4, 
cdc2-r
D2, an
D3 in 
cdk6, 
cdk6 t
cdk6 k
cdk6 k
cdk6 e
cdk6 k
G1, pr
cdk6 a
cdk6. 
HD11, 
HD33, 
HD63) 
D2 and
D3 ant
D2 and
D3 wer
cdk4 a
Na2 V0
V03, a
cdk6 a
pu1 of
cdk4 a
D2 or 
D3 or 
cdk6 a
cdk6 a
V8 par
cdk6 i
pGEM7 
D2 in 
D3 in 
pGEM7 
T7 RNA
V8 par
cdk6 a
cdk4 a
HD11 a
D2 hyb
D3 mon
cdk6 a
HD63 w
M73, a
cdk4 (
cdk6 (
G1 KIN
cdk6, 
pVL139
D1, D2
D2, an
D3 bac
Sf9 ce
Sf9 ce
MgCl2,
MnCl2,
cdk6 f
D2, an
D3. To
cdk6 p
cdk6 g
cdk6 a
cdk6 a
cdk6 a
cdk6, 
cdk6 a
cdk6 i
cdk6 m
cdk6 a
cdk6 g
V8 pro
cdk6 a
cdk6 p
cdk6 i
cdk6 :
cdk6 -
cdk6 4
cdk6 p
cdk6 a
cdk6 a
cdk6 a
V8 par
cdk6 p
cdk

pNKY6 
RAD50 
RAD50 
RAD50 
POL13 
RAD50 
POL10 
pNKY5.
pSG205
T4 DNA
Sal1 s
pNKY6 
Sal1 R
RAD50 
rad50-
M13 or
pNKY5 
BAL31 
pNKY5 
pNKY6 
BAL31 
Sal1 f
Sal1 w
pNKY5)
RAD50 
RAD50 
Sal1 c
pNKYlO
RAD50 
RAD50 
RAD50 
RAD50 
CEN4) 
I1 \ P
RAD50 
I1 = h
I1 (am
RAD50 
RAD50 
RAD50 
RAD50 
RAD50 
Sal1 f
RAD50 
Sal1 s
RAD50 
RAD50 
RAD50 
om4 ,,
Ld3S2 
S2 ?l 
RAD50 
mm44 r
r4 ko 
b4 Ai 
B2SrnW
WNNNNN
zzsi3 
F2 3 2
q22 r 
g200 -
e2 rn 
Ebk3 '
y1 m r
t3 g.s
d5 3 b
z52 .+
E2 r: 
g6 y~3
m6,n V
pg2< F
m4: kZ
CS5b e
I2 2 p
Hind11
Sal1 d
BAL31 
BAL31 
T4 DNA
T4 DNA
T4 DNA
T4 DNA
RAD50 
RAD50,
RAD50 
pNKY5 
pNKY6 
M13 or
RAD50 
RAD50 
Sal1 s
RAD50 
RAD50 
RAD3) 
RAD50 
RAD50 
RAD50 
S2 dom
RAD50 
I1 (74
RAD50 
I1 and
RAD50 
RAD50 
RAD50 
RAD50 
RAD50 
RAD50 
I1 spa
I1 and
RAD50 
RADS0 
I1 ...
I1 RAB
S2 (n)
I1 .I.
RAD50 
S2 (nt
nt258 
PAM250
S2 fra
RAD50 
RAD50 
I1 com
RAD50 
RAD50 
I1 Rab
S2) Ne
S2 dom
RAD50 
I1 and
RAD50 
RAD50 
RAD50 
RAD50 
I1 (HE
R1-R2 
R2 -1.
R1-R3 
R3 "" 
R1 and
R2 are
R3 is 

p53 (F
MDM2 i
p53 ef
p53 an
MDM2 (
S2A,B 
MDM2 a
p53 (s
S2C on
p53. (
U2OS, 
A549, 
HCT116
PC3 ce
MDM2, 
p53 an
p21WAF
WAF1 p
U2OS o
PC3 ce
MDM2-m
p53 ub
COS7 c
p53 an
p53 an
p53/RY
MDM2 w
S3A on
p53 in
COS7 c
MDM2, 
MDM2 l
S3B on
U2OS c
MDM2 a
p53 (F
MG132 
S3C on
COS7 c
T7-MDM
MDM2, 
p53 an
MDM2 a
p53, a
S3D on
MDM2-m
p53. (
U2OS c
p53 FL
FL393 
MDM2 H
H221, 
MDM2 a
p53 by
A549 c
T7-MDM
MDM2, 
p53 an
MDM2 t
p53 as
p53 de
S3E on
p53 ub
p53 an
S3E on
p53 ub
S3E on
p53 in
U2OS c
S3F on
MDM2 i
A549 c
MDM2 (
p53 ar
E3 lig
MDM2, 
MDM2 w
E3 lig
C464A)
U2OS c
p53 do
MDM2, 
MDM2 (
C464A)
p53 (F
p53 by
MDM2–p
p53 co
MDM2 a
p53, t
E3 lig
MDM2 f
p53. R
MDM2 w
MDM2. 
MDM2 b
MDM2. 
p53  T
p53 we
CDKN1A
p21) a
Bcl2-a
U2OS c
p21WAF
WAF1 p
S4A on
MCF7 c
p53, b
MCF7 c
p53 kn
S4B on
G1 arr
HCT116
p53+/+
HCT116
p53−/−
p53. (
U2OS c
U2OS c
p53, p
p21WAF
WAF1 a
MDM2 (
S4C,D 
p53, p
p21WAF
WAF1 a
MDM2 (
S4C,D 
U2OS c
p53, p
p21WAF
WAF1 a
MDM2 f
p53 re
p53 ha
p53 pr
MDM2 (
MDM2–p
p53 in
MDM2–p
p53 lo
p53 fo

variation in text:
regime
MLH1, 
MSH2, 
MSH6 a
PMS2 c
syndro
cancer
mortal
classe
models
data17
inform
settin
genes1
curati
approa
MLH1 (
MSH2 (
MSH6 (
PMS2 (
assay2
Databa
proces
HVP26,
system
public
MLH1 e
more49
MLH1, 
MSH2, 
MSH6 a
PMS2) 
risks2
MSH6 a
PMS2 v
PMS2 (
MSH6 (
pathog
articl
predic
MLH1 a
MSH2 c
MLH1 a
MSH2 m
BRCA1 
BRCA2 
scores
MLH1, 
MSH2, 
MSH6, 
PMS2, 
counse
assays
hemogl
standa
MSH6 a
PMS2 m
mutati
MSH6 a
PMS2 v
risk45
MMR46 
Mutaly
method
Regist
analys
phenot
data54
Projec
elsewh
data55
method
MLH1, 
MSH2, 
MSH6 a
PMS2 v
data16
analys
PMS2 v
phenot
varian
varian
H0: P 
MSH2-M
MSH6. 
MSH2-M
MSH6 w
MSH2-M
MSH6 s
S1 and
S2, al
MSH2-M
MSH6 c
MSH2 c
S2, an
MSH6 c
S1) (2
MSH2-M
MSH6 c
MSH2-M
MSH6 i
MSH2-M
MSH6 (
S1 and
S2 of 
MSH2-M
MSH6) 
S1 in 
MSH6. 
MSH2-M
MSH6 i
MSH2, 
MSH6) 
S1 or 
MSH6) 
S2 or 
MSH2 b
S1/MSH
MSH6 w
S1/MSH
MSH6 b
S2/MSH
MSH2. 
S1/MSH
MSH6 (
S1/MSH
MSH6 “
S1/MSH
MSH6 u
S2/MSH
MSH2. 
S2/MSH
MSH2 (
MSH2-M
MSH6 d
MSH2-M
MSH6. 
MSH2-M
MSH6, 
MSH2-M
M

variation:  Deletion 

variation in text:
MLH1, 
MSH2 o
MSH6 m
MLH1, 
MSH2, 
MSH6 m
MLH1, 
MSH2 a
MSH6 m
MLH1, 
MSH2 a
MSH6. 
MLH1 (
MSH2 (
MSH6 (
MLH1, 
MSH2 a
MSH6 d
MLH1),
MSH2) 
MSH6).
AC0118
MLH1),
AC0797
MSH2),
AC0065
MSH6).
MLH1 a
MSH2 w
MLH1 a
MSH2, 
MLH1, 
MSH2, 
MSH6 p
MLH1 (
G168-7
MSH2 (
FE11, 
MSH6 (
MLH1, 
MSH2 a
MSH6 s
MLH1, 
MSH2 a
MSH6 p
MLH1, 
MSH2 a
MSH6 s
MLH1 d
MSH2 a
MSH6 d
MSH6 o
MLH1, 
MSH2 a
MSH6 y
IVS14+
MLH1) 
MLH1 p
TACSTD
PREMM1
PREMM1
MLH1, 
MSH2 a
MSH6 g
MLH1 p
MLH1, 
MSH2, 
MSH6 m
PMS2 h
MLH1, 
MSH2 a
MSH6 r
MSH6 w
MLH1, 
MSH2, 
MSH6. 
MLH1, 
MSH2, 
MSH6 a
PMS2 p
MLH1, 
MSH2, 
MSH6 a
PMS2.1
MLH1 w
PMS2 a
MSH2 w
MSH6, 
MLH1 g
V600E)
V600E 
V600E 
MLH1, 
MSH2, 
MSH6 a
PMS2  
MLH1, 
PMS2, 
MSH2 a
MSH6 p
MLH1, 
PMS2, 
MSH2 a
MSH6 w
G168-1
A16-4,
G219-1
MLH1, 
MSH2, 
MSH6 a
PMS2 g
P003 (
MLH1 a
MSH2) 
P008 (
MSH6 a
PMS2).
P003 p
P248. 
v3.0 C
MLH1 a
MSH2 w
MSH6 w
PMS2 w
MLH1: 
NM0002
MSH2: 
NM0002
MSH6: 
NM0001
PMS2: 
NM0005
MLH1 a
PMS2 e
MSH2 a
MSH6 (
PMS2 o

SOX9 h
FZD10)
PI3K a
IGF2 a
IRS2 o
PIK3R1
R1 and
PIK3CA
databa
V842I 
ERBB2 
V104M 
ERBB3 
ERBB2 
PI3K p
PI3K p
cancer
TGFBR1
TGFBR2
ACVR2A
ACVR1B
SMAD2,
SMAD3 
SMAD4 
p53 pa
TP53 i
p53 fo
platfo
SCN5A3
A36, a
TP53, 
PIK3CA
FBXW7;
FBXW7 
q13.12
q12.3,
APOL63
FAM123
ARID1A
SOX9 a
SOX9 h
SOX9 i
SOX9 p
degrad
ARID1A
transc
analys
promis
PI3K p
IGF2, 
ERBB2,
ERBB3,
PI3-K 
transl
HumanM
MSH2-M
MSH6. 
MSH2-M
MSH6 w
MSH2-M
MSH6 s
S1 and
S2, al
MSH2-M
MSH6 c
MSH2 c
S2, an
MSH6 c
S1) (2
MSH2-M
MSH6 c
MSH2-M
MSH6 i
MSH2-M
MSH6 (
S1 and
S2 of 
MSH2-M
MSH6) 
S1 in 
MSH6. 
MSH2-M
MSH6 i
MSH2, 
MSH6) 
S1 or 
MSH6) 
S2 or 
MSH2 b
S1/MSH
MSH6 w
S1/MSH
MSH6 b
S2/MSH
MSH2. 
S1/MSH
MSH6 (
S1/MSH
MSH6 “
S1/MSH
MSH6 u
S2/MSH
MSH2. 
S2/MSH
MSH2 (
MSH2-M
MSH6 d
MSH2-M
MSH6. 
MSH2-M
MSH6, 
MSH2-M
MSH6 w
MSH2-M
MSH6 w
E3M (6
O8B (9
CHARMM
c33a2,
a2, an
MSH2-M
MSH6, 
MSH2-M
MSH6 a
MSH2-M
MSH6 f
MSH2-M
MSH6 i
MSH2-M
MSH6 w
MSH2-M
MSH6 w
MSH2-M
MSH6 f
MSH2-M
MSH6 c
S2. In
MSH6 a
S1. Fi
MSH6 (
MSH2 (
MSH2-M
MSH6 a

KB45 c
HOXC6 
KMT2D 
KDM6A 
KMT2D 
KMT2D 
KDM6A 
KMT2D 
KDM6A 
KMT2D 
KDM6A 
KMT2D 
KMT2D 
H3K4 m
K4 met
KMT2D-
ASH2L,
RBBP5,
WDR5, 
KMT2D-
KMT2D 
KMT2D 
C1430R
C1471Y
PHD4–6
PHD4–6
KMT2D,
KMT2D 
ASH2L,
RBBP5,
WDR5 [
KMT2D 
KMT2D 
KMT2D-
KMT2D–
HOXC6,
KMT2D 
KMT2D,
MLL3, 
H3 lys
HOXC6 
E2-dep
HOXC6 
KMT2D 
E2-ind
HOXC6 
HOXC6 
KDM6A 
KDM6A.
Kdm6a 
Kdm6a 
Kdm6a-
Kdm6a 
KDM6A 
KMT2D 
KMT2D 
KDM6A 
KMT2D 
KDM6A 
KMT2D 
KMT2D-
KMT2D 
KMT2D 
UPF1 o
UPF2. 
G418 a
PTC124
RTC13,
RTC14,
NB30 [
KMT2D 
KDM6A 
KMT2D 
KDM6A 
KMT2D 
MLL2 (
MLL4),
H3 lys
H3K4),
K4), w
KMT2D 
KMT2D 
KMT2D 
KMT2D 
KMT2D 
H3 mod
KMT2D 
H3K4 m
K4 mon
KMT2D 
KMT2D 
KMT2D 
KMT2D 
MLL2, 
KMT2C 
MLL3),
KMT2D 
MLL2/A
MLL4),
KDM6A 
KMT2D/
MLL2 a
KMT2C/
MLL3, 
KMT2D/
KMT2C 
KMT2D-
KMT2C-
KDM6A,
KMT2D 
KMT2D 
KMT2C 
KMT2D/
KMT2C 
KMT2D 
KMT2D 
KMT2D/
KMT2C 
KMT2D/
KMT2C 
H3 lys
H3K4) 
K4) me
H3K4 m
K4 met
KMT2A 
KMT2B 
MLL4),
KMT2C 
KMT2D,
NCOA6,
KDM6A 
KMT2C 
KMT2D 
KMT2C 
KMT2C 
KMT2C 
KMT2D 
KMT2D 
KMT2D 
KMT2D 
KMT2D-
KMT2D 

H1 kin
CDC2/C
LATS1-
G2/M b
LATS1-
LATS1-
CDC2/C
CDC2/C
LATS1 
LATS1-
G1, wh
G1 (Fi
LATS1 
LATS1 
LATS1-
G1 pha
LATS1-
LATS1 
LATS1-
LATS1-
LATS1-
BCL2 (
LATS1-
H460 (
LATS1 
LATS1-
P53, a
LATS1-
LATS1 
LATS1K
K734M)
H460 c
P53 di
LATS1-
Lats1-
p53-me
LATS1 
LATS1 
LATS1 
LATS1 
LATS1K
K734M 
LATS1S
S909A 
LATS1 
Lys72 
LATS1K
K734M 
LATS1 
LATS1S
S909A 
LATS1 
LATS1K
K734M 
LATS1S
S909A,
LATS1 
Lats1 
LATS1 
LATS1 
LATS1 
LATS1 
CDC2 i
G2/M t
CDC2/C
LATS1 
LATS1 
LATS1-
LATS1 
G2/M a
LATS1-
G1/S b
LATS1 
G2/M t
LATS1 
LATS1 
LATS1 
LATS1 
LATS1 
LATS1 
LATS1-
G2/M (
LATS1K
K734M 
LATS1S
S909A,
LATS1 
LATS1-
LATS1-
LATS1 
G2 tra
LATS1 
LATS1 
LATS1 
G1 24 
LATS1,
LATS1-
LATS1-
G2/M a
G1. Th
LATS1-
G2/M a
LATS1 
CDC2, 
CDK2, 
p27KIP
p21CIP
CDC2/C
LATS1-
CDC2/C
LATS1 
LATS1 
G2/M i
CDC2/C
CDC2/C
cdc2 a
cdc2c 
LATS1 
G2/M o
G1 (Fi
G1/S b
NIH3T3
T3 cel
G1/S t
G1 pop
G2/M a
G1 aft
LATS1-
G1/S t
G1 as 
LATS1-
G2/M. 
LATS1-
p53 (S
p53 (S
p53, M
MCG10 
G2/M a
LATS1 
LATS1 
Lats1 
H460 s
BCL2 r

BAF180
p21 so
BAF180
p21 tr
BAF180
p21 pr
BAF180
BAF180
BAF180
p21 pr
Brg1-b
p21/WA
WAF1 p
BAF180
p21 pr
p21 in
BAF180
p21 up
p21 pr
p53, S
SMAD2/
Stat3,
D3 rec
VD3R),
BAF180
BAF180
BAF180
p21 du
p53 an
BAF180
p21 in
MCF10A
p21 de
BAF180
MCF10A
G1 pha
G2 (Fi
BAF180
p21 an
MCF10A
p21 ex
BAF180
p21 up
p21 ac
G1 pop
BAF180
G1 pop
BAF180
p21 el
BAF180
G1 arr
BAF180
BAF180
G1 and
G2 arr
p21 ac
p53 (F
BAF180
p21 ac
G1 but
G2 arr
p21 ac
G2 arr
G1 arr
p21 in
BAF180
BAF180
p21 in
BAF180
p21 pr
SMAD2/
p53 bi
p21 ex
MCF10A
p21 up
p21 re
MCF10A
BAF180
p21 in
VD3R, 
p21, s
BAF180
BAF180
p21 tr
BAF180
p21. T
BAF180
p21 an
BAF180
p21 an
p21. I
BAF180
p21 RN
BAF180
BAF180
CDC25A
HCC114
SUM131
BAF180
p53 an
p16, w
BAF180
BAF180
BAF180
BAF180
G1 of 
KDM6A)
JARID1
KDM5C)
SETD22
JARID1
SETD2)
H3. Mo
H3 reg
contro
PBRM14
approa
PinDel
PBRM1.
PBRM1 
p21 an
BAF180
comple
domain
differ
SMARCB
BRG1 c
tumour
BRG1 m
types1
PBRM1 
PBRM1 
screen
PBRM1 
PBRM1 
M1209_
E1214d
comple
Ile57)
PBRM1 
code11
T232P,
A597D 

SF3B1 
B1 mut
SF3B1,
B1, ar
TP53 a
ASXL1,
SF3B1 
B1 mut
SF3B1 
B1 mut
States
TP53 a
RUNX1,
ASXL1,
DNMT3A
IDH1/2
TET2 a
EZH2 (
level5
U2AF35
AF35 (
U2AF1)
AF1), 
ZRSR2 
SRSF2 
SC35),
SF3A1,
A1, SF
SF3B1 
B1 and
PRPF40
U1, U2
U2, an
U4/5/6
U11/12
PRPF40
SF1 an
SRSF1 
SRSF2,
U2 aux
U2AF) 
U2AF65
AF65 (
U2AF2)
AF2)– 
U2AF35
AF35 h
U2 snR
SF3A1 
A1 as 
SF3B1,
B1, to
ZRSR2 
U2AF35
AF35, 
ZRSR2 
U2AF65
AF65, 
SRSF1 
SRSF2,
U2AF35
AF35 (
U2AF65
AF65, 
SF1 an
SRSF1,
U2AF35
AF35 (
SRSF2 
ZRSR2 
SF3B1 
B1 (N 
SF3A1 
A1 (N 
PRPF40
U2AF65
AF65 (
SF1 (N
SF3B1 
B1 mut
U1snRN
SF3B1 
B1 3′ 
SRSF2 
Zn2+ Z
Zn2+ Z
Zn2+ W
ZRSR2 
SF1 U2
U2AF35
AF35 U
Zn2+ Z
U2AF65
AF65 S
SF3B U
U2snRN
SF3A1 
A1 U1s
U1snRN
U1 snR
SF1 an
U2 aux
U2AF),
U2AF65
AF65, 
U2AF (
U2AF35
AF35) 
U2AF65
AF65 a
SRSF2,
ZRSR2 
U2AF a
U2 snR
SF3A1 
A1 and
SF3B1,
B1, is
U2AF35
AF35 (
q22.3)
S34F(2
S34Y(5
Q157R(
Q157P(
ZRSR2 
Xp22.1
I202N 
N261Y 
C302R 
H330R 
N382K*
I53T* 
S40X R
R68sp 
A96fs 
E118fs
R126X 
L237fs
K257sp
W291X 
G323fs
N327fs

Zn2+ Z
Zn2+ W
ZRSR2 
SF1 U2
U2AF35
AF35 U
Zn2+ Z
U2AF65
AF65 S
SF3B U
U2snRN
SF3A1 
A1 U1s
U1snRN
U1 snR
SF1 an
U2 aux
U2AF),
U2AF65
AF65, 
U2AF (
U2AF35
AF35) 
U2AF65
AF65 a
SRSF2,
ZRSR2 
U2AF a
U2 snR
SF3A1 
A1 and
SF3B1,
B1, is
U2AF35
AF35 (
q22.3)
S34F(2
S34Y(5
Q157R(
Q157P(
ZRSR2 
Xp22.1
I202N 
N261Y 
C302R 
H330R 
N382K*
I53T* 
S40X R
R68sp 
A96fs 
E118fs
R126X 
L237fs
K257sp
W291X 
G323fs
N327fs
F239V 
E148X 
E362X 
E133G 
C326R 
PRPF40
q13.12
SF3A1 
A1 Sur
q12.2)
A57S I
I141M*
Y772C 
M117I 
K166T 
E373D 
T374P 
M667V 
P95H(3
SRSF2 
q25.1)
Y347X 
A26V P
P383L 
P15H* 
M58I* 
P212L*
D442N 
P540S 
SF3B1 
B1 (2q
q33.1)
K700E(
K666N(
H662Q(
E622D(
Y623C 
R625L(
K182E 
G347V 
N626D 
D781G 
U2AF65
AF65 (
q13.42
R18W M
M144I 
L187V 
SF1 KH
q13.1)
G372V 
T454M 
T474A 
Y476C 
A508G 
ZRSR2 
anaemi
SF3B1 
B1 mut
SRSF2 
SRSF2 
SRSF2 
instab
SRSF2 
U2AF35
AF35 m
A26V i
U2AF35
AF35 e
S34 or
Q157) 
U2AF h
SRSF2 
P95 wi
SF3B1 
B1 mut
K700 a
K666, 
H662 a
E622, 
mutati
ZRSR2 
Xp22.1
ZRSR2 
ZRSR2 

Y764in
I759A 
I759A 
L858R 
L861Q,
A763_Y
Y764in
L858R 
A763_Y
Y764in
D761_E
E762in
E762. 
D770_N
N771in
D770_N
N771in
D770_N
N771in
L858R 
L858R 
D770_N
N771in
D770_N
N771in
T790M 
T790M 
T790M-
L858R 
D770_N
N771in
T790M 
D770_N
N771in
ERBB2 
A763_Y
Y764in
Leu858
L858R)
PF0029
Leu858
L858R)
Leu858
Gly719
Gly719
Leu861
Leu858
Leu858
delGlu
Ala750
delGlu
Ser752
delLeu
Thr751
delLeu
Ser752
delLeu
Pro753
Leu858
Leu861
Gly719
Leu858
Gly719
Thr790
Leu858
Gly719
Leu858
Thr790
PI3K/A
Glu762
Met766
Met766
Cys775
Glu762
Tyr764
Ala767
Cys775
Glu762
Cys775
Asp761
Glu762
Ala763
Tyr764
S1 764
Tyr764
Val765
Met766
Ala767
Ala767
Val769
Ala767
Ser768
Ser768
Asp770
Ser768
Val769
Ser768
Val769
S2,S3 
S3 769
Val769
Asp770
Val769
Asp770
Val769
Asp770
Val769
Asp770
Val769
Asp770
Val769
Asp770
Val769
Asp770
S2,S4–
S4–13 
Asp770
Asn771
Asp770
Asn771
Asp770
Asn771
Asp770
Asn771
Asp770
Asn771
Asp770
Asn771
Asp770
Asn771
Asp770
Asn771
Asp770
Asn771
Asp770
Asn771
Asp770
Asn771
Asp770
Asn771
delAsp
S1,S2,
S2,S6,