In [1]:
import pandas as pd
import numpy as np
import feather
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing as pe
from tqdm import tqdm

In [2]:
train_stage1 = pd.read_csv('../data/training_variants')
test_stage1 = pd.read_csv('../data/test_variants')
trainx_stage1 = pd.read_csv('../data/training_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])
testx_stage1 = pd.read_csv('../data/test_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [3]:
train_stage1 = pd.merge(train_stage1, trainx_stage1, how='left', on='ID').fillna('')
test_stage1 = pd.merge(test_stage1, testx_stage1, how='left', on='ID').fillna('')
#pid = test['ID'].values

In [4]:
train_stage1.head()

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [5]:
test_stage1.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...
1,1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,PAH,L333F,Vascular endothelial growth factor receptor (V...
3,3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...


In [6]:
train_stage1.shape

(3321, 5)

In [7]:
test_stage1.shape

(5668, 4)

In [8]:
train_stage1.tail()

Unnamed: 0,ID,Gene,Variation,Class,Text
3316,3316,RUNX1,D171N,4,Introduction Myelodysplastic syndromes (MDS) ...
3317,3317,RUNX1,A122*,1,Introduction Myelodysplastic syndromes (MDS) ...
3318,3318,RUNX1,Fusions,1,The Runt-related transcription factor 1 gene (...
3319,3319,RUNX1,R80C,4,The RUNX1/AML1 gene is the most frequent targe...
3320,3320,RUNX1,K83E,4,The most frequent mutations associated with le...


In [10]:
train_stage1.Class.value_counts()

7    953
4    686
1    568
2    452
6    275
5    242
3     89
9     37
8     19
Name: Class, dtype: int64

In [11]:
train_stage1.shape

(3321, 5)

In [12]:
len(train_stage1.ID.unique())

3321

In [13]:
train_stage1.shape

(3321, 5)

In [14]:
train_stage1.Class.value_counts()

7    953
4    686
1    568
2    452
6    275
5    242
3     89
9     37
8     19
Name: Class, dtype: int64

In [15]:
len(train_stage1.ID.unique())

3321

In [16]:
feather.write_dataframe(train_stage1, '../cache/train_stage1.feather')

In [17]:
train_stage1 = feather.read_dataframe('../cache/train_stage1.feather')

In [18]:
train_stage1.Class.value_counts()

7    953
4    686
1    568
2    452
6    275
5    242
3     89
9     37
8     19
Name: Class, dtype: int64

In [19]:
y_class = train_stage1.Class.values

In [20]:
train_stage1.ID[train_stage1['Class'] == 8]

121      121
220      220
306      306
307      307
308      308
750      750
1344    1344
1352    1352
1756    1756
1768    1768
1770    1770
1825    1825
2274    2274
2275    2275
2277    2277
2278    2278
2385    2385
3109    3109
3114    3114
Name: ID, dtype: int64

In [21]:
train_stage1[train_stage1.ID == 121]

Unnamed: 0,ID,Gene,Variation,Class,Text
121,121,SF3B1,K700R,8,The RNA maturation is an important and complex...


In [22]:
train_stage1 = train_stage1.drop('ID', axis=1)

In [23]:
max_rows = 953 # chosen because class 7 has 955 rows in stage2 train set

In [24]:
s = train_stage1.Class.value_counts()

In [26]:
num_classes = 9
rows_to_append = []
ids = {}
for i in tqdm(range(1, num_classes+1)):
    class_rows = max_rows - s[i]
    rows_to_append.append(class_rows)
    ids[i] = train_stage1.index[train_stage1['Class'] == i]

100%|██████████| 9/9 [00:00<00:00, 1877.67it/s]


In [27]:
rows_to_append

[385, 501, 864, 267, 711, 678, 0, 934, 916]

In [28]:
import math

In [30]:
for i in tqdm(range(len(rows_to_append))):
    if rows_to_append[i] > 0:
        to_append = train_stage1.index.isin(ids[i+1])
#         print(to_append)
        n = to_append[to_append==True].sum()
        if n > rows_to_append[i]:
            m = rows_to_append[i]
            df = train_stage1.loc[to_append].copy()
            df = df[0:m-1]
            train_stage1 = pd.concat([train_stage1, df],axis=0)
        else:
            m = math.floor(rows_to_append[i]/n)
            for j in range(m):
                df = train_stage1.loc[to_append].copy()
                train_stage1 = pd.concat([train_stage1, df],axis=0)
            
        #df_to_append = train_stage2.loc[to_append].copy()
        #train_stage2 = pd.concat([train_stage2,df_to_append],axis=0)

100%|██████████| 9/9 [00:00<00:00, 43.55it/s]


In [31]:
train_stage1.Class.value_counts()

1    1336
7     953
4     952
8     950
9     925
2     904
3     890
6     825
5     726
Name: Class, dtype: int64

In [32]:
train_stage1.shape

(8461, 4)

In [33]:
train_stage1.head()

Unnamed: 0,Gene,Variation,Class,Text
0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [34]:
train_stage1.Class.value_counts()

1    1336
7     953
4     952
8     950
9     925
2     904
3     890
6     825
5     726
Name: Class, dtype: int64

In [35]:
feather.write_dataframe(train_stage1, '../cache/train_stage1_os.feather')

In [36]:
test_stage1.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...
1,1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,PAH,L333F,Vascular endothelial growth factor receptor (V...
3,3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...


In [37]:
testx_stage1.head()

Unnamed: 0,ID,Text
0,0,2. This mutation resulted in a myeloproliferat...
1,1,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,Vascular endothelial growth factor receptor (V...
3,3,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,Abstract Retinoblastoma is a pediatric retina...


In [38]:
test_stage1.shape

(5668, 4)

In [39]:
len(set(test_stage1.Gene) -set(train_stage1.Gene))

1243

In [40]:
len(set(test_stage1.Variation) -set(train_stage1.Variation))

5613

In [42]:
feather.write_dataframe(test_stage1, '../cache/test_stage1.feather')