In [69]:
from keras.models import load_model
from sklearn import preprocessing
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

In [70]:
import nltk
from nltk.util import ngrams
import spacy 
import numpy
import pandas as pd
import seaborn as sns
import re

In [None]:
# Load the spaCy pipeline from a local model directory; %time reports how long
# the load takes. The commented-out line is an alternative packaged model.
#%%time
#nlp=spacy.load('en_vectors_web_lg')
%time nlp=spacy.load('./wikipedia-pubmed-and-PMC-w2v')


In [4]:
def get_features(docs, max_length):
    """Convert tokenised documents into a fixed-width matrix of vector-table row ids.

    Parameters
    ----------
    docs : iterable
        Tokenised documents. Each document is iterated token by token, and every
        token must expose ``token.orth`` and ``token.vocab.vectors.find(key=...)``
        (the notebook passes spaCy ``Doc`` objects).
    max_length : int
        Number of columns in the result. Tokens at positions ``>= max_length``
        are ignored; shorter documents are right-padded with 0.

    Returns
    -------
    numpy.ndarray
        ``int32`` array of shape ``(len(docs), max_length)``. Cell ``[i, j]``
        holds the row returned by ``vectors.find`` for token ``j`` of document
        ``i``, or 0 when the lookup returns a negative value (no vector).
        Note that 0 is therefore shared by padding, unknown tokens and a real row 0.
    """
    docs = list(docs)
    Xs = numpy.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        for j, token in enumerate(doc):
            # Check the bound before writing so that max_length == 0 yields an
            # empty (n, 0) matrix instead of an IndexError on the first token.
            if j >= max_length:
                break
            vector_id = token.vocab.vectors.find(key=token.orth)
            # Negative ids mean "not in the vector table"; the cell keeps its 0.
            if vector_id >= 0:
                Xs[i, j] = vector_id
    return Xs

### Take SRS descriptions for classification

In [9]:
%%time 
# Load the pickled Series of sample attributes and rebuild its two-level index
# with the second level lower-cased, so later comparisons against attribute
# names ('scientific_name', 'description', 'sex') can use lower-case literals.
# NOTE(review): level 0 is later matched against the 'Sample' column of the SRA
# dump, so it presumably holds sample accessions — confirm.
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles from a trusted source.
inS_dir='/cellar/users/btsui/Data/nrnb01_nobackup/METAMAP/allSRS.pickle'
srsS=pd.read_pickle(inS_dir)
srsS=pd.Series(data=srsS.values,index=pd.MultiIndex.from_arrays([srsS.index.get_level_values(0),
                                                            srsS.index.get_level_values(1).str.lower()]) )

CPU times: user 37.6 s, sys: 6.94 s, total: 44.5 s
Wall time: 44.4 s


In [10]:
# Load the pickled SRA dump table; later cells read its 'Study' and 'Sample' columns.
# NOTE(review): absolute, user-specific path — the notebook only runs where this file exists.
sra_dump_pickle_dir='/cellar/users/btsui/Data/SRA/DUMP/sra_dump.pickle'

technical_meta_data_df=pd.read_pickle(sra_dump_pickle_dir)

In [11]:
# Keep the first row of each 'Study' group and take its 'Sample' value,
# i.e. one sample per study.
noDupSampleS=technical_meta_data_df.groupby(['Study']).head(n=1)['Sample']

In [12]:
# True where the attribute value is exactly 'Mus musculus' or 'Homo sapiens'
# (values are matched case-sensitively; only the index was lower-cased).
specie_m=srsS.isin(['Mus musculus','Homo sapiens'])

In [13]:
# True for entries whose attribute name (index level 1) is 'scientific_name'.
# NOTE: attrib_m is rebound a few cells below to a 'description' mask, so this
# cell must be re-run before re-running the species selection.
attrib_m=srsS.index.get_level_values(1)=='scientific_name'

In [14]:
# Unique level-0 index values of entries that are 'scientific_name' attributes
# with a mouse or human value.
mySpecieSrs=srsS[specie_m&attrib_m].index.get_level_values(0).unique()

In [15]:
# True for every entry (any attribute) whose level-0 value is one of the
# mouse/human identifiers selected above.
specie_srs_m=srsS.index.get_level_values(0).isin(mySpecieSrs)

In [16]:
# Rebind attrib_m: it now marks entries whose attribute name is 'description'
# (this overwrites the earlier 'scientific_name' mask of the same name).
attrib_a=srsS.index.get_level_values(1)
attrib_m=attrib_a=='description'


In [17]:
# True for entries whose level-0 value is among the one-per-study samples.
oneInStudy_m=srsS.index.get_level_values(0).isin(noDupSampleS.values)

In [147]:
# Candidate descriptions: 'description' attributes of mouse/human samples,
# one sample per study, with repeated texts collapsed to a single copy.
srsS_sub=srsS[attrib_m&specie_srs_m&oneInStudy_m].drop_duplicates()

# Avoid sampling outliers by length (20 words, following
# https://www.ijcai.org/proceedings/2017/0578.pdf).
# The length proxy is the number of space characters in the text, and the
# accepted range is 2..20 inclusive.
wordCountS=srsS_sub.str.count(' ')
lem_m=wordCountS.between(2,20)
srsS_sub=srsS_sub[lem_m]

In [148]:
# Draw 1000 descriptions with a fixed seed so the evaluation set is reproducible.
inTestStrS=srsS_sub.sample(n=1000,random_state=0)

### reload model

In [None]:
# Rebuild the label encoder by assigning its classes_ from the saved array
# instead of refitting; le.classes_ supplies the column labels used below.
le = preprocessing.LabelEncoder()
le.classes_ = numpy.load('./model/classes.npy')



In [None]:
# Show the class labels restored into the encoder.
print(le.classes_)

In [149]:
# Load the saved Keras model from disk; %time reports the load time.
%time model=load_model('./model/lstm.h5')

CPU times: user 20.5 s, sys: 8.22 s, total: 28.7 s
Wall time: 23.5 s


#### get baseline empty state

In [150]:
# Baseline: the class probabilities the model outputs for a blank input. Later
# cells compare n-gram predictions against this vector to detect "no signal".
# nlp.pipe takes an iterable of texts, so the single blank text is wrapped in a
# list (a bare ' ' only worked because iterating a 1-character string yields it).
val_docs = list(nlp.pipe([' ']))
val_X=get_features(val_docs,max_length=model.input_shape[1])

# First (and only) row of the prediction matrix: one probability per class.
emptyState=model.predict_proba(val_X)[0,:]

### calculate NER score for each segment

In [None]:
# For every sampled description, build a (token x class) score table:
# each n-gram (minLength <= n < number of tokens) is classified, predictions
# that are indistinguishable from the blank-input baseline are zeroed, and the
# remaining class probabilities are added to every token the n-gram covers.
# myML collects one table per description, in the order of inTestStrS.
myML=[]
minLength=3
seq_len=model.input_shape[1]  # loop-invariant input width of the model
n_classes=len(le.classes_)
for sent in tqdm(inTestStrS):
    # Replace runs of non-alphanumeric characters by a space, lower-case,
    # split on single spaces and drop the empty strings left by repeated spaces.
    sent=re.sub(r'[^a-zA-Z0-9 ]+', ' ', sent)
    tokens=[token for token in sent.lower().split(' ') if token!='']
    # Float accumulator, one row per token (tokens may repeat, so rows are
    # addressed by position, never by label).
    scoreM=numpy.zeros((len(tokens),n_classes),dtype=float)
    # NOTE(review): the upper bound excludes the n-gram spanning the whole
    # description (see the commented-out alternative) — confirm this is intended.
    #for n_gram in range(1,len(tokens)+1):
    for n_gram in range(minLength,len(tokens)):
        grams=[" ".join(gram) for gram in ngrams(tokens,n_gram)]
        val_docs = list(nlp.pipe(grams))
        val_X=get_features(val_docs,max_length=seq_len)
        predictM=numpy.array(model.predict_proba(val_X),dtype=float)
        # Total absolute deviation from the blank-input baseline below 0.1
        # is treated as "no prediction" and contributes nothing.
        empty_mask=numpy.abs(predictM-emptyState).sum(axis=1)<0.1
        predictM[empty_mask]=0
        # The i-th n-gram covers tokens i .. i+n_gram-1; add its class
        # probabilities to each of those token rows.
        for i in range(len(grams)):
            scoreM[i:i+n_gram]+=predictM[i]
    scoreDf=pd.DataFrame(scoreM,columns=le.classes_,index=tokens)
    myML.append(scoreDf)






  0%|          | 0/1000 [00:00<?, ?it/s][A[A[A[A



  0%|          | 2/1000 [00:00<02:55,  5.68it/s][A[A[A[A



  0%|          | 3/1000 [00:00<02:51,  5.83it/s][A[A[A[A



  0%|          | 5/1000 [00:00<02:11,  7.58it/s][A[A[A[A



  1%|          | 7/1000 [00:00<01:59,  8.31it/s][A[A[A[A



  1%|          | 9/1000 [00:00<01:47,  9.26it/s][A[A[A[A



  1%|          | 11/1000 [00:01<02:59,  5.51it/s][A[A[A[A



  1%|          | 12/1000 [00:02<03:17,  5.00it/s][A[A[A[A



  1%|▏         | 14/1000 [00:02<02:56,  5.59it/s][A[A[A[A



  2%|▏         | 16/1000 [00:02<02:45,  5.95it/s][A[A[A[A



  2%|▏         | 18/1000 [00:03<03:12,  5.10it/s][A[A[A[A



  2%|▏         | 19/1000 [00:03<03:19,  4.92it/s][A[A[A[A



  2%|▏         | 20/1000 [00:04<03:29,  4.68it/s][A[A[A[A



  2%|▏         | 22/1000 [00:04<03:27,  4.71it/s][A[A[A[A



  2%|▏         | 23/1000 [00:04<03:30,  4.65it/s][A[A[A[A



  2%|▏         | 24/1000 [00:05<03:

In [None]:
#scoreDf.iloc[i+j]

In [None]:
# Stack the per-description score tables into one frame; the concat keys make
# index level 0 the original free text and level 1 the token.
mergedDf=pd.concat(myML,keys=list(inTestStrS))

In [None]:
#mergedDf

In [None]:
# Mark rows whose token consists only of digits. Tokens live in index level 1
# (level 0 is the free-text key added by pd.concat), so the test must read
# level 1; the pattern is a raw string so that \d is not an invalid escape.
numeric_token_m=mergedDf.index.get_level_values(1).str.contains(r'^\d+$')

In [None]:
# Drop the rows flagged by numeric_token_m; copy so later column assignments
# do not touch mergedDf.
mergedSubDf=mergedDf[~numeric_token_m].copy()#.loc[:,mergedDf.columns!='age']

In [None]:
# Add an 'undetected' pseudo-class column that idxmax can fall back to.
threshold=0.100000
mergedSubDf['undetected']=threshold
# The comparison below also counts the 'undetected' column itself (it equals
# threshold), so the row sum is 1 only when no class column reaches threshold.
# AmbigM is therefore True for every row where at least one class does, and
# those rows get undetected=1.0.
# NOTE(review): the name suggests the intent was "more than one class above
# threshold" — confirm whether the mask should be computed before the
# 'undetected' column is added.
AmbigM=(mergedSubDf>=threshold).sum(axis=1)!=1 #this mark screw up the confounding boundary
mergedSubDf.loc[AmbigM,'undetected']=1.0
#mergedSubDf.loc[:,'undetected']=threshold

In [None]:
# Predicted label per token: the column (class or 'undetected') with the
# highest score in each row, as a one-column frame.
predDf=mergedSubDf.idxmax(axis=1).to_frame()

In [None]:
# Display the per-token predictions.
predDf

In [None]:
# Name the index levels and flag digit-only tokens so they can be excluded
# from the export. The pattern is a raw string so \d is not an invalid escape.
predDf.index.names=['Freetext','Token']
predDf['token_numeric']=predDf.index.get_level_values('Token').str.contains(r'^\d+$')

In [None]:
# Export the non-numeric token predictions for manual curation. The context
# manager saves and closes the workbook even if to_excel raises.
with pd.ExcelWriter('./Results/for_curation.xlsx') as excel:
    predDf[~predDf['token_numeric']].to_excel(excel)

# scratch

In [None]:
# NOTE(review): `asdasd` is not defined anywhere in the visible notebook, so this
# cell raises NameError. Presumably a deliberate stop so that "Run All" halts
# before the scratch section — confirm, or delete the cell.
asdasd

In [None]:
# Display the predictions for tokens not flagged as numeric.
predDf[~predDf['token_numeric']]

In [None]:
# Print the full path of the exported workbook (shell command).
!echo $PWD/./Results/for_curation.xlsx

In [None]:
#predDf

In [None]:
# Print the full path of the exported workbook (duplicate of an earlier cell).
!echo $PWD/./Results/for_curation.xlsx

In [None]:
# Also dump the non-numeric token predictions to tmp.csv in the working directory.
predDf[~predDf['token_numeric']].to_csv('tmp.csv')

In [None]:
#!rm tmp.csv

In [None]:
import numpy as np

In [None]:
#replace with duplicated states
#emptyStat=np.array([0.42332533, 0.4360587 , 0.61020947, 0.42082471, 0.4110575 ,
#       0.42533568, 0.47932082])

In [None]:
# Estimate the "no signal" score row as the most frequent row of mergedSubDf:
# grouping on every column and counting gives one entry per distinct row, and
# the index label of the largest count is that row's values as a tuple.
# as_index=False is not passed: with it, newer pandas returns a DataFrame from
# size(), and sort_values() without `by` then raises.
# NOTE: this rebinds emptyState, which earlier cells use for the model's
# blank-input baseline; re-run that cell before re-running the scoring loop.
emptyState=mergedSubDf.groupby(mergedSubDf.columns.tolist()).size().sort_values().index[-1]
#emptyStat=np.array([0.42332533, 0.4360587 , 0.61020947, 0.42082471, 0.4110575 ,
#       0.42533568, 0.47932082])
print (emptyState)

In [None]:
# Rows whose every column is within 0.1 of the empty-state row carry no prediction.
noPredM=((mergedSubDf-emptyState).abs()<0.1).all(axis=1)
# Show rows that carry a prediction AND have exactly one column above 0.5.
# The comparison is parenthesised because `&` binds tighter than `==`:
# without it the expression evaluated ((~noPredM) & count) == 1.
mergedSubDf[(~noPredM)&((mergedSubDf>0.5).sum(axis=1)==1)]

In [None]:
#good examples: HAP1 LMTK3-KO cells, stimulated with WNT3, replicate R2

In [None]:
# Show the token scores of one hard-coded description; raises KeyError if that
# text is not among the level-0 keys of mergedSubDf.
mergedSubDf.loc['HAP1 CCK4-KO cells, stimulated with RESV, replicate R1']

In [None]:
# Show the raw score values of the first row. The original `mergedSubDf[]`
# had an empty subscript, which is a syntax error.
# NOTE(review): a row filter may have been intended inside the brackets — confirm.
mergedSubDf.iloc[0].values

In [None]:
# Heatmap of the current scoreDf, transposed so classes are rows and tokens are columns.
sns.heatmap(data=scoreDf.T)

In [None]:
## Among the scores above the cut-off, look for tokens with a single class set.
# Plots the boolean table scoreDf>0.6 (note: 0.6 here, not 0.5), transposed.
sns.heatmap(data=(scoreDf>0.6).T)

In [None]:
# Same transposed score heatmap as two cells above (duplicate cell).
sns.heatmap(data=scoreDf.T)

In [None]:
### Among the ones with a clear boundary, it can classify well.
# The triple-quoted string below is a bare expression used as a note; it has no
# effect. The cell's displayed output is scoreDf.


"""
argue that it can salvage the data correctly. Among those sentences, 

take >0.5 as boundary, run top 10000 sentences 
"""
scoreDf

### show one

In [None]:
# Pick one free-text description to inspect by hand (alternative example kept
# commented out).
#sent='Human histone H3 di-methylated at lysine 4 (H3K4me2) in human blood CD4+ T cells, targeted using Abcam antibody ab7766'#inTestStrS.iloc[5]
sent='Human HapMap individual NA18500'
print(sent)

In [None]:
# Classify every single token (n_gram=1) of `sent` and plot the class
# probabilities as an annotated heatmap, one row per token.
# NOTE(review): unlike the batch scoring loop, the text is not cleaned or
# lower-cased here and empty tokens are kept — confirm this is intended.
#sent=re.sub(r'[^a-zA-Z0-9 ]+', ' ', sent)
#print (sent)
tokens=re.split(pattern=' ',string=sent)
s=tokens
#print (tokens)
# scoreDf is re-initialised to zeros here but not updated in this cell (the
# update loop only survives inside the string literal at the end).
scoreDf=pd.DataFrame(columns=le.classes_,index=tokens).fillna(0)
#for n_gram in range(1,len(tokens)+1):
#for n_gram in range(1,len(tokens)):
n_gram=1
grams=list(map(lambda L:" ".join(L),list(ngrams(s,n_gram))))
#print (grams)
val_docs = list(nlp.pipe(grams))
val_X=get_features(val_docs,max_length=model.input_shape[1])
tmpDf=pd.DataFrame(data=model.predict_proba(val_X),columns=le.classes_,index=grams)
#tmpDf=pd.DataFrame(data=predictM,columns=le.classes_,index=grams)
# Zero predictions whose total absolute deviation from emptyState is below 0.01
# (the batch loop uses 0.1).
# NOTE(review): emptyState is whatever was bound last; a scratch cell rebinds it
# to the most frequent score row — confirm which baseline is wanted here.
empty_mask=(tmpDf-emptyState).abs().sum(axis=1)<0.01
tmpDf[empty_mask]=0

fig,ax=plt.subplots(figsize=(3,2.5*4))
sns.heatmap(tmpDf,annot=True,ax=ax,vmin=0,vmax=1.0,fmt='.2f',cbar=None)
#ax.set_xticklabels([])
# The string below holds disabled code; as the last expression it is never executed.
"""break

#each n gram only advange 
for i,gram in enumerate(tmpDf.index):# for ec
    for j,one_gram in enumerate(gram.split(' ')):
        scoreDf.iloc[i+j]=numpy.maximum(scoreDf.iloc[i+j],(tmpDf.iloc[i]))
"""

In [None]:
#for each one, makes a prediction on the term, to see what it is supposed to be. 

### test one

In [None]:
# Score the current `sent` token by token: for every n-gram length from 1 up to
# len(tokens)-1, classify each n-gram and keep, per token and class, the
# maximum probability over all n-grams covering that token.
sent=re.sub(r'[^a-zA-Z0-9 ]+', ' ', sent)
#print (sent)
# NOTE(review): unlike the batch scoring loop, tokens are neither lower-cased
# nor stripped of empty strings here — confirm this is intended.
tokens=re.split(pattern=' ',string=sent)
s=tokens
#print (tokens)
scoreDf=pd.DataFrame(columns=le.classes_,index=tokens).fillna(0)
#for n_gram in range(1,len(tokens)+1):
for n_gram in range(1,len(tokens)):
    grams=list(map(lambda L:" ".join(L),list(ngrams(s,n_gram))))
    #print (grams)
    val_docs = list(nlp.pipe(grams))
    val_X=get_features(val_docs,max_length=model.input_shape[1])
    predictM=model.predict_proba(val_X)

    tmpDf=pd.DataFrame(data=predictM,columns=le.classes_,index=grams)
    # Zero predictions whose total absolute deviation from emptyState is below
    # 0.01 (the batch loop uses 0.1).
    empty_mask=(tmpDf-emptyState).abs().sum(axis=1)<0.01
    tmpDf[empty_mask]=0

    """
    each n gram only advange 
    """
    # The i-th n-gram covers token rows i .. i+n_gram-1; take the element-wise
    # maximum of the stored scores and this n-gram's probabilities.
    for i,gram in enumerate(tmpDf.index):# for each n-gram
        for j,one_gram in enumerate(gram.split(' ')):
            scoreDf.iloc[i+j]=numpy.maximum(scoreDf.iloc[i+j],(tmpDf.iloc[i]))


In [None]:
# Zero every score at or below the threshold, then add an 'undetected' column
# equal to the threshold so that idxmax returns 'undetected' for tokens with
# no surviving class score. Modifies scoreDf in place.
threshold=0.2
scoreDf[scoreDf<=threshold]=0
scoreDf['undetected']=threshold

scoreDf.idxmax(axis=1)

In [None]:
# Annotated heatmap of the thresholded scores (tokens as rows).
sns.heatmap(scoreDf,annot=True)

In [None]:
# Display the token list of the description scored above.
tokens

In [None]:
# Same annotated heatmap, transposed (classes as rows, tokens as columns).
sns.heatmap(scoreDf.T,annot=True)

In [None]:
# All entries whose attribute name (index level 1) is 'sex'.
sexS=srsS[srsS.index.get_level_values(1)=='sex']

In [None]:
# Frequency of each distinct 'sex' value.
sexS.value_counts()