In [1]:
import os, re, time
import pandas as pd
from ftfy import fix_text
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
import pandas as pd
from itertools import product



In [2]:
# Load the S800 corpus: a tab-separated file without a header row, so the
# columns are addressed by their integer positions.
s800 = pd.read_csv('data/input/S800/S800.tsv', sep='\t', header=None)

# Keep only columns 0 and 4, retaining the first row for each distinct value
# of column 4, and renumber the surviving rows from 0.
s800_x = (
    s800.loc[:, [0, 4]]
    .drop_duplicates(subset=4)
    .reset_index(drop=True)
)

In [3]:
# Reads the other species datasets: every *.PubTator file under data/input/.
#
# For each tab-separated line whose fifth field is 'Species', the fourth field
# is stored as the key and the last field as the value.
# NOTE(review): presumably field 3 is the mention text and the last field is
# the species identifier -- confirm against the PubTator files in use.

speciesdct = {}
input_dir = 'data/input/'
# sorted(): os.listdir() returns entries in arbitrary order, and the order in
# which names enter the dict determines the positional pairing built later.
for file in sorted(os.listdir(input_dir)):
    if not file.endswith('.PubTator'):
        continue
    with open(os.path.join(input_dir, file)) as f:
        for line in f:
            fields = line.rstrip('\n').split('\t')
            # Lines with fewer than five tab-separated fields have no field 4
            # to inspect; skip them instead of raising IndexError.
            if len(fields) < 5:
                continue
            if fields[4] == 'Species':
                speciesdct[fields[3]] = fields[-1]

In [4]:
# Build the combined name table and a 200-row training slice.

# Column 4 of the de-duplicated S800 frame is renamed to 'index' so that it
# lines up with the 'index' column produced by reset_index() on the dict frame.
s800 = s800_x.rename(columns={4: 'index'})
spec = pd.DataFrame.from_dict(speciesdct, orient='index').reset_index()

# Stack both sources, keep the first occurrence of every name, renumber rows.
final = (
    pd.concat([s800, spec])
    .drop_duplicates(subset=['index'])
    .reset_index(drop=True)
)
train = final.head(200)

In [5]:
# Pair names positionally: the last 904 names of `final` become S1 and the
# first 904 become S2. S1 is re-indexed from 0 so that concat(axis=1) lines it
# up row by row with S2.
names = final['index']
s1_names = names.tail(904).reset_index(drop=True).rename('S1')
s2_names = names.head(904).rename('S2')
s = pd.concat([s1_names, s2_names], axis=1)
s

Unnamed: 0,S1,S2
0,C. damasonium,Plasmodium falciparum
1,Paracercomonas marina,P falciparum
2,Paulinella chromatophora,Candida guilliermondii
3,Filoreta japonica,yeast
4,Cronartium quercuum f.sp. fusiforme,C. guilliermondii
...,...,...
899,Chironomus thummi,Callosobruchus maculatus
900,Fremyella diplosiphon,Chlamydomonas reinhardtii
901,Influenza A H1N1,Casaurina glauca
902,Picea glauca (Moench) Voss,Datisca glomerata


In [6]:
# Alternative pairing (disabled): match n x n, i.e. pair every name in `train`
# with every name in `train` via the Cartesian product. Uncommenting the two
# lines below replaces the positional S1/S2 pairing held in `s`.

# train_index = train['index']
# s = pd.DataFrame(list(product(train_index, train_index)), columns=['S1', 'S2'])

In [7]:
# Display the S1/S2 pairs that the following cells will score.
s

Unnamed: 0,S1,S2
0,C. damasonium,Plasmodium falciparum
1,Paracercomonas marina,P falciparum
2,Paulinella chromatophora,Candida guilliermondii
3,Filoreta japonica,yeast
4,Cronartium quercuum f.sp. fusiforme,C. guilliermondii
...,...,...
899,Chironomus thummi,Callosobruchus maculatus
900,Fremyella diplosiphon,Chlamydomonas reinhardtii
901,Influenza A H1N1,Casaurina glauca
902,Picea glauca (Moench) Voss,Datisca glomerata


In [8]:
def ngrams(string, n=3):
    """Normalise a name and return its overlapping character n-grams.

    Args:
        string: Value to tokenise; it is converted with ``str()`` first.
        n: Length of each character window (default 3).

    Returns:
        A list of ``n``-character strings taken from the cleaned name after it
        has been padded with one space on each side. The list is empty when
        the padded text is shorter than ``n``.
    """
    # Repair decoding problems, then lower-case and drop non-ASCII characters.
    text = fix_text(str(string))
    text = text.lower().encode("ascii", errors="ignore").decode()

    # Brackets, dots, pipes and apostrophes are deleted outright.
    unwanted = ")(.|[]{}'"
    text = re.sub('[' + re.escape(unwanted) + ']', '', text)

    # '&' is spelled out; commas and hyphens become word separators.
    for old, new in (('&', 'and'), (',', ' '), ('-', ' ')):
        text = text.replace(old, new)

    # Title-case, collapse runs of spaces, then pad so that word boundaries
    # at both ends are represented in the n-grams.
    text = re.sub(' +', ' ', text.title()).strip()
    padded = ' ' + text + ' '

    # '[,-./]' is the character range ',' to '.' plus '/'.
    # NOTE(review): the '\sBD' alternative looks unreachable here, because
    # .title() lower-cases the letter following 'B' -- confirm intent.
    padded = re.sub(r'[,-./]|\sBD', r'', padded)

    windows = zip(*(padded[offset:] for offset in range(n)))
    return [''.join(window) for window in windows]

In [9]:
# Character n-gram alternative (disabled); it would use the `ngrams` analyzer:
# vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)

# Word-level TF-IDF: the vocabulary and weights are learned from the S1 names.
vectorizer = TfidfVectorizer(min_df=1, analyzer='word', lowercase=True)
tfidf = vectorizer.fit_transform(s['S1'])

# Index the S1 vectors for single-nearest-neighbour queries. No metric is
# passed, so the estimator's default distance is used.
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1)
nbrs.fit(tfidf)

In [10]:
# Inspect the fitted TF-IDF matrix for S1 (rows = names, columns = vocabulary terms).
tfidf

<904x1077 sparse matrix of type '<class 'numpy.float64'>'
	with 1689 stored elements in Compressed Sparse Row format>

In [11]:
def getNearestN(query):
    """Find the nearest fitted neighbour of each string in ``query``.

    The strings are vectorised with the module-level ``vectorizer`` and the
    resulting matrix is passed to ``nbrs.kneighbors``.

    Args:
        query: Iterable of strings accepted by ``vectorizer.transform``.

    Returns:
        The ``(distances, indices)`` pair produced by ``nbrs.kneighbors``.
    """
    query_matrix = vectorizer.transform(query)
    neighbor_distances, neighbor_indices = nbrs.kneighbors(query_matrix)
    return neighbor_distances, neighbor_indices

# Look up, for every S2 name, its nearest neighbour among the fitted S1 names
# and time the query.
# NOTE: the neighbour is searched across the whole S1 column, so distances[i]
# is not, in general, the distance between S1[i] and S2[i]; nearest_idx[i]
# holds the position of the S1 row that was actually matched.
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
distances, nearest_idx = getNearestN(s['S2'])
t = time.perf_counter() - start_time  # elapsed seconds for the lookup

In [12]:
# Store the nearest-neighbour distance of each S2 name next to the pair.
# Plain assignment is used instead of insert(..., allow_duplicates=True):
# on a fresh run the column still lands in position 2 (after S1 and S2), and
# re-running the cell overwrites it rather than silently adding a second
# 'Distance' column. ravel() flattens the (n, 1) array returned by kneighbors.
s['Distance'] = distances.ravel()

In [13]:
# Display the pairs together with the new Distance column.
s

Unnamed: 0,S1,S2,Distance
0,C. damasonium,Plasmodium falciparum,1.000000
1,Paracercomonas marina,P falciparum,1.000000
2,Paulinella chromatophora,Candida guilliermondii,0.738196
3,Filoreta japonica,yeast,0.765367
4,Cronartium quercuum f.sp. fusiforme,C. guilliermondii,1.000000
...,...,...,...
899,Chironomus thummi,Callosobruchus maculatus,1.000000
900,Fremyella diplosiphon,Chlamydomonas reinhardtii,1.000000
901,Influenza A H1N1,Casaurina glauca,0.765367
902,Picea glauca (Moench) Voss,Datisca glomerata,1.000000


In [14]:
# First, scale the distances to the [0, 1] range with a min-max scaler, then
# round them to two decimals.
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler expects a 2-D array, hence the single-column reshape.
x = s['Distance'].values.reshape(-1, 1)
min_max_scaler = MinMaxScaler()
# index=s.index keeps the scaled values aligned with the rows they came from.
scaled_conf = pd.Series(min_max_scaler.fit_transform(x).reshape(-1), index=s.index)

# Plain assignment instead of insert(..., allow_duplicates=True): identical
# column position on a fresh run, but re-running the cell overwrites the
# column rather than silently creating a duplicate 'Scaled_Distance'.
s['Scaled_Distance'] = scaled_conf.apply(lambda value: round(value, 2))

In [15]:
# Turn the scaled distance into a similarity-style score: 1 for the smallest
# distance in the column, 0 for the largest.
# NOTE(review): this is a rescaled distance rather than a calibrated
# probability, and it reflects S2's closest S1 anywhere in the column.
proba = 1 - s['Scaled_Distance']

# Plain assignment instead of insert(..., allow_duplicates=True) so that
# re-running the cell cannot add a second 'Probability' column.
s['Probability'] = proba

In [16]:
# How many rows share each Probability value, most frequent first.
s['Probability'].value_counts()

0.06    542
1.00    104
0.28     46
0.31     24
0.15     10
0.21     10
0.17      9
0.16      9
0.08      8
0.30      8
0.25      7
0.33      7
0.46      7
0.26      6
0.40      6
0.07      6
0.27      5
0.24      5
0.41      5
0.38      4
0.35      4
0.18      4
0.45      4
0.53      4
0.63      4
0.22      3
0.19      3
0.13      3
0.11      3
0.09      3
0.10      3
0.03      3
0.37      3
0.39      3
0.23      3
0.14      3
0.57      2
0.43      2
0.34      2
0.29      2
0.42      2
0.51      2
0.32      2
0.04      1
0.12      1
0.44      1
0.36      1
0.01      1
0.20      1
0.54      1
0.05      1
0.00      1
Name: Probability, dtype: int64

In [17]:
# Display the pairs with Distance, Scaled_Distance and Probability.
s

Unnamed: 0,S1,S2,Distance,Scaled_Distance,Probability
0,C. damasonium,Plasmodium falciparum,1.000000,0.94,0.06
1,Paracercomonas marina,P falciparum,1.000000,0.94,0.06
2,Paulinella chromatophora,Candida guilliermondii,0.738196,0.69,0.31
3,Filoreta japonica,yeast,0.765367,0.72,0.28
4,Cronartium quercuum f.sp. fusiforme,C. guilliermondii,1.000000,0.94,0.06
...,...,...,...,...,...
899,Chironomus thummi,Callosobruchus maculatus,1.000000,0.94,0.06
900,Fremyella diplosiphon,Chlamydomonas reinhardtii,1.000000,0.94,0.06
901,Influenza A H1N1,Casaurina glauca,0.765367,0.72,0.28
902,Picea glauca (Moench) Voss,Datisca glomerata,1.000000,0.94,0.06


In [18]:
# Rows whose Probability score is above 0.6.
s.loc[s['Probability'] > 0.6]

Unnamed: 0,S1,S2,Distance,Scaled_Distance,Probability
19,DBM,black-legged kittiwake,0.0,0.0,1.0
27,Scytosiphon lomentaria,potato,0.0,0.0,1.0
31,ring ouzel,Arabidopsis thaliana,0.0,0.0,1.0
43,white spruce,rice,0.0,0.0,1.0
45,[Picea glauca (Moench) Voss],Saccharomyces cerevisiae,0.0,0.0,1.0
...,...,...,...,...,...
836,Blumeria graminis,vaccinia virus,0.0,0.0,1.0
845,S. nodorum,E. coli,0.0,0.0,1.0
856,cinchona,Hepatitis delta virus,0.0,0.0,1.0
857,smallpox,hepatitis D,0.0,0.0,1.0
