In [1]:
import os, re, time
import pandas as pd
from ftfy import fix_text
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
import pandas as pd
from itertools import product



In [2]:
#Reads S800 Dataset

s800 = pd.read_csv('data/input/S800/S800.tsv',sep='\t',header=None)
s800_x = s800[[0,4]].drop_duplicates(subset=4).reset_index(drop=True)

In [3]:
#Reads other species datasets

speciesdct = {}
for file in os.listdir('data/input/'):
    if file.endswith('.PubTator'):
        with open('data/input/' + file) as f:
            lines = [line.rstrip('\n') for line in f]
            for line in lines:
                if ((len(line.split('\t')) > 1)):
                    if((line.split('\t')[4] == 'Species')):
                         speciesdct[line.split('\t')[3]] = line.split('\t')[-1]

In [4]:
#Constructs the train test

s800 = s800_x.rename(columns={4:'index'})
spec = pd.DataFrame.from_dict(speciesdct, orient='index').reset_index()
final = pd.concat([s800, spec])
final = final.drop_duplicates(subset=['index']).reset_index(drop=True)
train = final.head(200)

In [28]:
final

Unnamed: 0,0,index
0,5833,Plasmodium falciparum
1,5833,P falciparum
2,4929,Candida guilliermondii
3,4932,yeast
4,4929,C. guilliermondii
...,...,...
1804,7154,Chironomus thummi
1805,1197,Fremyella diplosiphon
1806,11320,Influenza A H1N1
1807,3330,Picea glauca (Moench) Voss


In [15]:
train

Unnamed: 0,0,index
0,5833,Plasmodium falciparum
1,5833,P falciparum
2,4929,Candida guilliermondii
3,4932,yeast
4,4929,C. guilliermondii
...,...,...
195,8036,charr's
196,8036,charr
197,148596,Mauritius kestrel
198,67593,Phytophthora sojae


In [16]:
# Match each entry with every entry

train_index = train['index']
s = pd.DataFrame(list(product(train_index, train_index)), columns=['S1', 'S2'])

In [17]:
s

Unnamed: 0,S1,S2
0,Plasmodium falciparum,Plasmodium falciparum
1,Plasmodium falciparum,P falciparum
2,Plasmodium falciparum,Candida guilliermondii
3,Plasmodium falciparum,yeast
4,Plasmodium falciparum,C. guilliermondii
...,...,...
39995,soybean,charr's
39996,soybean,charr
39997,soybean,Mauritius kestrel
39998,soybean,Phytophthora sojae


In [18]:
def ngrams(string, n=3):
    string = str(string)

    # Fixes text for any possible decoding issues
    string = fix_text(string)

    # Removes non ascii chars
    string = string.lower()
    string = string.encode("ascii", errors="ignore").decode()

    # Cleaning unrelevant characters
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.title()
    string = re.sub(' +',' ',string).strip()
    string = ' '+ string +' '
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [19]:
# vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=True)
tfidf = vectorizer.fit_transform(s['S1'])
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)

In [20]:
tfidf

<40000x1509 sparse matrix of type '<class 'numpy.float64'>'
	with 659600 stored elements in Compressed Sparse Row format>

In [21]:
def getNearestN(query):
    # Returns a number document-term matrix
    queryTFIDF_ = vectorizer.transform(query) 
    distances, _ = nbrs.kneighbors(queryTFIDF_)
    return distances, _

start_time = time.time()
distances, _ = getNearestN(s['S2'])
t = time.time() - start_time

In [22]:
s.insert(2,'Distance',distances, True)

In [23]:
s

Unnamed: 0,S1,S2,Distance
0,Plasmodium falciparum,Plasmodium falciparum,1.490116e-08
1,Plasmodium falciparum,P falciparum,1.053671e-08
2,Plasmodium falciparum,Candida guilliermondii,0.000000e+00
3,Plasmodium falciparum,yeast,0.000000e+00
4,Plasmodium falciparum,C. guilliermondii,0.000000e+00
...,...,...,...
39995,soybean,charr's,1.490116e-08
39996,soybean,charr,0.000000e+00
39997,soybean,Mauritius kestrel,0.000000e+00
39998,soybean,Phytophthora sojae,0.000000e+00


In [24]:
#First scaled the distance by using min max scaler
from sklearn.preprocessing import MinMaxScaler

x = s['Distance'].values.reshape(-1,1)
min_max_scaler = MinMaxScaler()
scaled_conf = pd.Series(min_max_scaler.fit_transform(x).reshape(-1))
s.insert(3, "Scaled_Distance", scaled_conf, True)
s['Scaled_Distance'] = s['Scaled_Distance'].apply(lambda col: round(col,2))

In [25]:
proba = 1 - s['Scaled_Distance']
s.insert(4, "Probability", proba, True)

In [26]:
s.Probability.value_counts()

1.00    28200
0.37     6200
0.55     4600
0.11      400
0.23      400
0.00      200
Name: Probability, dtype: int64

In [27]:
s

Unnamed: 0,S1,S2,Distance,Scaled_Distance,Probability
0,Plasmodium falciparum,Plasmodium falciparum,1.490116e-08,0.63,0.37
1,Plasmodium falciparum,P falciparum,1.053671e-08,0.45,0.55
2,Plasmodium falciparum,Candida guilliermondii,0.000000e+00,0.00,1.00
3,Plasmodium falciparum,yeast,0.000000e+00,0.00,1.00
4,Plasmodium falciparum,C. guilliermondii,0.000000e+00,0.00,1.00
...,...,...,...,...,...
39995,soybean,charr's,1.490116e-08,0.63,0.37
39996,soybean,charr,0.000000e+00,0.00,1.00
39997,soybean,Mauritius kestrel,0.000000e+00,0.00,1.00
39998,soybean,Phytophthora sojae,0.000000e+00,0.00,1.00
