In [1]:
import os, re, time
import pandas as pd
import pandas as pd
from ftfy import fix_text
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from itertools import product



In [2]:
# Load the S800 table (tab-separated, no header row), then keep columns 0 and 4
# with a single row per distinct value of column 4.

s800 = pd.read_csv('data/input/S800/S800.tsv', header=None, sep='\t')
s800_x = (
    s800.loc[:, [0, 4]]
    .drop_duplicates(subset=4)
    .reset_index(drop=True)
)

In [3]:
# Collect 'Species' rows from every *.PubTator file in data/input/.
# speciesdct maps field 3 of such a row to the row's last tab-separated field;
# a later row with the same field 3 overwrites the earlier entry.

speciesdct = {}
# os.listdir returns names in arbitrary order; sorting keeps the dictionary's
# insertion order (and anything sliced from it later) stable between runs.
for file in sorted(os.listdir('data/input/')):
    if not file.endswith('.PubTator'):
        continue
    with open('data/input/' + file) as f:
        for line in f:
            # Split once and reuse the fields.
            fields = line.rstrip('\n').split('\t')
            # Rows with fewer than five tab-separated fields have no field 4
            # to test; skip them instead of raising IndexError.
            if len(fields) > 4 and fields[4] == 'Species':
                speciesdct[fields[3]] = fields[-1]

In [4]:
# Build the training frame.
# Column 4 of the S800 subset is renamed to 'index' so that it shares a column
# name with the 'index' column that reset_index() creates on the species frame.

s800 = s800_x.rename(columns={4: 'index'})
spec = pd.DataFrame.from_dict(speciesdct, orient='index').reset_index()

# Stack both sources, keep the first row seen for each 'index' value and
# renumber the rows from 0.
final = (
    pd.concat([s800, spec])
    .drop_duplicates(subset=['index'])
    .reset_index(drop=True)
)

# The first 300 rows form the training subset.
train = final.iloc[:300]

In [5]:
# Side-by-side preview: the last 904 'index' values (renumbered from 0) as S1
# next to the first 904 'index' values as S2.
s = pd.concat(
    [
        final['index'].tail(904).reset_index(drop=True).rename('S1'),
        final['index'].head(904).rename('S2'),
    ],
    axis=1,
)
s

Unnamed: 0,S1,S2
0,C. damasonium,Plasmodium falciparum
1,Paracercomonas marina,P falciparum
2,Paulinella chromatophora,Candida guilliermondii
3,Filoreta japonica,yeast
4,Cronartium quercuum f.sp. fusiforme,C. guilliermondii
...,...,...
899,Chironomus thummi,Callosobruchus maculatus
900,Fremyella diplosiphon,Chlamydomonas reinhardtii
901,Influenza A H1N1,Casaurina glauca
902,Picea glauca (Moench) Voss,Datisca glomerata


In [6]:
# Match n x n: one row for every ordered pair of training names, with S1
# varying slowest and S2 varying fastest.

train_index = train['index']
s = pd.DataFrame(
    [(left, right) for left in train_index for right in train_index],
    columns=['S1', 'S2'],
)

In [7]:
def ngrams(string, n=2):
    """Normalise ``string`` and return its overlapping character n-grams.

    Steps, in order: convert to ``str``, run ``fix_text``, lower-case, drop
    every non-ASCII character, delete the bracket/punctuation characters
    listed below, turn ``&`` into ``and`` and ``,`` / ``-`` into spaces,
    title-case, collapse runs of spaces, strip, pad with one space on each
    side, apply a final regex clean-up, and slide a window of width ``n``
    over the result.

    Args:
        string: Value to tokenise; anything accepted by ``str()``.
        n: Window width in characters (default 2).

    Returns:
        A list of ``n``-character strings, one per window position.
    """
    # Repair decoding artefacts first, then reduce to lower-case ASCII.
    text = fix_text(str(string))
    text = text.lower().encode("ascii", errors="ignore").decode()

    # Delete characters that carry no matching signal.
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    text = re.sub(rx, '', text)

    # Word-level substitutions, applied in this order.
    for old, new in (('&', 'and'), (',', ' '), ('-', ' ')):
        text = text.replace(old, new)

    # Title-case, squeeze repeated spaces, and pad so that word boundaries
    # become part of the first and last n-gram.
    text = re.sub(' +',' ', text.title()).strip()
    padded = ' '+ text +' '

    # NOTE(review): title() leaves no upper-case letter directly after another
    # letter, so the `\sBD` alternative appears unreachable here — confirm.
    padded = re.sub(r'[,-./]|\sBD',r'', padded)

    windows = zip(*(padded[offset:] for offset in range(n)))
    return [''.join(chars) for chars in windows]

In [8]:
# Learn a TF-IDF vocabulary over the character n-grams of the S1 strings.
vectorizer = TfidfVectorizer(analyzer=ngrams, min_df=1)
tfidf = vectorizer.fit_transform(s['S1'].astype('U'))

# Index the S1 vectors for single-nearest-neighbour queries on all cores.
nbrs = NearestNeighbors(n_jobs=-1, n_neighbors=1)
nbrs = nbrs.fit(tfidf)

In [9]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
tfidf

<90000x633 sparse matrix of type '<class 'numpy.float64'>'
	with 1503000 stored elements in Compressed Sparse Row format>

In [None]:
def getNearestN(query):
    """Query the fitted neighbour index with the strings in ``query``.

    The strings are vectorised with the module-level ``vectorizer`` and the
    resulting matrix is passed to ``nbrs.kneighbors``.

    Args:
        query: Iterable of strings accepted by ``vectorizer.transform``.

    Returns:
        The two values produced by ``nbrs.kneighbors``, as the tuple
        ``(distances, indices)``.
    """
    query_matrix = vectorizer.transform(query)
    distances, indices = nbrs.kneighbors(query_matrix)
    return distances, indices

# Time the nearest-neighbour lookup for every S2 string.
# perf_counter() is a monotonic, high-resolution clock, so the elapsed value
# cannot be distorted by system clock adjustments the way time.time() can.
# NOTE(review): the index was fitted on s['S1'], and s is the n x n product of
# the training names with themselves, so every S2 string also occurs in the
# fitted corpus. The nearest-neighbour distance is therefore expected to be ~0
# for every row, not the distance between that row's S1 and S2 — confirm intent.
start_time = time.perf_counter()
distances, _ = getNearestN(s['S2'].astype('U'))
t = time.perf_counter() - start_time  # elapsed seconds

In [None]:
# Store the neighbour distance as a column of s.
# Plain assignment replaces insert(..., allow_duplicates=True): re-running the
# cell overwrites the column instead of adding a second 'Distance' column,
# which would make s['Distance'] return a DataFrame downstream.
# kneighbors returns one column per neighbour; ravel() flattens it to 1-D.
s['Distance'] = distances.ravel()

In [None]:
# Display s, which now carries the Distance column.
s

# Apply sklearn's MinMaxScaler to scale the distances into the range 0 to 1

In [None]:
# Rescale the distances to the range [0, 1] with MinMaxScaler and store them,
# rounded to two decimals, in the 'Scaled_Distance' column.

x = s['Distance'].values.reshape(-1,1)  # the scaler expects a 2-D column
min_max_scaler = MinMaxScaler()
# Carry s's own index so the assignment below aligns row for row.
scaled_conf = pd.Series(min_max_scaler.fit_transform(x).reshape(-1), index=s.index)
# Plain assignment replaces insert(..., allow_duplicates=True): re-running the
# cell overwrites the column instead of adding a second 'Scaled_Distance'.
s['Scaled_Distance'] = scaled_conf.apply(lambda value: round(value, 2))

In [None]:
# Turn the scaled distance into a score in which a smaller distance gives a
# higher value.
proba = 1 - s['Scaled_Distance']
# Plain assignment replaces insert(..., allow_duplicates=True): re-running the
# cell overwrites the column instead of adding a second 'Probability' column.
s['Probability'] = proba

In [None]:
# Count how many rows share each distinct Probability value.
s['Probability'].value_counts()

In [None]:
# Display s with the Scaled_Distance and Probability columns added.
s

In [None]:
# Keep only the rows whose Probability is strictly greater than 0.6.
s.loc[s['Probability'] > 0.6]