In [1]:
import os, re
import pandas as pd
from ftfy import fix_text
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans

In [2]:
s800 = pd.read_csv('data/input/S800-1.0/S800.tsv',sep='\t',header=None)
s800_x = s800[[0,4]].drop_duplicates(subset=4).reset_index(drop=True)

In [3]:
os.listdir('data/input/')

['.DS_Store',
 'S800-1.0 (1).tar.gz',
 'SPECIES_Train.PubTator',
 'Linnaeus_Train.PubTator',
 'BioRED_TrainDev.PubTator',
 'S800-1.0']

In [4]:
speciesdct = {}
for file in os.listdir('data/input/'):
    if file.endswith('.PubTator'):
        with open('data/input/' + file) as f:
            lines = [line.rstrip('\n') for line in f]
            for line in lines:
                if ((len(line.split('\t')) > 1)):
                    if((line.split('\t')[4] == 'Species')):
                         speciesdct[line.split('\t')[3]] = line.split('\t')[-1]

In [5]:
s800_x

Unnamed: 0,0,4
0,5833,Plasmodium falciparum
1,5833,P falciparum
2,4929,Candida guilliermondii
3,4932,yeast
4,4929,C. guilliermondii
...,...,...
1498,49186,Marinobacterium stanieri
1499,693965,LMG 25435(T
1500,693965,CAIM 1449(T)
1501,4236,lettuce


In [6]:
list(s800[4].unique())

['Plasmodium falciparum',
 'P falciparum',
 'Candida guilliermondii',
 'yeast',
 'C. guilliermondii',
 'Tuber melanosporum',
 'T. melanosporum',
 'Aspergillus nidulans',
 'D. derouxi',
 'Geleia sinica spec. nov.',
 'K. flavus',
 'K. gracilis',
 'Geleia simplex (Faure-Fremiet, 1951)',
 'Kentrophoros flavus Raikov and Kovaleva, 1968',
 'Kentrophoros gracilis Raikov, 1963',
 'tobacco',
 'Neisseria gonorrhoeae',
 'Neisseria meningitidis',
 'meningococcal',
 'black-legged kittiwake',
 'Aspergillus fumigatus',
 'Candida albicans',
 'Norway spruce',
 'Picea abies',
 'Leuciscus leuciscus',
 'Solanum lycopersicum',
 'tomato',
 'potato',
 'Dunaliella tertiolecta',
 'Pelargonium x hortorum',
 'barley',
 'Arabidopsis thaliana',
 'Lolium perenne',
 'Cotterillia bromelicola nov. gen., nov. spec.',
 'Tillandsia heterophylla',
 'Cotterillia',
 'Cotterillia bromelicola',
 'Spodoptera litura Fabricius',
 'S. litura',
 'Rana limnocharis',
 'Oramoeba fumarolia',
 'Stachyamoeba sp. ATCC50324',
 'Vrihiamoeb

In [7]:
def ngrams(string, n=3):
    string = str(string)

    # Fixes text for any possible decoding issues
    string = fix_text(string)

    # Removes non ascii chars
    string = string.lower()
    string = string.encode("ascii", errors="ignore").decode()

    # Cleaning unrelevant characters
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.title()
    string = re.sub(' +',' ',string).strip()
    string = ' '+ string +' '
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [8]:
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)

In [9]:
tfidf = vectorizer.fit_transform(list(s800[4].unique()))

In [10]:
tfidf

<1503x4625 sparse matrix of type '<class 'numpy.float64'>'
	with 23924 stored elements in Compressed Sparse Row format>

In [11]:
kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit_predict(tfidf)

In [12]:
import time
# Create the kNN-model and learn from TF-IDF matrix
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)
unique_org = s800_x[4].values.astype('U').tolist()

# We will use the Eucledian distance (which is default) of two TF-IDF matrices.
def getNearestN(query):
    # Returns a number document-term matrix
    queryTFIDF_ = vectorizer.transform(query) 
    distances, _ = nbrs.kneighbors(queryTFIDF_)
    return distances, _

start_time = time.time()
distances, _ = getNearestN(unique_org)
t = time.time() - start_time
print("Completed in:", t)

Completed in: 0.11319494247436523


In [13]:
s800 = s800_x.rename(columns={4:'index'})

In [14]:
spec = pd.DataFrame.from_dict(speciesdct, orient='index').reset_index()

In [15]:
final = pd.concat([s800, spec])

In [16]:
final = final.drop_duplicates(subset=['index']).reset_index(drop=True)

In [17]:
train = final.head(100)

In [18]:
train

Unnamed: 0,0,index
0,5833,Plasmodium falciparum
1,5833,P falciparum
2,4929,Candida guilliermondii
3,4932,yeast
4,4929,C. guilliermondii
...,...,...
95,108931,Ni. lugens
96,7029,Acyrthosiphon pisum
97,1071257,Megaphorura arctica
98,1071257,Arctic springtail


In [19]:
# s = pd.DataFrame(columns=['S1','S2'])

# count = 0
# for x in train['index']:
#     for y in train['index']:
#         s.loc[count, "S1"] = x
#         s.loc[count, "S2"] = y
#         count += 1

In [20]:
import pandas as pd
from itertools import product

train_index = train['index']

s = pd.DataFrame(list(product(train_index, train_index)), columns=['S1', 'S2'])

In [21]:
s

Unnamed: 0,S1,S2
0,Plasmodium falciparum,Plasmodium falciparum
1,Plasmodium falciparum,P falciparum
2,Plasmodium falciparum,Candida guilliermondii
3,Plasmodium falciparum,yeast
4,Plasmodium falciparum,C. guilliermondii
...,...,...
9995,M. arctica,Ni. lugens
9996,M. arctica,Acyrthosiphon pisum
9997,M. arctica,Megaphorura arctica
9998,M. arctica,Arctic springtail


In [22]:
tfidf = vectorizer.fit_transform(s['S1'])
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)


In [23]:
def getNearestN(query):
    # Returns a number document-term matrix
    queryTFIDF_ = vectorizer.transform(query) 
    distances, _ = nbrs.kneighbors(queryTFIDF_)
    return distances, _

start_time = time.time()
distances, _ = getNearestN(s['S2'])
t = time.time() - start_time

In [28]:
s.insert(2,'Distance',distances, True)

In [29]:
s

Unnamed: 0,S1,S2,Distance
0,Plasmodium falciparum,Plasmodium falciparum,0.000000e+00
1,Plasmodium falciparum,P falciparum,0.000000e+00
2,Plasmodium falciparum,Candida guilliermondii,1.490116e-08
3,Plasmodium falciparum,yeast,0.000000e+00
4,Plasmodium falciparum,C. guilliermondii,0.000000e+00
...,...,...,...
9995,M. arctica,Ni. lugens,0.000000e+00
9996,M. arctica,Acyrthosiphon pisum,1.490116e-08
9997,M. arctica,Megaphorura arctica,1.053671e-08
9998,M. arctica,Arctic springtail,0.000000e+00


In [31]:
#First scaled the distance by using min max scaler
from sklearn.preprocessing import MinMaxScaler

x = s['Distance'].values.reshape(-1,1)
min_max_scaler = MinMaxScaler()
scaled_conf = pd.Series(min_max_scaler.fit_transform(x).reshape(-1))
s.insert(3, "Scaled_Distance", scaled_conf, True)
s['Scaled_Distance'] = s['Scaled_Distance'].apply(lambda col: round(col,2))

In [32]:
proba = 1 - s['Scaled_Distance']
s.insert(4, "Probability", proba, True)

In [33]:
s

Unnamed: 0,S1,S2,Distance,Scaled_Distance,Probability
0,Plasmodium falciparum,Plasmodium falciparum,0.000000e+00,0.00,1.00
1,Plasmodium falciparum,P falciparum,0.000000e+00,0.00,1.00
2,Plasmodium falciparum,Candida guilliermondii,1.490116e-08,0.71,0.29
3,Plasmodium falciparum,yeast,0.000000e+00,0.00,1.00
4,Plasmodium falciparum,C. guilliermondii,0.000000e+00,0.00,1.00
...,...,...,...,...,...
9995,M. arctica,Ni. lugens,0.000000e+00,0.00,1.00
9996,M. arctica,Acyrthosiphon pisum,1.490116e-08,0.71,0.29
9997,M. arctica,Megaphorura arctica,1.053671e-08,0.50,0.50
9998,M. arctica,Arctic springtail,0.000000e+00,0.00,1.00
