## Gene similarity

In [20]:
import os

#retrieves gene from a file in FASTA format
def readGene(filename):
    with open(filename, 'r') as f:
        data = f.read()
    return data[data.find('\n'):].replace('\n','')
    
    
#runs the GenCompress algorithm on the given gene and returns the size of the output
def compressGene(gene,condition):
    with open('tmp','w') as tmp:
        tmp.write(gene)
        if condition:
            with open('tmp_cond','w') as tmp_cond:
                tmp_cond.write(condition)
                os.system('./GenCompress tmp -c tmp_cond')
        else:
            os.system('./GenCompress tmp')
    size = os.stat('./tmp.GEN').st_size
    os.remove('./tmp')
    os.remove('./tmp.GEN')
    os.remove('./tmp.LOG')
    if condition:
        os.remove('./tmp_cond')
    return size

#returns similarity(distance) of the given genes
def geneSimilarity(uFile,vFile):
    u = readGene(uFile)
    v = readGene(vFile)
    cu = compressGene(u,None)
    cuv = compressGene(u+v,None)
    cu_v = compressGene(u,v)
    return 1 - 1.0*(cu - cu_v)/cu_v

## Similarity matrix

In [79]:
import numpy as np
import sys

filenames = []

def getNumber(filename):
    name = filename[filename.rfind(os.path.sep)+1:]
    return int(name[:name.find('.')])

for root, _, files in os.walk('genes'):      
    for filename in files:
        if(filename.endswith('.fasta')):
            filenames.append(os.path.join(root,filename))
            
size = len(filenames)
print('Found '+str(size)+' species')
simMatrix = np.zeros((size,size))

total = int(size*size-size)
done = 0

for i in range(0,size):
    for j in range(0,size):
        if i != j:
            uFile = filenames[i];
            vFile = filenames[j];
            uIndex = getNumber(uFile)-1
            vIndex = getNumber(vFile)-1
            simMatrix[uIndex,vIndex] = geneSimilarity(uFile,vFile)
            simMatrix[vIndex,uIndex] = simMatrix[uIndex,vIndex]
            done += 1
            sys.stdout.write("\rCalculating similarity {}/{}".format(done,total))
            sys.stdout.flush()
print('\nDone')

Found 29 species
Calculating similarity 812/812
Done


## Nearest neighbours

In [81]:
from tabulate import tabulate

simTmp = simMatrix.copy()
for i in range(0,size):
    simTmp[i,i]=1

nn = np.argmin(simTmp,0)

with open('species','r') as speciesFile:
    species = speciesFile.read().split('\n')[:-1]

width = max([len(s) for s in species]) + 1

nnPairs = [(species[i],species[nn[i]]) for i in range(0,size)]

print(tabulate(nnPairs,headers=['Species','Nearest neighbour'],tablefmt="fancy_grid"))


╒═══════════════════╤═════════════════════╕
│ Species           │ Nearest neighbour   │
╞═══════════════════╪═════════════════════╡
│ baboon            │ common chimpanzee   │
├───────────────────┼─────────────────────┤
│ cat               │ harbor seal         │
├───────────────────┼─────────────────────┤
│ common chimpanzee │ bonobo chimpanzee   │
├───────────────────┼─────────────────────┤
│ bonobo chimpanzee │ common chimpanzee   │
├───────────────────┼─────────────────────┤
│ cow               │ sheep               │
├───────────────────┼─────────────────────┤
│ dog               │ gray seal           │
├───────────────────┼─────────────────────┤
│ donkey            │ horse               │
├───────────────────┼─────────────────────┤
│ fat dormouse      │ rat                 │
├───────────────────┼─────────────────────┤
│ gibbon            │ common chimpanzee   │
├───────────────────┼─────────────────────┤
│ gorilla           │ human               │
├───────────────────┼───────────