In [1]:
from itertools import product
from Bio import SeqIO
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plot
import numpy as np


# create all possible kmers
def createKmers(string, k):
    kmers = [''.join(c) for c in product(string, repeat=k)]
    return(kmers)

# count kmers in string
def countKmers(string, kmers):
    kcount = list()
    for k in kmers:
        kcount = kcount + [string.count(k)]
    return(kcount)

# normalize kmer counts by total number of kmers
def normKmers(kdict):
    norm = dict()
    for key in kdict.keys():
        l = list()
        for i in kdict[key]:
            l = l + [i/sum(kdict[key])]
        norm[key] = l
    return(norm)

# create all possible kmers for a given length of k
k = 3
kmers = createKmers('ACGT', k)

# input fasta containing all sequences to be analysed
inputFasta = 'sample.fasta'

# this dictionary will contain the kmer counts for each sequence
kmerCounts = dict()

ffile = SeqIO.parse(inputFasta, 'fasta')
# iterate through each sequence in the fasta file
for seq_record in ffile:
    # extract just the accession number to use as ID
    id = str(seq_record.id)[1:]
    
    # count the occurance of each kmer and store in the dictionary
    kmerCounts[id] = countKmers(str(seq_record.seq), kmers)

kmerNorm = normKmers(kmerCounts)
kmerNorm

# create a dataframe from the dictionary
#df = pd.DataFrame.from_dict(kmerCounts, orient='index', columns=kmers)
#dfnorm = pd.DataFrame.from_dict(kmerNorm, orient='index', columns=kmers)
dfnorm = pd.DataFrame.from_dict(kmerNorm, orient = 'index',columns = kmers)
dfnorm

Unnamed: 0,AAA,AAC,AAG,AAT,ACA,ACC,ACG,ACT,AGA,AGC,...,TCG,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT
etaCoV/Finland/1/2020|EPI_ISL_407079,0.022293,0.021213,0.020133,0.026229,0.026055,0.012923,0.005643,0.023233,0.019750,0.010450,...,0.003901,0.017521,0.021805,0.018914,0.019227,0.027726,0.030025,0.017764,0.028319,0.024313
etaCoV/Wuhan/HBCDC-HB-04/2019|EPI_ISL_412900,0.022272,0.021176,0.020221,0.026514,0.026337,0.013045,0.005480,0.023403,0.019797,0.010393,...,0.003677,0.017393,0.022024,0.018843,0.018949,0.027716,0.030297,0.017711,0.028388,0.024534
etaCoV/Shenzhen/SZTH-001/2020|EPI_ISL_406592,0.022204,0.021130,0.020091,0.026257,0.026153,0.012920,0.005681,0.023174,0.019675,0.010288,...,0.003810,0.017597,0.021650,0.019017,0.019225,0.027711,0.030032,0.017805,0.028266,0.024421
etaCoV/Beijing/IVDC-BJ-005/2020|EPI_ISL_408485,0.022240,0.021131,0.020092,0.026328,0.026051,0.012921,0.005681,0.023279,0.019676,0.010392,...,0.003915,0.017598,0.021755,0.018914,0.019191,0.027575,0.030173,0.017910,0.028164,0.024318
etaCoV/Switzerland/1000477377/2020|EPI_ISL_413020,0.022280,0.021206,0.020062,0.026265,0.026057,0.012890,0.005717,0.023354,0.019681,0.010395,...,0.003846,0.017533,0.021760,0.018954,0.019196,0.027720,0.030249,0.017775,0.028309,0.024428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
etaCoV/France/IDF0373/2020|EPI_ISL_406597,0.022207,0.021168,0.020059,0.026295,0.026087,0.012922,0.005682,0.023315,0.019678,0.010393,...,0.003880,0.017565,0.021756,0.018916,0.019193,0.027646,0.030210,0.017876,0.028270,0.024528
etaCoV/France/IDF0372-isl/2020|EPI_ISL_410720,0.022201,0.021162,0.020053,0.026288,0.026080,0.012919,0.005680,0.023309,0.019672,0.010390,...,0.003879,0.017560,0.021750,0.018945,0.019187,0.027638,0.030201,0.017871,0.028262,0.024521
etaCoV/France/IDF0386-islP1/2020|EPI_ISL_411219,0.022201,0.021162,0.020053,0.026288,0.026080,0.012919,0.005680,0.023309,0.019672,0.010390,...,0.003879,0.017560,0.021750,0.018945,0.019187,0.027638,0.030201,0.017871,0.028262,0.024521
etaCoV/France/IDF0386-islP3/2020|EPI_ISL_411220,0.022201,0.021162,0.020053,0.026288,0.026080,0.012919,0.005680,0.023309,0.019672,0.010390,...,0.003879,0.017560,0.021750,0.018945,0.019187,0.027638,0.030201,0.017871,0.028262,0.024521
