In [1]:
import pandas as pd
import numpy as np

## K-mers

A k-mer is a part of a string of longitude k. This substrings are created consequently. So, all the k-mers of a string are all the consecutive substrings contained in a string.
The aim of this script is to generate a dictionary with all the possible k-mers we found among all the species. The dictionary have the possible k-mers as keys and the values will be the origin and the Scaffold where this k-mers have been found in the original string.

In [None]:
# To load all of them at once and afterwards accessing one by one, this might be an option.
#https://towardsdatascience.com/a-simple-trick-to-load-multiple-excel-worksheets-in-pandas-3fae4124345b
# Define filepath
filepath = '../Data/Raw/Tables_Filtered_IK_format.xlsx'

# Load Excel file using Pandas with `sheet_name=None`
df_dict = pd.read_excel(filepath, sheet_name=None)

# Preview
#df_dict

# Get a specific one
#human = df_dict.get('Human')

# aprox 3 min 40 secs

In [None]:
def preprocessing(specie):
    
    specie['Gene_non_or'] = specie['Locus'].str.split('(\d+)').str[0]
    specie['Gene'] = specie['Gene_non_or'] + specie['Strand']
    specie['Origin_Scaffold'] = specie['Origin'] + '_' + specie['Replicon Accession']
    #specie.reset_index(inplace= True)

    specie = specie[(specie['Gene_non_or'].str.contains('LOC') == False) & (specie['Gene_non_or'].str.isspace() == False) & (specie['Gene_non_or'] != '-') ]  

    return specie

In [None]:
# Data cleaning for each specie
df_species = pd.DataFrame()

species = df_dict.keys()

for s in species:
    aux = df_dict.get(s)
    aux['Origin'] = s
    df_species = pd.concat([df_species, aux]) #= pd.concat(df_species, aux)

#species = preprocessing(s)

In [None]:
df_species = preprocessing(df_species)

In [40]:
df_species

Unnamed: 0,#Replicon Name,Replicon Accession,Start,Stop,Strand,GeneID,Locus,Protein product,Length,Protein name,Origin,Unnamed: 10,Locus tag,Gene_non_or,Gene,Origin_Scaffold
7,Un,NW_001878251.4,61071,65540,+,563697,znf995,XP_017209820.1,358,gastrula zinc finger protein XlCGF8.2DB,Danio,,,znf,znf+,Danio_NW_001878251.4
26,Un,NW_001884452.4,167019,169622,+,108183944,si:ch211-198g14.4,XP_021326470.1,266,actin-binding protein-like isoform X2,Danio,,,si:ch,si:ch+,Danio_NW_001884452.4
37,Un,NW_003039384.4,161124,165069,+,100136843,zgc:174680,NP_001108034.1,504,uncharacterized protein LOC100136843,Danio,,,zgc:,zgc:+,Danio_NW_003039384.4
38,Un,NW_003039384.4,165289,171203,-,541543,zgc:112970,XP_021326739.1,660,uncharacterized protein LOC541543 isoform X2,Danio,,,zgc:,zgc:-,Danio_NW_003039384.4
52,Un,NW_003334026.1,60531,70535,-,795664,wu:fa56d06,XP_009296936.1,516,uncharacterized protein LOC795664 isoform X3,Danio,,,wu:fa,wu:fa-,Danio_NW_003334026.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18400,MT,NC_023889.1,9918,10214,+,18982988,ND4L,YP_009022039.1,98,NADH dehydrogenase subunit 4L (mitochondrion),Orca,,,ND,ND+,Orca_NC_023889.1
18401,MT,NC_023889.1,10208,11585,+,18982989,ND4,YP_009022040.1,459,NADH dehydrogenase subunit 4 (mitochondrion),Orca,,,ND,ND+,Orca_NC_023889.1
18402,MT,NC_023889.1,11787,13607,+,18982990,ND5,YP_009022041.1,606,NADH dehydrogenase subunit 5 (mitochondrion),Orca,,,ND,ND+,Orca_NC_023889.1
18403,MT,NC_023889.1,13591,14118,-,18982991,ND6,YP_009022042.1,175,NADH dehydrogenase subunit 6 (mitochondrion),Orca,,,ND,ND-,Orca_NC_023889.1


In [None]:
df_genes = df_species[['Gene_non_or', 'Origin_Scaffold']]
sca = df_genes['Origin_Scaffold'].unique()
sca = pd.DataFrame(sca)
sca.rename(columns={0:'Scaffold'}, inplace=True)

In [16]:
df_genes['Next Gene'] = df_genes['Gene_non_or'].shift(-1)
df_genes['2 Next Gene'] = df_genes['Gene_non_or'].shift(-2)
df_genes['3 Next Gene'] = df_genes['Gene_non_or'].shift(-3)
df_genes['4 Next Gene'] = df_genes['Gene_non_or'].shift(-4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genes['Next Gene'] = df_genes['Gene_non_or'].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genes['2 Next Gene'] = df_genes['Gene_non_or'].shift(-2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genes['3 Next Gene'] = df_genes['Gene_non_or'].shift(-3)
A value is trying to be set

In [None]:
df_genes['3_mers'] = np.where((df_genes['Origin_Scaffold'].shift(-2) == df_genes['Origin_Scaffold']), df_genes['Gene_non_or'] + '_' + df_genes['Gene_non_or'].shift(-1) + '_' + df_genes['2 Next Gene'].shift(-2), np.nan)
df_genes['4_mers']= np.where((df_genes['Origin_Scaffold'].shift(-3) == df_genes['Origin_Scaffold']), df_genes['Gene_non_or'] + '_' + df_genes['Next Gene'] + '_' + df_genes['2 Next Gene'] + '_' + df_genes['3 Next Gene'], np.nan)

In [18]:
df_3_mers = df_genes[['Origin_Scaffold', '3_mers']]
df_4_mers = df_genes[['Origin_Scaffold', '4_mers']]

In [20]:
df_3_mers = df_3_mers.groupby('3_mers', as_index = False).agg(list) 
df_4_mers = df_4_mers.groupby('4_mers', as_index = False).agg(list) 

In [42]:
df_3_mers.to_csv('../Data/Intermediate/3_mers.csv', index=False)
df_4_mers.to_csv('../Data/Intermediate/4_mers.csv', index=False)
sca.to_csv('../Data/Intermediate/scaffold.csv', index=False)