In [2]:
import pandas as pd
import numpy as np
import swco

## K-mers

A k-mer is a part of a string of longitude k. This substrings are created consequently. So, all the k-mers of a string are all the consecutive substrings contained in a string.
The aim of this script is to generate a dictionary with all the possible k-mers we found among all the species. The dictionary have the possible k-mers as keys and the values will be the Specie and the Scaffold where this k-mers have been found in the Specieal string.

In [3]:
# To load all of them at once and afterwards accessing one by one, this might be an option.
#https://towardsdatascience.com/a-simple-trick-to-load-multiple-excel-worksheets-in-pandas-3fae4124345b
# Define filepath
filepath = '../Data/Raw/Tables_Filtered_IK_format.xlsx'

# Load Excel file using Pandas with `sheet_name=None`
df_dict = pd.read_excel(filepath, sheet_name=None)

# Preview
#df_dict

# Get a specific one
#human = df_dict.get('Human')

# aprox 3 min 40 secs

In [4]:
# Data cleaning for each specie
df_species = pd.DataFrame()

species = df_dict.keys()

for s in species:
    aux = df_dict.get(s)
    aux['Specie'] = s
    df_species = pd.concat([df_species, aux])

In [5]:
df_species = swco.preprocessing(df_species)
df_species = swco.cons_duplicates_kmers(df_species)

In [6]:
# Take a look why they appear two indices
df_species.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389222 entries, 0 to 389221
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Index               389222 non-null  int32  
 1   #Replicon Name      389222 non-null  object 
 2   Replicon Accession  389222 non-null  object 
 3   Start               389222 non-null  int64  
 4   Stop                389222 non-null  int64  
 5   Strand              389222 non-null  object 
 6   GeneID              367765 non-null  object 
 7   Locus               389222 non-null  object 
 8   Protein product     389222 non-null  object 
 9   Length              389222 non-null  int64  
 10  Protein name        389221 non-null  object 
 11  Unnamed: 10         25281 non-null   float64
 12  Specie              389222 non-null  object 
 13  Locus tag           47639 non-null   object 
 14  Geneid              21457 non-null   float64
 15  Gene_non_or         389222 non-nul

In [7]:
df_genes = df_species[['Gene_non_or', 'Index']]
(df_genes[['Index', 'Gene_non_or']]
                                    .groupby('Gene_non_or', as_index = False)
                                    .agg(Index=('Index', list), Num_Index=('Gene_non_or', 'count')))

Unnamed: 0,Gene_non_or,Index,Num_Index
0,,"[181167, 186864, 220972, 220998, 221024, 22103...",523
1,A,"[3361, 3365, 5904, 7122, 9055, 10593, 14041, 1...",147
2,AA,"[221541, 222841, 224880, 225893, 232421, 23899...",7
3,AAAS,"[10145, 16901, 37436, 51919, 62498, 71390, 788...",23
4,AAAS.L,"[346682, 365885]",2
...,...,...,...
11722,ZYMND,"[96912, 111335]",2
11723,ZYX,"[13909, 15878, 32117, 50907, 62924, 71526, 868...",23
11724,ZYX.S,"[358106, 371283]",2
11725,ZZEF,"[3543, 21922, 36676, 53430, 55072, 85652, 9385...",27


In [8]:
df_genes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389222 entries, 0 to 389221
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Gene_non_or  389222 non-null  object
 1   Index        389222 non-null  int32 
dtypes: int32(1), object(1)
memory usage: 4.5+ MB


In [None]:
df_genes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389222 entries, 0 to 389221
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Gene_non_or      389222 non-null  object
 1   Index            389222 non-null  object
 2   Specie_Scaffold  389222 non-null  object
 3   2_mers           354134 non-null  object
 4   2_mers_index     354134 non-null  object
dtypes: object(5)
memory usage: 14.8+ MB


In [12]:
# 1-mers
df_genes = df_species[['Gene_non_or', 'Index', 'Specie_Scaffold']]
df_genes['Index'] = df_genes['Index'].astype('str')

(df_genes[['Index', 'Gene_non_or']]
                                    .groupby('Gene_non_or', as_index = False)
                                    .agg(Index=('Index', list), Num_Index=('Gene_non_or', 'count'))
                                    .to_csv('../Data/Intermediate/k_mers/Index/1_mer.csv', index=False))

# 2-mers
df_genes.loc[df_genes['Specie_Scaffold'].shift(-1) == df_genes['Specie_Scaffold'], '2_mers'] = df_genes['Gene_non_or'] + '_' + df_genes['Gene_non_or'].shift(-1)
                                
df_genes.loc[df_genes['Specie_Scaffold'].shift(-1) == df_genes['Specie_Scaffold'], '2_mers_index'] = df_genes['Specie_Scaffold'] + '_' + df_genes['Index'] + '_' + df_genes['Index'].shift(-1)

k = 2
(df_genes[['Index', str(k) + '_mers', str(k) + '_mers_index']]
                                    .groupby(str(k) + '_mers', as_index = False)
                                    .agg(Index=(str(k) + '_mers_index', list), Num_Index=(str(k) + '_mers', 'count'))
                                    .to_csv('../Data/Intermediate/k_mers/Index/' + str(k) + '_mers.csv', index=False))

# 3-mers onwards
for k in range(3, 9):
    df_genes.loc[df_genes['Specie_Scaffold'].shift(-(k-1)) == df_genes['Specie_Scaffold'], str(k) + '_mers'] = df_genes[str(k-1)+'_mers'] + '_' + df_genes['Gene_non_or'].shift(-(k-1))
    
                                    
    df_genes.loc[df_genes['Specie_Scaffold'].shift(-(k-1)) == df_genes['Specie_Scaffold'], str(k) + '_mers_index'] = df_genes['Specie_Scaffold'] + '_' + df_genes['Index'] + '_' + df_genes['Index'].shift(-(k-1))

    (df_genes[['Index', str(k) + '_mers', str(k) + '_mers_index', str(k-1) + '_mers', str(k-1) + '_mers_index']]
                                        .groupby(str(k) + '_mers', as_index = False)
                                        .agg(Index=(str(k-1) + '_mers_index', list), Num_Index=(str(k) + '_mers_index', 'count'))
                                        .to_csv('../Data/Intermediate/k_mers/Index/' + str(k) + '_mers.csv', index=False))

    del df_genes[str(k-1) + '_mers'], df_genes[str(k-1) + '_mers_index']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genes['Index'] = df_genes['Index'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genes['2_mers'] = np.where((df_genes['Specie_Scaffold'].shift(-1) == df_genes['Specie_Scaffold']),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genes['2_mers_index'] = np.where((df_genes['Spec