In [1]:
import pandas as pd
import time
import numpy as np
from enum import IntEnum
from datetime import datetime as dt
import os
import seaborn as sns
import xlsxwriter
from matplotlib import pyplot as plt
import feather
import pathlib as Path
import logging

logging.basicConfig(
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)

# Import the functions from the other scripts
import read_k_mers as rkm
import k_mers_metric as kmm
import count_k_mers as ckm
import swco
import metric as me
import alignment as al
import annotation_length
import read_annotations as ra

%load_ext autoreload
%autoreload 2

In [2]:
# Read the annotations of all the species
# These annotations are all ready preprocessed (filtered, deduplicated and enriched with features)
# Read them after the add_features step
#df_annotations = ra.read_annotations('../Data/Intermediate/Dmitrii/interim/feature_enriched/', species=['Golden spiny mouse', 'Human'])
df_annotations = pd.read_feather('../Data/Intermediate/interim/feature_enriched/df_annotations.feather')

In [12]:
df_annotations.protein_product.nunique()

526948

In [13]:
gene_id_check = df_annotations[['gene_id', 'protein_product']].drop_duplicates()
gene_id_check.protein_product.count()

526948

In [7]:
# Check if there is any NaN in gene_id or protein_product fields
df_annotations[df_annotations['protein_product'].isna()]

Unnamed: 0,index,replicon_accession,start,stop,strand,gene_id,locus,protein_product,protein_name,replicon_length,replicon_name,species,strict_locus,relaxed_locus,species_gcf_id
27349,27349,NC_000002.12,88857361,88857683,-,3514,IGKC,,,242193529,2,Human,IGKC,IGKC,GCF_000001405
27350,27350,NC_000002.12,88860568,88860605,-,28946,IGKJ5,,,242193529,2,Human,IGKJ,IGKJ,GCF_000001405
27351,27351,NC_000002.12,88860886,88860923,-,28947,IGKJ4,,,242193529,2,Human,IGKJ,IGKJ,GCF_000001405
27352,27352,NC_000002.12,88861221,88861258,-,28948,IGKJ3,,,242193529,2,Human,IGKJ,IGKJ,GCF_000001405
27353,27353,NC_000002.12,88861525,88861563,-,28949,IGKJ2,,,242193529,2,Human,IGKJ,IGKJ,GCF_000001405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2278300,160994,NC_052566.1,138778,139080,+,431524,VH26L1,,Ig mu chain C region-like isoform X3,554126,35,Chicken,VH,VH26L,GCF_016699485
2278301,160995,NC_052566.1,144497,144805,+,431524,VH26L1,,Ig mu chain C region-like isoform X3,554126,35,Chicken,VH,VH26L,GCF_016699485
2278302,160996,NC_052566.1,151876,152202,+,431524,VH26L1,,Ig mu chain C region-like isoform X3,554126,35,Chicken,VH,VH26L,GCF_016699485
2278303,160997,NC_052566.1,154380,154741,+,431524,VH26L1,,Ig mu chain C region-like isoform X3,554126,35,Chicken,VH,VH26L,GCF_016699485


In [3]:
df_annotations.replicon_accession.nunique()

5595

In [14]:
df_annotations.loc[df_annotations['species'] == 'Golden spiny mouse'].to_csv('../Data/Intermediate/golden_spiny_mouse_annotations.csv', index=False)

In [4]:
# Read the length of each scaffold for all the species

# If we need to create/update the length files
# annotation_length.get_length(df_annotations, 'replicon_accession')

# If we already have the length files
df_replicon_accession_length = annotation_length.read_length('replicon_accession')

In [5]:
df_replicon_accession_length.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5595 entries, 0 to 5594
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   replicon_accession  5595 non-null   string
 1   length              5595 non-null   int64 
 2   start               5595 non-null   int64 
 3   stop                5595 non-null   int64 
dtypes: int64(3), string(1)
memory usage: 175.0 KB


In [6]:
# Take a look at the length of the replicons
df_replicon_accession_length['length'].describe()

count     5595.000000
mean       754.608043
std       2662.841903
min          1.000000
25%         12.000000
50%         37.000000
75%        129.000000
max      40050.000000
Name: length, dtype: float64

Observe there are a lot of replicon accession with small amount of genes. Do not makes sense to keep them. We will remove them.
Let us define a function which includes a threshold to remove replicons with small amount of genes.

In [7]:
# Ivan suggest to make the threshold equal to 10
df_annotations = annotation_length.remove_short_replicons(df_annotations, df_replicon_accession_length, 10)

In [8]:
df_annotations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4215465 entries, 0 to 4222031
Data columns (total 15 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   index               int64 
 1   replicon_accession  string
 2   start               int64 
 3   stop                int64 
 4   strand              string
 5   gene_id             string
 6   locus               string
 7   protein_product     string
 8   protein_name        string
 9   replicon_length     int64 
 10  replicon_name       string
 11  species             object
 12  strict_locus        string
 13  relaxed_locus       string
 14  species_gcf_id      object
dtypes: int64(4), object(2), string(9)
memory usage: 514.6+ MB


In [9]:
# Read the k-mers of all the species

# To update them (if needed)
# I want to update them once I run generate_k_mers rule from Snakemake
#rkm.update_k_mers('../Data/Intermediate/k_mers/processed/', mode='strict')

# To read them
df_k_mers = pd.read_feather('../Data/Intermediate/k_mers/processed/df_k_mers_relaxed.feather')

In [10]:
df_k_mers.replicon_accession.nunique()

3950

In [11]:
# Read the number of k-mers per scaffold

# To update them (if needed)
# I want to update them once I run generate_k_mers rule from Snakemake
#ckm.count_k_mers(df_k_mers)

# Read them
k_mers_counter = pd.read_feather('../Data/Intermediate/k_mers/strict_count_k_mers_replicon_accession.feather')


In [12]:
k_mers_counter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28594 entries, 0 to 28593
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   species             28594 non-null  object
 1   replicon_accession  28594 non-null  object
 2   k                   28594 non-null  object
 3   total_k_mers        28594 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 893.7+ KB


In [13]:
df_annotations.replicon_accession.nunique()

4347

In [14]:
df_k_mers.replicon_accession.nunique()

3950

In [15]:
k_mers_counter.replicon_accession.nunique()

3819

In [16]:
identify_no_k_mers = df_annotations.loc[~df_annotations.replicon_accession.isin(df_k_mers.replicon_accession), ['species', 'replicon_accession']]

In [17]:
identify_no_k_mers_species = identify_no_k_mers.drop_duplicates()

In [18]:
identify_no_k_mers_length = identify_no_k_mers_species.merge(df_replicon_accession_length, on='replicon_accession')

In [2]:
# Alignment


# We will align the target species or scaffold vs. all the query species or scaffold
# Always in this direction: target vs. query
# We will use the Smith-Waterman algorithm
# If those variables are empty, we will align all the species/scaffolds vs. all the species/scaffolds given in the dataframe

def alignment(mode, target_species=[], target_scaffold=[], query_species=[], query_scaffold=[], output_path='../Data/Intermediate/alignment/', df_k_mers=df_k_mers, k_mers_counter=k_mers_counter, df_replicon_accession_length=df_replicon_accession_length, df_annotations=df_annotations):

    # Create the output_folder where the alignment results and plots will be stored
    output_path = al.create_output_folder(output_path)

   # If the target_species is empty, we will align all the species vs. all the species
    if not target_species:
        target_species = df_annotations['species'].unique()

    # If the target_scaffold is empty, we will align all the scaffolds of the target_species vs. all the scaffolds of the query_species
    if not target_scaffold:
        target_scaffold = df_annotations.loc[df_annotations['species'].isin(target_species), 'replicon_accession'].unique()
    else:
        # Check if the target scaffold belongs to the target species
        if not df_annotations.loc[df_annotations['replicon_accession'].isin(target_scaffold), 'species'].isin(target_species).all():
            raise Exception('The target scaffolds introduced do not belong to one of the target species')

    # For visualization purposes, initialise some variables
    blocks = []
    align_counter = 0

    # Smith-Waterman algorithm has some parameters
    # Let's store them to write them in the output file
    # The parameters can be modified in the swco.py script
    params = pd.DataFrame({
        'Description': ['', 'Parameters', 'Match', 'Mismatch', 'First gap', 'Consecutive gap'], 
        'Value': ['', '', swco.Score.MATCH, swco.Score.MISMATCH, swco.Score.FIRST_GAP, swco.Score.CONS_GAP]})

    # When we do the alignment, we would like to keep the position of the genes in the table
    # So, we will create a new column with the index of the row
    df_annotations['index'] = df_annotations.index

    for target_sp in target_species:

        # Define the query_species
        if not query_species:
            query_species = df_annotations.loc[df_annotations['species'] != target_sp, 'species'].unique()
        # Check that target_sp is not in query_species
        elif target_sp in query_species:
            query_species.remove(target_sp)
        
        # Define the query_scaffold
        if not query_scaffold:
            query_scaffold = df_annotations.loc[df_annotations['species'].isin(query_species), 'replicon_accession'].unique()
        else:
            # Sanity check: verify that the query_scaffold belong to the query_species
            query_scaffold = df_annotations.loc[df_annotations['species'].isin(query_species) & df_annotations['replicon_accession'].isin(query_scaffold), 'replicon_accession'].unique()

        # For each target_scaffold
        for target_sc in target_scaffold:
            
            logging.info(f'Aligning {target_sp} {target_sc}')

            # Define target as the annotations of this target scaffold
            df_target = df_annotations.loc[(df_annotations['species'] == target_sp) & (df_annotations['replicon_accession'] == target_sc)]

            # Now, we want to order the query scaffolds per species by similarity to the target scaffold
            # So, we will first compare the target scaffold with the most similar query scaffolds
            # Then, we will compare the target scaffold non-matching part with the second most similar query scaffolds and so on
            # We have implemented a metric in order to compare them based in the k-mers
            # This is implemented in the k_mers_metric.py script
            
            # Filter to keep only the relevant query scaffolds for this alignment
            k_mers_counter_query = k_mers_counter.loc[k_mers_counter['replicon_accession'].isin(query_scaffold), ['species', 'replicon_accession', 'k', 'total_k_mers']]

            # Return the similar scaffolds ordered by species and similarity descending
            df_scaffold_cand = pd.DataFrame()
            df_scaffold_cand = kmm.k_mers_to_metric(target_sc, level='replicon_accession', k_mers_counter=k_mers_counter_query, df_length=df_replicon_accession_length, df_k_mers=df_k_mers)

            if df_scaffold_cand.empty:
                logging.warning(f'No similar scaffolds for {target_sp} {target_sc}')
                continue

            # Target scaffold is compared versus the similarest scaffold of one specie
            # Then, the non-matching part of the target scaffold is compared versus the second similarest scaffold of the same specie
            # So, our df_target is going to be updated after each alignment, if we compare to the same species
            # Then, we need a variable to keep the original df_target and to check if we change of query species
            df_target_original = df_target.copy()
            query_sp_old = ''

            # Create an excel file to store the alignment results per each target scaffold
            writer = pd.ExcelWriter(f'{output_path}/{target_sp}_{target_sc}.xlsx', engine='xlsxwriter')

            # Write the results and the align species variables are needed
            results = pd.DataFrame()
            align_species = pd.DataFrame()

            # Go through the list of similar scaffolds to apply Smith-Waterman algorithm
            for query_sp, query_sc in zip(df_scaffold_cand['species'], df_scaffold_cand['replicon_accession']):
                
                # Keep the similarity ratio with us
                sim_ratio = df_scaffold_cand.loc[(df_scaffold_cand['species'] == query_sp) & (df_scaffold_cand['replicon_accession'] == query_sc), 'metric'].values[0]
                logging.info(f'Aligning {target_sp} {target_sc} vs. {query_sp} {query_sc} with similarity ratio: {sim_ratio:0.4f}')
                # I do not get why this is not printing correctly sim_ratio. If I tried appart, it works.
                
                # If we change of query species, we need to update the df_target
                if query_sp != query_sp_old:
                    df_target = df_target_original.copy()
                    query_sp_old = query_sp

                    # Write the alignment results of the previous species
                    if not align_species.empty:
                        swco.write_alignment(align_species, query_sp, query_sc, writer)

                # Define query as the annotations of this query scaffold
                df_query = df_annotations.loc[(df_annotations['species'] == query_sp) & (df_annotations['replicon_accession'] == query_sc)]
                
                # Increase the align counter
                align_counter += 1

                # Initialise variables for the alignment
                aligned_query = []
                aligned_target = []
                index_query = []
                index_target = []
                matrix = [[]]

                ## Apply Smith-Waterman algorithm
                # The results are stored in a dataframe
                tic = time.perf_counter()
                aligned_target, aligned_query, index_target, index_query, max_score, max_index, max_i, max_j, matrix = swco.smith_waterman(df_target, df_query, mode=mode)
                toc = time.perf_counter()

                # Time
                logging.info(f'Alignment done in {toc - tic:0.4f} seconds')

                tic = time.perf_counter()

                ## Post processing
                # From here on, we will process the results of the alignment, transforming them into some format to report the main information.
                # That said, everything can be changed, except the blocks csv file which is used afterwards for the alignment visualization.

                # If the alignment is empty, we will not store the results
                if not aligned_target:
                    logging.info(f'No alignment found for {target_sp} {target_sc} vs. {query_sp} {query_sc}')
                    continue
                
                # Smith Waterman algorithm returns a target and a query aligned sequences, getting back the index and the locus. 
                # Merge this sequences to be able to represent them in a table
                # Moreover, merge the aligned sequences with the master information from the original sequences, such as, replicon_accession, start, end, strand, etc.
                align = pd.DataFrame()
                align = swco.merge(df_target, df_query, aligned_target, index_target, aligned_query, index_query)

                # | target locus | Result | query locus |
                # |--------------|--------|-------------|
                # | ABBB         |  Match | ABBB        |
                # | ABBB        |  Gap    | -           |
                # 
                # Scenarios like this should be tagged as a Duplicate genes, instead of a Gap
                # There is a function to do this
                # Look for the genes which have been duplicated and tag them as so, instead of tagging it as a Gap
                align = swco.duplicate(align)

                # Keep the results of the alignement per species
                align_species = pd.concat([align_species, align])

                toc = time.perf_counter()

                d = toc-tic
                logging.info(f'Postprocessed in {d:0.4f} seconds')

                ## Write the alignment results
                tic = time.perf_counter()

                # Create an aggregation of the results information and write it in the excel workbook
                # For this, we need all the information regarding the alignment and the two aligned sequences
                # Gives us back the index information for later usage
                # Also, another dataframe called blocks for doing the alignment visualization
                len_target = len(df_target)
                len_query = len(df_query)
                index_target_no_gaps, actual_results, blocks = swco.summary_results(writer, aligned_query, aligned_target, index_query, index_target, align, target_sp, target_sc, query_sp, query_sc, len_target, len_query, sim_ratio, params, blocks, align_counter)
                
                # Keep track of the results per target_scaffold
                # Aim is to have a table with some summarizing KPIs for all the alignments done
                results = pd.concat([results, actual_results])

                # Create a heatmap with the matrix information to represent which part of the sequences are more similar
                swco.heatmap(writer, matrix, target_sp, target_sc, query_sp, query_sc, output_path)
                
                toc = time.perf_counter()

                d = toc-tic
                logging.info(f'Written in {d:0.4f} seconds')

                # Remove the already aligned parts for the target scaffold
                # For doing so, we need the index of the target scaffold already mapped
                df_target = swco.remove_match_parts(df_target, index_target_no_gaps)

                # Clean variables
                del aligned_query, aligned_target, index_query, index_target, matrix, align

            # Write the last alignment results
            swco.write_alignment(align_species, query_sp, query_sc, writer)

            # Write the results dataframe
            results.to_excel(writer, sheet_name = 'Results', index = False)
            writer.sheets['Results'].activate()
            writer.close()   

    # Before saving blocks, let's add column names to the list
    blocks = pd.DataFrame(blocks)
    blocks.columns = ['species', 'replicon_accession', 'target_replicon_accession', 'comparing_species', 'alignment_id', 'start', 'stop', 'match_perc']
    blocks.to_csv(path + timestamp + '/blocks.csv', index = False)

NameError: name 'df_k_mers' is not defined

In [1]:
alignment(target_species=["Human", 'Rhesus Macaque'], query_species=['Rhesus Macaque', 'Monito del monte', 'Chicken'], mode='strict')

NameError: name 'alignment' is not defined

In [22]:
# Save to csv
df_replicon_accession_length.to_csv('../Data/Intermediate/replicon_accession_length.csv', index = False)

In [132]:
target_species=[]
target_scaffold=[]
query_species=[]
query_scaffold=[]
target_species=['Human']
mode='strict'
output_path='../Data/Intermediate/alignment/'

In [88]:
target_sp=['Human']
query_species=['Rhesus Macaque']

In [90]:
df_replicon_accession_length.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4347 entries, 0 to 4346
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   replicon_accession  4347 non-null   string
 1   length              4347 non-null   int64 
dtypes: int64(1), string(1)
memory usage: 68.0 KB


In [94]:
k_mers_counter['replicon_accession'].nunique()

901

In [34]:
kmm.k_mers_to_metric(target_sc, level='replicon_accession', k_mers_counter=k_mers_counter_query, df_length=df_replicon_accession_length, df_k_mers=df_k_mers)

2023-04-16 10:47:08 - Computing shared k-mers for: NC_000001.11 replicon_accession
2023-04-16 10:47:27 - Computing the metric for each replicon_accession


Unnamed: 0,species,replicon_accession,metric


In [60]:
level='replicon_accession'
origin = 'NC_000024.10'
#k_mers_counter = pd.read_feather('../Data/Intermediate/k_mers/strict_count_k_mers_replicon_accession.feather')

In [22]:
query_sp = 'Zebra Finch'
query_sc = 'NC_044224.2'
target_sp = 'Human'
target_sc = 'NC_000001.11'

In [61]:
df_scaffold_cand = kmm.k_mers_to_metric(origin, level='replicon_accession', k_mers_counter=k_mers_counter, df_length=df_replicon_accession_length, df_k_mers=df_k_mers)

2023-04-20 17:53:44 - Computing shared k-mers for NC_000024.10 replicon_accession
2023-04-20 17:53:59 - Computing generated k-mers for NC_000024.10 replicon_accession
2023-04-20 17:53:59 - Computing metric for NC_000024.10 replicon_accession


In [75]:
query_sp = 'Aardvark'
query_sc = 'NW_006922250.1'
target_sp = 'Human'
target_sc = 'NC_000001.11'

In [77]:
sim_ratio = df_scaffold_cand.loc[(df_scaffold_cand['species'] == query_sp) & (df_scaffold_cand['replicon_accession'] == query_sc), 'metric'].values[0]
logging.info(f'Aligning {target_sp} {target_sc} vs. {query_sp} {query_sc} with similarity ratio: {sim_ratio:0.4f}')

2023-04-20 17:58:37 - Aligning Human NC_000001.11 vs. Aardvark NW_006922250.1 with similarity ratio: 0.7538


In [65]:
sim_ratio

0.7538004810394989

In [71]:
logging.info(f'Hello {query_sc} {sim_ratio:0.4f}')

2023-04-20 17:56:31 - Hello NW_006922250.1 0.7538


In [76]:
logging.info(f'Aligning {target_sp} {target_sc} vs. {query_sp} {query_sc} with similarity ratio: {sim_ratio:0.4f}')

2023-04-20 17:57:31 - Aligning Human NC_000001.11 vs. Aardvark NW_006922250.1 with similarity ratio: 0.7538


In [62]:
df_scaffold_cand

Unnamed: 0,species,replicon_accession,metric
0,Aardvark,NW_006922250.1,0.7538
1,African clawed frog,NC_054374.1,0.981315
2,Agile Gracile Mouse Opossum,NC_058132.1,0.994229
3,Asiatic toad,NC_058082.1,0.982779
4,Asiatic toad,NC_058088.1,0.979735
5,Australian saltwater crocodile,NW_017728906.1,0.973412
6,Australian saltwater crocodile,NW_017728924.1,0.8577
7,Chicken,NC_052532.1,0.988938
9,Coelacanth,NW_005819565.1,0.968139
8,Coelacanth,NW_005819502.1,0.899555


In [53]:
k_mers_counter

Unnamed: 0,species,replicon_accession,k,total_k_mers
0,Aardvark,NW_006921588.1,11,105
1,Aardvark,NW_006921588.1,13,103
2,Aardvark,NW_006921588.1,17,99
3,Aardvark,NW_006921588.1,19,97
4,Aardvark,NW_006921588.1,2,114
...,...,...,...,...
28589,Zebra Finch,NC_054769.1,7,50
28590,Zebra Finch,NW_024545399.1,2,2
28591,Zebra Finch,NW_024545399.1,3,1
28592,Zebra Finch,NW_024545436.1,2,2


In [62]:
df_replicon_accession_length.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5595 entries, 0 to 5594
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   replicon_accession  5595 non-null   string
 1   length              5595 non-null   int64 
 2   start               5595 non-null   int64 
 3   stop                5595 non-null   int64 
dtypes: int64(3), string(1)
memory usage: 175.0 KB


In [54]:
# Add the length of the scaffold or species to the dataframe
k_mers_counter = k_mers_counter.merge(df_replicon_accession_length[[level, 'length']], on = level, how = 'left')

In [24]:
k_mers_counter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28594 entries, 0 to 28593
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   species             28594 non-null  object
 1   replicon_accession  28594 non-null  object
 2   k                   28594 non-null  object
 3   total_k_mers        28594 non-null  int64 
 4   length              28594 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.3+ MB


In [25]:

# Now, we need to compute the number of k-mers shared from the origin with the rest of elements
logging.info('Computing shared k-mers for: ' + origin + ' ' + level)
df_common_k_mers = kmm.shared_k_mers(origin, level, df_k_mers)


2023-04-19 12:46:55 - Computing shared k-mers for: NC_000001.11 replicon_accession


In [26]:
df_common_k_mers

Unnamed: 0,k,replicon_accession,common_k_mers
0,11,NC_024220.2,4
1,11,NC_024225.2,13
2,11,NC_024234.2,1
3,11,NC_041732.1,3
4,11,NC_041734.1,19
...,...,...,...
1392,7,NW_024885816.1,31
1393,7,NW_024885850.1,11
1394,7,NW_024885897.1,1
1395,71,NC_041754.1,6


In [56]:
k_mers_counter_origin = k_mers_counter.loc[k_mers_counter[level] == origin, ['k', 'total_k_mers', 'length']]

In [57]:
k_mers_counter = k_mers_counter.join(df_common_k_mers.set_index(['k', level]), on=['k', level], how='left')

In [38]:
k_mers_counter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28594 entries, 0 to 28593
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   species             28594 non-null  object 
 1   replicon_accession  28594 non-null  object 
 2   k                   28594 non-null  object 
 3   total_k_mers        28594 non-null  int64  
 4   length              28594 non-null  int64  
 5   common_k_mers       1396 non-null   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 1.5+ MB


In [58]:
k_mers_counter_origin = k_mers_counter.loc[k_mers_counter[level] == origin, ['k', 'total_k_mers', 'length']]

# Remove the rows with NaN values
k_mers_counter_common = k_mers_counter.loc[k_mers_counter['common_k_mers'].notnull()]

# Merge the two dataframes, creating a new column containing the number of k-mers generated by the original scaffold
k_mers_counter_common = k_mers_counter_common.merge(k_mers_counter_origin[['k', 'total_k_mers', 'length']], on = ['k'], how = 'left', suffixes = ('', '_origin'))



In [60]:
# Compute the number of k-mers generated by the two scaffolds
k_mers_counter_common['generated_k_mers'] = k_mers_counter_common['total_k_mers'] + k_mers_counter_common['total_k_mers_origin'] - k_mers_counter_common['common_k_mers']


In [61]:
k_mers_counter_common.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1396 entries, 0 to 1395
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   species              1396 non-null   object 
 1   replicon_accession   1396 non-null   object 
 2   k                    1396 non-null   object 
 3   total_k_mers         1396 non-null   int64  
 4   length               1396 non-null   int64  
 5   common_k_mers        1396 non-null   float64
 6   total_k_mers_origin  1396 non-null   int64  
 7   length_origin        1396 non-null   int64  
 8   generated_k_mers     1396 non-null   float64
dtypes: float64(2), int64(4), object(3)
memory usage: 109.1+ KB


In [30]:
k_mers_counter_origin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 15731 to 15755
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   k             25 non-null     object
 1   total_k_mers  25 non-null     int64 
 2   length        25 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 800.0+ bytes


In [39]:
k_mers_counter = k_mers_counter.loc[k_mers_counter['common_k_mers'].notnull()]

In [32]:
k_mers_counter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1396 entries, 132 to 28412
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   species             1396 non-null   object 
 1   replicon_accession  1396 non-null   object 
 2   k                   1396 non-null   object 
 3   total_k_mers        1396 non-null   int64  
 4   length              1396 non-null   int64  
 5   common_k_mers       1396 non-null   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 76.3+ KB


In [50]:
# Now, we have the total k-mers per scaffols and the number of k-mers shared with the rest of scaffolds, i.e., 
# the intersection between the two groups.
# We want to compute the number of k-mers generated by the two scaffolds
# We can do this by computing the number of k-mers generated by the two scaffolds and subtracting the number 
# of k-mers shared with the rest of scaffolds. Done under generated_k_mers function

k_mers_counter = kmm.generated_k_mers(origin, level, k_mers_counter)

In [51]:
k_mers_counter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1396 entries, 0 to 1395
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   species              1396 non-null   object 
 1   replicon_accession   1396 non-null   object 
 2   k                    1396 non-null   object 
 3   total_k_mers         1396 non-null   int64  
 4   length               1396 non-null   int64  
 5   common_k_mers        1396 non-null   float64
 6   total_k_mers_origin  0 non-null      float64
 7   length_origin        0 non-null      float64
 8   generated_k_mers     0 non-null      float64
dtypes: float64(4), int64(2), object(3)
memory usage: 109.1+ KB


In [45]:
k_mers_counter_common = k_mers_counter.loc[k_mers_counter['common_k_mers'].notnull()]

In [49]:
k_mers_counter_common.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1396 entries, 0 to 1395
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   species              1396 non-null   object 
 1   replicon_accession   1396 non-null   object 
 2   k                    1396 non-null   object 
 3   total_k_mers         1396 non-null   int64  
 4   length               1396 non-null   int64  
 5   common_k_mers        1396 non-null   float64
 6   total_k_mers_origin  1396 non-null   int64  
 7   length_origin        1396 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 98.2+ KB


In [47]:
k_mers_counter_origin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 15731 to 15755
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   k             25 non-null     object
 1   total_k_mers  25 non-null     int64 
 2   length        25 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 800.0+ bytes


In [48]:
# Merge the two dataframes, creating a new column containing the number of k-mers generated by the original scaffold
k_mers_counter_common = k_mers_counter_common.merge(k_mers_counter_origin[['k', 'total_k_mers', 'length']], on = ['k'], how = 'left', suffixes = ('', '_origin'))


In [43]:
k_mers_counter_common.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1396 entries, 0 to 1395
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   species              1396 non-null   object 
 1   replicon_accession   1396 non-null   object 
 2   k                    1396 non-null   object 
 3   total_k_mers         1396 non-null   int64  
 4   length               1396 non-null   int64  
 5   common_k_mers        1396 non-null   float64
 6   total_k_mers_origin  1396 non-null   int64  
 7   length_origin        1396 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 98.2+ KB


In [46]:
k_mers_counter = k_mers_counter[k_mers_counter.notna().all(axis=1)]

# Compute the probability of finding a common k-mer
k_mers_counter['probability'] = k_mers_counter['common_k_mers'] / k_mers_counter['generated_k_mers']

# First, we want to apply the desired formula:
# (1 - probability) * (1 - (m / (n*k^2)))
# Where: m, n are the length of the two scaffolds/species with m being the shortest represented in length and length_origin columns
shortest_length = k_mers_counter[['length', 'length_origin']].min(axis=1)
longest_length = k_mers_counter[['length', 'length_origin']].max(axis=1)
k_mers_counter['k'] = k_mers_counter['k'].astype(int)
k_mers_counter['metric'] = ((1 - k_mers_counter['probability']) * (1 - (shortest_length / (longest_length * pow(k_mers_counter['k'], 2))))).astype(float)
#k_mers_counter['metric'] = (1 - k_mers_counter['probability']) * (1 - (k_mers_counter[['length', 'length_origin']].apply(min, axis=1) / (k_mers_counter[['length', 'length_origin']].apply(max, axis=1) * k_mers_counter['k']**2)))



In [48]:
# Do the product of the metric for each scaffold/species among all the k values
if level == 'replicon_accession':
    logging.info('Computing the metric for each replicon_accession')
    metric = (k_mers_counter[['species', 'replicon_accession', 'metric']]
            .groupby(['species', 'replicon_accession'], as_index=False)
            .prod()
            .sort_values(by=['species', 'metric'], ascending=False))
elif level == 'species':
    logging.info('Computing the metric for each species')
    metric = (k_mers_counter[['species', 'metric']]
            .groupby('species', as_index=False)
            .prod()
            .sort_values(by=['species', 'metric'], ascending=False))


2023-04-16 10:55:19 - Computing the metric for each replicon_accession


In [106]:
metric

Unnamed: 0,species,replicon_accession,metric
38,Mexican tetra,NC_064428.1,0.948264
40,Mexican tetra,NC_064430.1,0.948007
42,Mexican tetra,NC_064432.1,0.942066
39,Mexican tetra,NC_064429.1,0.929241
36,Mexican tetra,NC_064426.1,0.928543
26,Mexican tetra,NC_064416.1,0.926792
35,Mexican tetra,NC_064425.1,0.922499
37,Mexican tetra,NC_064427.1,0.920533
27,Mexican tetra,NC_064417.1,0.918638
34,Mexican tetra,NC_064424.1,0.914867
