In [1]:
import alignment_visualization as av
import pandas as pd
import phylogenetic_tree as pt
from pathlib import Path
#import k_mers_metric as kmm
#import annotation_length
#import read_annotations as ra

%load_ext autoreload
%autoreload 2



# Metric

Let us understand how the metric is computed and why we are getting negative values.

The metric is computed as follows:  

$1 - \left( \prod_{isprime(k)}\underbrace{\left(1 - \frac{common\_k\_mers}{generated\_k\_mers}\right)}_{probability} \cdot \underbrace{\left( 1 -\frac{n}{m\cdot k^2} \right)}_{reducing factor} \right)$

Hence, we have to multiplicators: the prpbability and the reducing factor, each one with its own ingredients.  
Let us dive in these calculations. 


Loading the data of the comparison between the Zebra finch species and all the other species in our dataset.

In [2]:
# This file contains all the data elements needed for computing the metric.
# Load them to analyse
pre_prob = pd.read_csv('../Data/Intermediate/k_mers_counter.csv')

In [3]:
pre_prob.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   species              318 non-null    object 
 1   k                    318 non-null    int64  
 2   total_k_mers         318 non-null    int64  
 3   length               318 non-null    int64  
 4   common_k_mers        318 non-null    float64
 5   total_k_mers_origin  318 non-null    int64  
 6   length_origin        318 non-null    int64  
 7   generated_k_mers     318 non-null    float64
dtypes: float64(2), int64(5), object(1)
memory usage: 20.0+ KB


In [4]:
pre_prob.rename(columns={'species':'species2'}, inplace=True)

# Add a column in pos 0 of pre_prob
pre_prob.insert(0, 'origin', 'Zebra finch')

# Sort the table by species2 and k
pre_prob.sort_values(by=['species2', 'k'], inplace=True)

In [5]:
pre_prob.head()

Unnamed: 0,origin,species2,k,total_k_mers,length,common_k_mers,total_k_mers_origin,length_origin,generated_k_mers
4,Zebra finch,Aardvark,2,21677,16131,22618.0,19505,13483,18564.0
6,Zebra finch,Aardvark,3,25385,16131,9740.0,23859,13483,39504.0
7,Zebra finch,Aardvark,5,25266,16131,4000.0,25777,13483,47043.0
8,Zebra finch,Aardvark,7,23450,16131,2039.0,25720,13483,47131.0
0,Zebra finch,Aardvark,11,20324,16131,599.0,25406,13483,45131.0


Relate formula ingredients with table fields:

$\prod_1^{k\_max}\underbrace{(1 - \frac{common\_k\_mers}{generated\_k\_mers})}_{probability} \cdot \underbrace{( 1 -\frac{n}{m\cdot k^2} )}_{reducing factor}$

- common_k_mers: are computed by comparing the k_mers of two species. This is the number of k_mers that are common between the two species. When comparing, we search for the EXACT same k-mer, MUST BE EQUAL.

- generated_k_mers: computed by the formula:   
$generated\_k\_mers = total\_k\_mers + total\_k\_mers\_origin - common\_k\_mers$  
This is the number of k_mers that are generated by the two species.
Observe that if $common\_k\_mers > generated\_k\_mers$, then probability is negative!

- $n$, $m$ are the length of each species. So, we will use the length and length_origin columns. Always we will have $n<m$, i.e., denominator greater than nominator.


Let us compute the probability and the reducing factor for the Zebra finch species and the other species.

In [6]:
pre_prob['probability'] = 1 - (pre_prob['common_k_mers'] / pre_prob['generated_k_mers'])

# Compute reducing factor
# We need to know the shortest and longest length
shortest_length = pre_prob[['length', 'length_origin']].min(axis=1)
longest_length = pre_prob[['length', 'length_origin']].max(axis=1)

pre_prob['reducing_factor'] = (1 - (shortest_length / (longest_length * pow(pre_prob['k'], 2)))).astype(float)


In [7]:
pre_prob.head()

Unnamed: 0,origin,species2,k,total_k_mers,length,common_k_mers,total_k_mers_origin,length_origin,generated_k_mers,probability,reducing_factor
4,Zebra finch,Aardvark,2,21677,16131,22618.0,19505,13483,18564.0,-0.21838,0.791039
6,Zebra finch,Aardvark,3,25385,16131,9740.0,23859,13483,39504.0,0.753443,0.907128
7,Zebra finch,Aardvark,5,25266,16131,4000.0,25777,13483,47043.0,0.914971,0.966566
8,Zebra finch,Aardvark,7,23450,16131,2039.0,25720,13483,47131.0,0.956738,0.982942
0,Zebra finch,Aardvark,11,20324,16131,599.0,25406,13483,45131.0,0.986728,0.993092


In [8]:
# Check when the probability column is negative
pre_prob[pre_prob['probability'] < 0]

Unnamed: 0,origin,species2,k,total_k_mers,length,common_k_mers,total_k_mers_origin,length_origin,generated_k_mers,probability,reducing_factor
4,Zebra finch,Aardvark,2,21677,16131,22618.0,19505,13483,18564.0,-0.21838,0.791039
22,Zebra finch,Agile Gracile Mouse Opossum,2,20581,14397,25742.0,19505,13483,14344.0,-0.794618,0.765871
39,Zebra finch,Asiatic toad,2,17899,12886,21342.0,19505,13483,16062.0,-0.328726,0.761069
54,Zebra finch,Australian saltwater crocodile,2,18401,12681,24414.0,19505,13483,13492.0,-0.809517,0.764871
67,Zebra finch,Chicken,2,21647,15454,29220.0,19505,13483,11932.0,-1.448877,0.781885
95,Zebra finch,Dingo,2,23333,16545,27824.0,19505,13483,15014.0,-0.853204,0.796268
108,Zebra finch,Domestic cat,2,24334,17369,27012.0,19505,13483,16827.0,-0.605277,0.805933
120,Zebra finch,Fence lizard,2,19900,13668,26164.0,19505,13483,13241.0,-0.975984,0.753384
149,Zebra finch,Human,2,34774,32521,27708.0,19505,13483,26571.0,-0.042791,0.896352
158,Zebra finch,Lion,2,22786,16220,26388.0,19505,13483,15903.0,-0.65931,0.792186


In [9]:
pre_prob[pre_prob['probability'] < 0].count()[0]

18

In [10]:
# Check when the reducing_factor column is negative
pre_prob[pre_prob['reducing_factor'] < 0]

Unnamed: 0,origin,species2,k,total_k_mers,length,common_k_mers,total_k_mers_origin,length_origin,generated_k_mers,probability,reducing_factor


Only the probability column is negative, the reducing factor is always positive.  

Observe: this happens not for all species, but always when k=2!

Since the probability is negative and it happens just once (odd number), we are getting negative values. 

In [11]:
pre_prob.species2.nunique()

31

## Why the probability is negative?
Failing scenarios:
1. common_k_mers is greater than generated_k_mers
2. common_k_mers is greater than total_k_mers or total_k_mers_origin

### First scenario
Check when common_k_mers is greater than generated_k_mers.

In [12]:
#common_k_mers > generated_k_mers
pre_prob[pre_prob['common_k_mers'] > pre_prob['generated_k_mers']]

Unnamed: 0,origin,species2,k,total_k_mers,length,common_k_mers,total_k_mers_origin,length_origin,generated_k_mers,probability,reducing_factor
4,Zebra finch,Aardvark,2,21677,16131,22618.0,19505,13483,18564.0,-0.21838,0.791039
22,Zebra finch,Agile Gracile Mouse Opossum,2,20581,14397,25742.0,19505,13483,14344.0,-0.794618,0.765871
39,Zebra finch,Asiatic toad,2,17899,12886,21342.0,19505,13483,16062.0,-0.328726,0.761069
54,Zebra finch,Australian saltwater crocodile,2,18401,12681,24414.0,19505,13483,13492.0,-0.809517,0.764871
67,Zebra finch,Chicken,2,21647,15454,29220.0,19505,13483,11932.0,-1.448877,0.781885
95,Zebra finch,Dingo,2,23333,16545,27824.0,19505,13483,15014.0,-0.853204,0.796268
108,Zebra finch,Domestic cat,2,24334,17369,27012.0,19505,13483,16827.0,-0.605277,0.805933
120,Zebra finch,Fence lizard,2,19900,13668,26164.0,19505,13483,13241.0,-0.975984,0.753384
149,Zebra finch,Human,2,34774,32521,27708.0,19505,13483,26571.0,-0.042791,0.896352
158,Zebra finch,Lion,2,22786,16220,26388.0,19505,13483,15903.0,-0.65931,0.792186


In [13]:
pre_prob[pre_prob['common_k_mers'] > pre_prob['generated_k_mers']].count()[0]

18

Oh! We find the same species failing, which makes sense.

### Second scenario
Check when common_k_mers is greater than total_k_mers or total_k_mers_origin.

In [14]:
# Check if there is any case where common_k_mers is greater than total_k_mers or total_k_mers_origin
pre_prob[pre_prob.common_k_mers > pre_prob.total_k_mers]

Unnamed: 0,origin,species2,k,total_k_mers,length,common_k_mers,total_k_mers_origin,length_origin,generated_k_mers,probability,reducing_factor
4,Zebra finch,Aardvark,2,21677,16131,22618.0,19505,13483,18564.0,-0.21838,0.791039
22,Zebra finch,Agile Gracile Mouse Opossum,2,20581,14397,25742.0,19505,13483,14344.0,-0.794618,0.765871
39,Zebra finch,Asiatic toad,2,17899,12886,21342.0,19505,13483,16062.0,-0.328726,0.761069
54,Zebra finch,Australian saltwater crocodile,2,18401,12681,24414.0,19505,13483,13492.0,-0.809517,0.764871
67,Zebra finch,Chicken,2,21647,15454,29220.0,19505,13483,11932.0,-1.448877,0.781885
85,Zebra finch,Coelacanth,2,15150,12890,16790.0,19505,13483,17865.0,0.060174,0.760995
95,Zebra finch,Dingo,2,23333,16545,27824.0,19505,13483,15014.0,-0.853204,0.796268
108,Zebra finch,Domestic cat,2,24334,17369,27012.0,19505,13483,16827.0,-0.605277,0.805933
120,Zebra finch,Fence lizard,2,19900,13668,26164.0,19505,13483,13241.0,-0.975984,0.753384
158,Zebra finch,Lion,2,22786,16220,26388.0,19505,13483,15903.0,-0.65931,0.792186


In [15]:
# Now, with total_k_mers_origin
pre_prob[pre_prob.common_k_mers > pre_prob.total_k_mers_origin]

Unnamed: 0,origin,species2,k,total_k_mers,length,common_k_mers,total_k_mers_origin,length_origin,generated_k_mers,probability,reducing_factor
4,Zebra finch,Aardvark,2,21677,16131,22618.0,19505,13483,18564.0,-0.21838,0.791039
14,Zebra finch,African clawed frog,2,33020,25084,22698.0,19505,13483,29827.0,0.239012,0.865622
22,Zebra finch,Agile Gracile Mouse Opossum,2,20581,14397,25742.0,19505,13483,14344.0,-0.794618,0.765871
39,Zebra finch,Asiatic toad,2,17899,12886,21342.0,19505,13483,16062.0,-0.328726,0.761069
54,Zebra finch,Australian saltwater crocodile,2,18401,12681,24414.0,19505,13483,13492.0,-0.809517,0.764871
67,Zebra finch,Chicken,2,21647,15454,29220.0,19505,13483,11932.0,-1.448877,0.781885
95,Zebra finch,Dingo,2,23333,16545,27824.0,19505,13483,15014.0,-0.853204,0.796268
108,Zebra finch,Domestic cat,2,24334,17369,27012.0,19505,13483,16827.0,-0.605277,0.805933
120,Zebra finch,Fence lizard,2,19900,13668,26164.0,19505,13483,13241.0,-0.975984,0.753384
149,Zebra finch,Human,2,34774,32521,27708.0,19505,13483,26571.0,-0.042791,0.896352


The same... Should be something wrong with common_k_mers computation.  
But it is also strange that happens when k=2, let us check how many k-mers we have when k=2 wrt greater k values. There should be more than any other k values. 

## Check original data

In [16]:
# Open the file with the number of k-mers per replicon
k_mers_count = pd.read_feather('../Data/Intermediate/k_mers/relaxed_count_k_mers_replicon_accession.feather')

In [17]:
k_mers_count.k = k_mers_count.k.astype('int')

In [18]:
k_mers_count.sort_values(by=['species', 'replicon_accession', 'k'], inplace=True)

In [19]:
k_mers_count.head()

Unnamed: 0,species,replicon_accession,k,total_k_mers
4,Aardvark,NW_006921588.1,2,118
7,Aardvark,NW_006921588.1,3,117
13,Aardvark,NW_006921588.1,5,115
18,Aardvark,NW_006921588.1,7,113
0,Aardvark,NW_006921588.1,11,109


total_k_mers value should be decreasing with k, but there are some cases which this is not happening.  
Let us see when is not happening.

In [20]:
# I want to know if the number of total_k_mers is increasing or decreasing
# within every replicon_accession
k_mers_count['increase'] = k_mers_count.groupby(['species', 'replicon_accession'])['total_k_mers'].diff()

In [21]:
# Is there any positive increase?
not_increasing = k_mers_count[k_mers_count.increase > 0]
not_increasing

Unnamed: 0,species,replicon_accession,k,total_k_mers,increase
1389,Aardvark,NW_006921667.1,3,45,1.0
4691,African clawed frog,NC_054371.1,3,1786,5.0
4716,African clawed frog,NC_054372.1,3,1352,3.0
4741,African clawed frog,NC_054373.1,3,1482,4.0
4766,African clawed frog,NC_054374.1,3,1261,1.0
...,...,...,...,...,...
28539,Zebra Finch,NC_044226.2,3,346,1.0
28564,Zebra Finch,NC_044227.2,3,322,3.0
28598,Zebra Finch,NC_044229.2,3,257,1.0
28898,Zebra Finch,NC_044241.2,3,621,3.0


Here are the rows which have greater values than the previous ones.

For which species?

In [22]:
not_increasing.species.unique()

array(['Aardvark', 'African clawed frog', 'Agile Gracile Mouse Opossum',
       'Asiatic toad', 'Australian saltwater crocodile', 'Chicken',
       'Coelacanth', 'Dingo', 'Domestic cat', 'Fence lizard',
       'Great White Shark', 'Human', 'Lion', 'Loggerhead turtle',
       'Mexican tetra', 'Monito del monte', 'Platypus', 'Rhesus Macaque',
       'Sheep', 'Thorny Skate', "Townsend's dwarf sphaero",
       'West African lungfish', 'Western painted turtle',
       'Whitespotted Bambooshark', 'Yellow-footed Antechinus',
       'Zebra Finch'], dtype=object)

In [23]:
not_increasing.species.nunique()

26

In [24]:
not_increasing.k.unique()

array([ 3,  5,  7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53])

In [25]:
# Species and replicon_accession and k with positive increase
not_increasing[['species', 'replicon_accession', 'k', 'increase']].drop_duplicates().to_csv('../Data/Intermediate/k_mers/relaxed_count_k_mers_replicon_accession_increase.csv', index=False)

# Dmitrii Stop here

## Merge
Check if the merge is performing correctly.  

Let's do it for species = 'Zebra finch'.

In [26]:
# Load k_mers dataframe
df_k_mers = pd.read_feather('../Data/Intermediate/accumulated/df_k_mers_relaxed.feather')

In [38]:
level = 'species'
origin = 'Zebra Finch'

In [40]:
df_k_mers.head()

Unnamed: 0,species_gcf_id,k,k_mer,replicon_accession,species
0,GCF_000001405.40_GRCh38.p14_genomic,11,{MIR}{MIR1302-2HG}{MIR}{FAM138A}{OR4F}{MIR}{OR...,NC_000001.11,Human
1,GCF_000001405.40_GRCh38.p14_genomic,11,{MIR1302-2HG}{MIR}{FAM138A}{OR4F}{MIR}{OR4F}{L...,NC_000001.11,Human
2,GCF_000001405.40_GRCh38.p14_genomic,11,{MIR}{FAM138A}{OR4F}{MIR}{OR4F}{LINC}{FAM87B}{...,NC_000001.11,Human
3,GCF_000001405.40_GRCh38.p14_genomic,11,{FAM138A}{OR4F}{MIR}{OR4F}{LINC}{FAM87B}{LINC}...,NC_000001.11,Human
4,GCF_000001405.40_GRCh38.p14_genomic,11,{OR4F}{MIR}{OR4F}{LINC}{FAM87B}{LINC}{FAM41C}{...,NC_000001.11,Human


In [49]:
df_non_origin = df_k_mers[df_k_mers[level] != origin]

In [None]:
k_mer_non_origin = df_k_mers['k_mer'].tolist()
#35 secs

In [41]:
df_origin = df_k_mers[df_k_mers[level] == origin]

In [None]:
k_mer_origin = df_origin['k_mer'].tolist()

In [52]:
# Intersect between the two arrays
import numpy as np
common_k_mers = np.intersect1d(k_mer_origin, k_mer_non_origin)

KeyboardInterrupt: 

In [42]:
df_shared = df_origin.merge(k_mers, on=['k', 'k_mer'], how='inner')

In [43]:
df_shared.head()

Unnamed: 0,species_gcf_id_x,k,k_mer,replicon_accession_x,species_x,species_gcf_id_y,replicon_accession_y,species_y
0,GCF_003957565.2_bTaeGut1.4.pri_genomic,11,{DCBLD}{CMSS}{FILIP}{TBC1D}{NIT}{TOMM}{LNP}{TM...,NC_044211.2,Zebra Finch,GCF_003957565.2_bTaeGut1.4.pri_genomic,NC_044211.2,Zebra Finch
1,GCF_003957565.2_bTaeGut1.4.pri_genomic,11,{CMSS}{FILIP}{TBC1D}{NIT}{TOMM}{LNP}{TMEM45A}{...,NC_044211.2,Zebra Finch,GCF_003957565.2_bTaeGut1.4.pri_genomic,NC_044211.2,Zebra Finch
2,GCF_003957565.2_bTaeGut1.4.pri_genomic,11,{FILIP}{TBC1D}{NIT}{TOMM}{LNP}{TMEM45A}{TFG}{I...,NC_044211.2,Zebra Finch,GCF_003957565.2_bTaeGut1.4.pri_genomic,NC_044211.2,Zebra Finch
3,GCF_003957565.2_bTaeGut1.4.pri_genomic,11,{TBC1D}{NIT}{TOMM}{LNP}{TMEM45A}{TFG}{IMPG}{SE...,NC_044211.2,Zebra Finch,GCF_003957565.2_bTaeGut1.4.pri_genomic,NC_044211.2,Zebra Finch
4,GCF_003957565.2_bTaeGut1.4.pri_genomic,11,{NIT}{TOMM}{LNP}{TMEM45A}{TFG}{IMPG}{SENP}{TXN...,NC_044211.2,Zebra Finch,GCF_003957565.2_bTaeGut1.4.pri_genomic,NC_044211.2,Zebra Finch


Now, analyse the file with the computation numbers for the metric

In [26]:
import alignment_visualization as av
import logging

In [27]:
folder_path = '../Data/Intermediate/alignment/20230421_125415'

# Read the blocks
blocks = av.read_blocks(input_file_path=folder_path + '/blocks.csv')
logging.info('Blocks read')



# Add length information
blocks = av.merge_length('../Data/Intermediate/replicon_accession_length.feather', blocks=blocks)

In [28]:
blocks

Unnamed: 0.1,Unnamed: 0,species,replicon_accession,target_replicon_accession,comparing_species,alignment_id,start,stop,match_perc,start_replicon_accession,stop_replicon_accession
0,0,Human,NC_000024.10,NC_000024.10,"['Human', 'Chicken']",1,12709448,12859416,100.00%,276356,57203350
1,1,Chicken,NC_052532.1,NC_000024.10,"['Human', 'Chicken']",1,112178332,112248243,100.00%,51035,196308806
2,2,Human,NC_000024.10,NC_000024.10,"['Human', 'Monito del monte']",2,12709448,12859416,100.00%,276356,57203350
3,3,Monito del monte,NC_057863.1,NC_000024.10,"['Human', 'Monito del monte']",2,277778887,277959985,100.00%,182383,670632939
4,4,Human,NC_000024.10,NC_000024.10,"['Human', 'Rhesus Macaque']",3,2786989,20779666,68.63%,276356,57203350
5,5,Rhesus Macaque,NC_027914.1,NC_000024.10,"['Human', 'Rhesus Macaque']",3,81761,6082904,68.63%,81625,9195139
6,6,Human,NC_000024.10,NC_000024.10,"['Human', 'Rhesus Macaque']",4,634618,2740804,61.29%,276356,57203350
7,7,Rhesus Macaque,NC_041774.1,NC_000024.10,"['Human', 'Rhesus Macaque']",4,81585,2320117,61.29%,81458,151945197


In [29]:
blocks.to_csv(input_file_path)

NameError: name 'input_file_path' is not defined

In [None]:
df_annotations.to_csv('../Data/Intermediate/df_sheep.csv', index=False)

In [None]:
k_mers_counter = pd.read_feather('../Data/Intermediate/k_mers/strict_count_k_mers_species.feather')

df_k_mers = pd.read_feather('../Data/Intermediate/k_mers/processed/df_k_mers_relaxed.feather')

df_length = annotation_length.read_length('species')


In [None]:
species = 'Reedfish'

In [None]:
kmm.k_mers_to_metric(origin=species, level='species', k_mers_counter=k_mers_counter, df_length=df_length, df_k_mers=df_k_mers)

2023-04-26 11:13:52 - Computing shared k-mers for Reedfish species
2023-04-26 11:14:40 - Computing generated k-mers for Reedfish species
2023-04-26 11:14:40 - Computing metric for Reedfish species


Unnamed: 0,species,metric
0,Aardvark,0.961262
1,African clawed frog,0.97271
2,Agile Gracile Mouse Opossum,0.957892
3,Asiatic toad,0.954084
4,Australian saltwater crocodile,0.954941
5,Chicken,0.961235
6,Coelacanth,0.95265
7,Dingo,0.964634
8,Domestic cat,0.965553
9,Fence lizard,0.95826


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

import yaml
config_path = Path('./k_mers_generation/gff_to_k_mer_pipeline/config/config.yaml')
with open(config_path) as yaml_file:
    config = yaml.safe_load(yaml_file)

faulty_replicon_species = {}
for organism_name, organism in config['organisms'].items():
    organism_df = pd.read_feather(Path(f'../Data/Intermediate/interim/feature_enriched/{organism["filename"]}.features.feather'))
    #strict_k_mer_df = pd.read_feather(Path(f'../Data/Intermediate/k_mers/processed/{organism["filename"]}/2.strict_locus.feather'))
    relaxed_k_mer_df = pd.read_feather(Path(f'../Data/Intermediate/k_mers/processed/{organism["filename"]}/2.relaxed_locus.feather'))
    
    organism_df_replicon_number = organism_df.replicon_accession.nunique()
    #strict_k_mer_df_replicon_number = strict_k_mer_df.replicon_accession.nunique()
    relaxed_k_mer_df_replicon_number = relaxed_k_mer_df.replicon_accession.nunique()
    
    print(f'Organism: {organism_name}, org_df: {organism_df_replicon_number}, relaxed_k: {relaxed_k_mer_df_replicon_number}')
    
    if not (organism_df_replicon_number == strict_k_mer_df_replicon_number == relaxed_k_mer_df_replicon_number):
        faulty_replicon_species[organism_name] = {'org_df': organism_df_replicon_number, 'relaxed_k': relaxed_k_mer_df_replicon_number}        

Organism: golden_spiny_mouse, org_df: 33, relaxed_k: 24
Organism: elephant, org_df: 27, relaxed_k: 12
Organism: aardvark, org_df: 848, relaxed_k: 689
Organism: human, org_df: 396, relaxed_k: 297
Organism: rhesus_macaque, org_df: 40, relaxed_k: 26
Organism: platypus, org_df: 60, relaxed_k: 34
Organism: mardo, org_df: 7, relaxed_k: 6
Organism: agile_gracile_mouse_opossum, org_df: 10, relaxed_k: 7
Organism: monito_del_monte, org_df: 7, relaxed_k: 7
Organism: chicken, org_df: 79, relaxed_k: 57
Organism: zebra_finch, org_df: 48, relaxed_k: 43
Organism: townsends_dwarf_sphaero, org_df: 37, relaxed_k: 30
Organism: fence_lizard, org_df: 13, relaxed_k: 11
Organism: aeolian_wall_lizard, org_df: 19, relaxed_k: 18
Organism: western_painted_turtle, org_df: 481, relaxed_k: 334
Organism: mexican_gopher_tortoise, org_df: 25, relaxed_k: 15
Organism: loggerhead_turtle, org_df: 38, relaxed_k: 37
Organism: cat, org_df: 23, relaxed_k: 20
Organism: lion, org_df: 21, relaxed_k: 19
Organism: dingo, org_df: 41

In [None]:
faulty_replicon_species

{'golden_spiny_mouse': {'org_df': 33, 'relaxed_k': 24},
 'elephant': {'org_df': 27, 'relaxed_k': 12},
 'aardvark': {'org_df': 848, 'relaxed_k': 689},
 'human': {'org_df': 396, 'relaxed_k': 297},
 'rhesus_macaque': {'org_df': 40, 'relaxed_k': 26},
 'platypus': {'org_df': 60, 'relaxed_k': 34},
 'mardo': {'org_df': 7, 'relaxed_k': 6},
 'agile_gracile_mouse_opossum': {'org_df': 10, 'relaxed_k': 7},
 'monito_del_monte': {'org_df': 7, 'relaxed_k': 7},
 'chicken': {'org_df': 79, 'relaxed_k': 57},
 'zebra_finch': {'org_df': 48, 'relaxed_k': 43},
 'townsends_dwarf_sphaero': {'org_df': 37, 'relaxed_k': 30},
 'fence_lizard': {'org_df': 13, 'relaxed_k': 11},
 'aeolian_wall_lizard': {'org_df': 19, 'relaxed_k': 18},
 'western_painted_turtle': {'org_df': 481, 'relaxed_k': 334},
 'mexican_gopher_tortoise': {'org_df': 25, 'relaxed_k': 15},
 'loggerhead_turtle': {'org_df': 38, 'relaxed_k': 37},
 'cat': {'org_df': 23, 'relaxed_k': 20},
 'lion': {'org_df': 21, 'relaxed_k': 19},
 'dingo': {'org_df': 41, 'r

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

import yaml
config_path = Path('./k_mers_generation/gff_to_k_mer_pipeline/config/config.yaml')
with open(config_path) as yaml_file:
    config = yaml.safe_load(yaml_file)

# Check how many annotations have replicon accession with less than 2 different genes
faulty_replicon_species = {}
for organism_name, organism in config['organisms'].items():
    organism_df = pd.read_feather(Path(f'../Data/Intermediate/interim/feature_enriched/{organism["filename"]}.features.feather'))
    organism_df_less_2 = organism_df[['replicon_accession', 'locus']].groupby(['replicon_accession'], as_index=False).nunique().loc[lambda x: x['locus'] < 2]

    if len(organism_df_less_2) > 0:
        faulty_replicon_species[organism_name] = organism_df_less_2.replicon_accession.count()
    
faulty_replicon_species

In [None]:
organism_df_less_2.head()

Unnamed: 0_level_0,locus
replicon_accession,Unnamed: 1_level_1
NW_024599809.1,1
NW_024599828.1,1


In [None]:
faulty_replicon_species

{'golden_spiny_mouse': 2,
 'elephant': 1,
 'aardvark': 156,
 'human': 77,
 'rhesus_macaque': 14,
 'platypus': 19,
 'mardo': 1,
 'agile_gracile_mouse_opossum': 3,
 'chicken': 20,
 'zebra_finch': 5,
 'townsends_dwarf_sphaero': 7,
 'fence_lizard': 2,
 'western_painted_turtle': 146,
 'mexican_gopher_tortoise': 3,
 'loggerhead_turtle': 1,
 'cat': 3,
 'lion': 2,
 'dingo': 2,
 'asiatic_toad': 105,
 'west_african_lungfish': 1,
 'reedfish': 1,
 'mexican_tetra': 3,
 'whitespotted_bambooshark': 98,
 'thorny_skate': 54,
 'great_white_shark': 59,
 'coelacanth': 784,
 'sheep': 2}

In [None]:
rep_acc_feather = pd.read_feather('../Data/Intermediate/replicon_accession_length.feather')

In [None]:
rep_acc_feather.to_csv('../Data/Intermediate/rep_acc_length.csv')

In [None]:
rep_acc_feather[rep_acc_feather['replicon_accession'] == 'NC_000024.10']

Unnamed: 0,replicon_accession,length,start,stop
23,NC_000024.10,599,284188,57196372


In [None]:
# Read the blocks
blocks = av.read_blocks(input_file_path=folder_path + '/blocks.csv')
logging.info('Blocks read')


# Add length information
blocks = av.merge_length('../Data/Intermediate/replicon_accession_length.feather', blocks=blocks)

In [None]:
blocks

Unnamed: 0.1,Unnamed: 0,species,replicon_accession,target_replicon_accession,comparing_species,alignment_id,start,stop,match_perc,start_replicon_accession,stop_replicon_accession
0,1,Chicken,NC_052532.1,NC_000024.10,"['Human', 'Chicken']",1,112178332,112248243,100.00%,60815,196308409
1,0,Human,NC_000024.10,NC_000024.10,"['Human', 'Chicken']",1,12709448,12859416,100.00%,284188,57196372
2,2,Human,NC_000024.10,NC_000024.10,"['Human', 'Monito del monte']",2,12709448,12859416,100.00%,0,57196372
3,4,Human,NC_000024.10,NC_000024.10,"['Human', 'Rhesus Macaque']",3,2786989,20779666,68.63%,0,57196372
4,6,Human,NC_000024.10,NC_000024.10,"['Human', 'Rhesus Macaque']",4,634618,2740804,61.29%,0,57196372
5,3,Monito del monte,NC_057863.1,NC_000024.10,"['Human', 'Monito del monte']",2,277778887,277959985,100.00%,182519,670632862
6,5,Rhesus Macaque,NC_027914.1,NC_000024.10,"['Human', 'Rhesus Macaque']",3,81761,6082904,68.63%,81625,9194841
7,7,Rhesus Macaque,NC_041774.1,NC_000024.10,"['Human', 'Rhesus Macaque']",4,9276426,11514958,61.29%,9194841,161139631


In [None]:
locks_start_stop = av.recalculate_start_stop(blocks)

In [None]:
rep_acc_feather

Unnamed: 0,replicon_accession,length,start,stop
0,NC_000001.11,21757,65565,248930126
1,NC_000002.12,16439,41608,241900369
2,NC_000003.12,12589,319777,198038667
3,NC_000004.12,8282,53489,190175198
4,NC_000005.10,9437,92232,181368225
...,...,...,...,...
5590,NW_026041599.1,46,116055,501685
5591,NW_026041621.1,13,423515,467986
5592,NW_026041720.1,130,9192,141926
5593,NW_026041731.1,1,141890,142543


In [None]:

av.alignment_vis(path='../Data/S_W_Intermediate/Scaffold/Human_Gorilla/', timestamp='20230306_093245')

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\bertr\miniconda3\envs\thesis\lib\site-packages\IPython\core\interactiveshell.py", line 3442, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\bertr\AppData\Local\Temp\ipykernel_11664\4135374952.py", line 1, in <module>
    av.alignment_vis(path='../Data/S_W_Intermediate/Scaffold/Human_Gorilla/', timestamp='20230306_093245')
  File "c:\Users\bertr\LRZ Sync+Share\Thesis\Code\alignment_visualization.py", line 226, in alignment_vis
    blocks = merge_length('../Data/Intermediate/replicon_accession_length.feather', blocks)
  File "c:\Users\bertr\LRZ Sync+Share\Thesis\Code\alignment_visualization.py", line 35, in merge_length
    blocks = blocks.merge(length, how='left', left_on='replicon_accession')
  File "c:\Users\bertr\miniconda3\envs\thesis\lib\site-packages\pandas\core\frame.py", line 10093, in merge
    return merge(
  File "c:\Users\bertr\miniconda3\envs\thesis\lib\site-packages\pandas\core\reshape\

In [None]:
# This script will update all the files depending on:
# - annotations (after feature_enriched step)
# - k-mers (after k_mers step)


import pandas as pd
import logging

import annotation_length
import read_k_mers as rkm
import count_k_mers as ckm
import read_annotations as ra

df_annotations = ra.read_annotations('../Data/Intermediate/Dmitrii/interim/feature_enriched/')

# Update length file
annotation_length.both_length(df_annotations)
logging.info('Length files updated')

# Update k-mers feather file
rkm.update_k_mers('../Data/Intermediate/k_mers/processed/', mode='strict')
rkm.update_k_mers('../Data/Intermediate/k_mers/processed/', mode='relaxed')
logging.info('K-mers files updated')

# Now, read the k-mers, as they are now updated
df_k_mers = pd.read_feather('../Data/Intermediate/k_mers/processed/df_k_mers.feather')

# Update k-mers counter feather file
ckm.count_k_mers(df_k_mers)
logging.info('K-mers counter files updated')
logging.info('All files updated')

ArrowMemoryError: realloc of size 2147483648 failed

In [None]:
# Update k-mers feather file
rkm.update_k_mers('../Data/Intermediate/k_mers/processed/', mode='strict')

In [None]:
input_file_path = '../Data/Intermediate/k_mers/processed/'
mode='strict'

In [None]:
import read_k_mers as rkm
import pandas as pd

In [None]:
data_list = rkm.process_folder(input_file_path, mode=mode)
print('Process finished')

# Convert the list of dictionaries into a single DataFrame, including the species and k columns
df_list = pd.concat([d['df'].assign(species_gcf_id=d['species_gcf_id'], k=d['k']) for d in data_list], ignore_index=True)
df_list = df_list[['species_gcf_id', 'k', 'k_mer', 'replicon_accession']]           #.rename(columns={'replicon_accession': 'scaffold'})
print('Concat finished')

# Create a column in the dataframe with the species name
file_species = rkm.map_species_to_file()
df_list['species'] = df_list['species_gcf_id'].map(file_species)

# Save the dataframe as a feather file
#df_list.to_feather('../Data/Intermediate/k_mers/processed/df_k_mers_' + mode + '.feather')


In [None]:
df_list.to_feather('../Data/Intermediate/k_mers/processed/df_k_mers_' + mode + '.feather')


In [None]:
df_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7692395 entries, 0 to 7692394
Data columns (total 5 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   species_gcf_id      object
 1   k                   object
 2   k_mer               object
 3   replicon_accession  object
 4   species             object
dtypes: object(5)
memory usage: 293.4+ MB
