In [1]:
import pandas as pd
import numpy as np

## K-mers

A k-mer is a part of a string of longitude k. This substrings are created consequently. So, all the k-mers of a string are all the consecutive substrings contained in a string.
The aim of this script is to generate a dictionary with all the possible k-mers we found among all the species. The dictionary have the possible k-mers as keys and the values will be the Specie and the Scaffold where this k-mers have been found in the Specieal string.

In [2]:
# To load all of them at once and afterwards accessing one by one, this might be an option.
#https://towardsdatascience.com/a-simple-trick-to-load-multiple-excel-worksheets-in-pandas-3fae4124345b
# Define filepath
filepath = '../Data/Raw/Tables_Filtered_IK_format.xlsx'

# Load Excel file using Pandas with `sheet_name=None`
df_dict = pd.read_excel(filepath, sheet_name=None)

# Preview
#df_dict

# Get a specific one
#human = df_dict.get('Human')

# aprox 3 min 40 secs

KeyboardInterrupt: 

In [None]:
# Removing consecutive duplicates without considering orientation
def cons_duplicates(data):
    data = data.sort_values(by=['Specie_Scaffold', 'Gene_non_or'])
    data['New'] = np.where((data['#Replicon Name'].shift() == data['#Replicon Name']) & (data['Replicon Accession'].shift() == data['Replicon Accession']) & (data['Gene_non_or'].shift() == data['Gene_non_or']), 0, 1)
    data['Change'] = data['New'].cumsum()
    data = data.reset_index()
    return data.groupby('Change').first()

def preprocessing(specie):
    
    specie['Gene_non_or'] = specie['Locus'].str.split('(\d+)').str[0]
    specie['Gene'] = specie['Gene_non_or'] + specie['Strand']
    specie['Specie_Scaffold'] = specie['Specie'] + '_' + specie['Replicon Accession']
    #specie.reset_index(inplace= True)

    specie = specie[(specie['Gene_non_or'].str.contains('LOC') == False) & (specie['Gene_non_or'].str.isspace() == False) & (specie['Gene_non_or'] != '-') ]  

    # Remove consecutive duplicates here to merge in to one gene
    specie = cons_duplicates(specie)

    specie = specie.reset_index()
        
    return specie

In [None]:
sorted(species)

['Aadvark',
 'Alligator M',
 'Alligator S',
 'Anolis',
 'Chelonia',
 'Chrysemys',
 'Croco',
 'Danio',
 'Devil',
 'Dog',
 'Fugu',
 'Gecko',
 'Gorilla',
 'Human',
 'KCobra',
 'Koala',
 'Monodelphis',
 'Mouse',
 'Orca',
 'Ostrich',
 'Papio',
 'Pelodiscus',
 'Pigeon',
 'Platypus',
 'Pogona',
 'Python',
 'Thamnophis',
 'Xlaevis',
 'Xtropicalis']

In [None]:
# Data cleaning for each specie
df_species = pd.DataFrame()

species = df_dict.keys()

for s in species:
    aux = df_dict.get(s)
    aux['Specie'] = s
    df_species = pd.concat([df_species, aux])

In [None]:
df_species = preprocessing(df_species)

In [None]:
df_species

Unnamed: 0,Change,index,#Replicon Name,Replicon Accession,Start,Stop,Strand,GeneID,Locus,Protein product,Length,Protein name,Specie,Unnamed: 10,Locus tag,Gene_non_or,Gene,Specie_Scaffold,New
0,1,19621,MT,NC_002078.1,7788,7991,+,808413,ATP8,NP_008579.1,67,ATP synthase F0 subunit 8 (mitochondrion),Aadvark,,,ATP,ATP+,Aadvark_NC_002078.1,1
1,2,19619,MT,NC_002078.1,5332,6879,+,808415,COX1,NP_008577.1,515,cytochrome c oxidase subunit I (mitochondrion),Aadvark,,,COX,COX+,Aadvark_NC_002078.1,1
2,3,19629,MT,NC_002078.1,14171,15310,+,808409,CYTB,NP_008587.1,379,cytochrome b (mitochondrion),Aadvark,,,CYTB,CYTB+,Aadvark_NC_002078.1,1
3,4,19617,MT,NC_002078.1,2744,3697,+,808404,ND1,NP_008575.1,317,NADH dehydrogenase subunit 1 (mitochondrion),Aadvark,,,ND,ND+,Aadvark_NC_002078.1,1
4,5,118,Un,NW_006921588.1,32266820,32366215,+,103201168,ABCB11,XP_007944027.1,1325,PREDICTED: bile salt export pump isoform X1,Aadvark,,,ABCB,ABCB+,Aadvark_NW_006921588.1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321932,321933,3478,Un,NW_016689751.1,557,1575,+,780193,acp6,NP_001072736.1,418,lysophosphatidic acid phosphatase type 6,Xtropicalis,,,acp,acp+,Xtropicalis_NW_016689751.1,1
321933,321934,3480,Un,NW_016689781.1,1617,2094,+,549260,tcf15,NP_001016506.1,183,transcription factor 15 (basic helix-loop-helix),Xtropicalis,,,tcf,tcf+,Xtropicalis_NW_016689781.1,1
321934,321935,3481,Un,NW_016689786.1,888,1010,+,448134,otud5,NP_001004849.1,518,OTU domain-containing protein 5,Xtropicalis,,,otud,otud+,Xtropicalis_NW_016689786.1,1
321935,321936,3486,Un,NW_016689912.1,326,410,-,779964,tubg1,NP_001072509.1,451,tubulin gamma 1,Xtropicalis,,,tubg,tubg-,Xtropicalis_NW_016689912.1,1


In [None]:
df_species.groupby('Specie')['New'].sum()

Specie
Aadvark        14077
Alligator M    12670
Alligator S    12873
Anolis         10632
Chelonia       11955
Chrysemys      10773
Croco          10183
Danio          18749
Devil          13787
Dog               58
Fugu            1720
Gecko          12599
Gorilla        12052
Human          14138
KCobra         13760
Koala          13782
Monodelphis     9587
Mouse          13351
Orca           13778
Ostrich        11112
Papio          12027
Pelodiscus     12230
Pigeon           539
Platypus       10216
Pogona         13223
Python         12303
Thamnophis     10512
Xlaevis         8624
Xtropicalis    10627
Name: New, dtype: int32

In [None]:
df_genes = df_species[['Gene_non_or', 'Specie_Scaffold', 'index']]

#sca = df_genes['Specie_Scaffold'].unique()
#sca = pd.DataFrame(sca)
#sca.rename(columns={0:'Scaffold'}, inplace=True)

In [None]:
df_genes['Next Gene'] = df_genes['Gene_non_or'].shift(-1)
df_genes['2 Next Gene'] = df_genes['Gene_non_or'].shift(-2)
df_genes['3 Next Gene'] = df_genes['Gene_non_or'].shift(-3)
df_genes['4 Next Gene'] = df_genes['Gene_non_or'].shift(-4)
df_genes['5 Next Gene'] = df_genes['Gene_non_or'].shift(-5)
df_genes['6 Next Gene'] = df_genes['Gene_non_or'].shift(-6)
df_genes['7 Next Gene'] = df_genes['Gene_non_or'].shift(-7)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genes['Next Gene'] = df_genes['Gene_non_or'].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genes['2 Next Gene'] = df_genes['Gene_non_or'].shift(-2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genes['3 Next Gene'] = df_genes['Gene_non_or'].shift(-3)
A value is trying to be set

In [None]:
df_genes['3_mers'] = np.where((df_genes['Specie_Scaffold'].shift(-2) == df_genes['Specie_Scaffold']), df_genes['Gene_non_or'] + '_' + df_genes['Next Gene'] + '_' + df_genes['2 Next Gene'], np.nan)
df_genes['4_mers']= np.where((df_genes['Specie_Scaffold'].shift(-3) == df_genes['Specie_Scaffold']), df_genes['Gene_non_or'] + '_' + df_genes['Next Gene'] + '_' + df_genes['2 Next Gene'] + '_' + df_genes['3 Next Gene'], np.nan)
df_genes['5_mers']= np.where((df_genes['Specie_Scaffold'].shift(-4) == df_genes['Specie_Scaffold']), df_genes['Gene_non_or'] + '_' + df_genes['Next Gene'] + '_' + df_genes['2 Next Gene'] + '_' + df_genes['3 Next Gene'] + '_' + df_genes['4 Next Gene'], np.nan)
df_genes['6_mers']= np.where((df_genes['Specie_Scaffold'].shift(-5) == df_genes['Specie_Scaffold']), df_genes['Gene_non_or'] + '_' + df_genes['Next Gene'] + '_' + df_genes['2 Next Gene'] + '_' + df_genes['3 Next Gene'] + '_' + df_genes['4 Next Gene'] + '_' + df_genes['5 Next Gene'], np.nan)
df_genes['7_mers']= np.where((df_genes['Specie_Scaffold'].shift(-6) == df_genes['Specie_Scaffold']), df_genes['Gene_non_or'] + '_' + df_genes['Next Gene'] + '_' + df_genes['2 Next Gene'] + '_' + df_genes['3 Next Gene'] + '_' + df_genes['4 Next Gene'] + '_' + df_genes['5 Next Gene'] + '_' + df_genes['6 Next Gene'], np.nan)
df_genes['8_mers']= np.where((df_genes['Specie_Scaffold'].shift(-7) == df_genes['Specie_Scaffold']), df_genes['Gene_non_or'] + '_' + df_genes['Next Gene'] + '_' + df_genes['2 Next Gene'] + '_' + df_genes['3 Next Gene'] + '_' + df_genes['4 Next Gene'] + '_' + df_genes['5 Next Gene'] + '_' + df_genes['6 Next Gene'] + '_' + df_genes['7 Next Gene'], np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genes['3_mers'] = np.where((df_genes['Specie_Scaffold'].shift(-2) == df_genes['Specie_Scaffold']), df_genes['Gene_non_or'] + '_' + df_genes['Next Gene'] + '_' + df_genes['2 Next Gene'], np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genes['4_mers']= np.where((df_genes['Specie_Scaffold'].shift(-3) == df_genes['Specie_Scaffold']), df_genes['Gene_non_or'] + '_' + df_genes['Next Gene'] + '_' + df_genes['2 Next Gene'] + '_' + df_genes['3 Next Gene'], np.nan)
A value is trying to be set on a copy of a sl

In [None]:
df_genes['Next Index'] = df_genes['index'].shift(-1).astype(str)
df_genes['2 Next Index'] = df_genes['index'].shift(-2).astype(str)
df_genes['3 Next Index'] = df_genes['index'].shift(-3).astype(str)
df_genes['4 Next Index'] = df_genes['index'].shift(-4).astype(str)
df_genes['5 Next Index'] = df_genes['index'].shift(-5).astype(str)
df_genes['6 Next Index'] = df_genes['index'].shift(-6).astype(str)
df_genes['7 Next Index'] = df_genes['index'].shift(-7).astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genes['Next Index'] = df_genes['index'].shift(-1).astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genes['2 Next Index'] = df_genes['index'].shift(-2).astype(str)


In [None]:
df_genes['Specie_Scaffold']

0                Aadvark_NC_002078.1
1                Aadvark_NC_002078.1
2                Aadvark_NC_002078.1
3                Aadvark_NC_002078.1
4             Aadvark_NW_006921588.1
                     ...            
321932    Xtropicalis_NW_016689751.1
321933    Xtropicalis_NW_016689781.1
321934    Xtropicalis_NW_016689786.1
321935    Xtropicalis_NW_016689912.1
321936    Xtropicalis_NW_016690069.1
Name: Specie_Scaffold, Length: 321937, dtype: object

In [None]:
df_genes['3_mers_index']= np.where((df_genes['Specie_Scaffold'].shift(-2) == df_genes['Specie_Scaffold']), df_genes['Specie_Scaffold'] + '_' + df_genes['index'].astype(str) + '_' + df_genes['2 Next Index'].astype(str), np.nan)
df_genes['4_mers_index']= np.where((df_genes['Specie_Scaffold'].shift(-3) == df_genes['Specie_Scaffold']), df_genes['Specie_Scaffold'] + '_' + df_genes['index'].astype(str) + '_' + df_genes['3 Next Index'].astype(str), np.nan)
df_genes['5_mers_index']= np.where((df_genes['Specie_Scaffold'].shift(-4) == df_genes['Specie_Scaffold']), df_genes['Specie_Scaffold'] + '_' + df_genes['index'].astype(str) + '_' + df_genes['4 Next Index'].astype(str), np.nan)
df_genes['6_mers_index']= np.where((df_genes['Specie_Scaffold'].shift(-5) == df_genes['Specie_Scaffold']), df_genes['Specie_Scaffold'] + '_' + df_genes['index'].astype(str) + '_' + df_genes['5 Next Index'].astype(str), np.nan)
df_genes['7_mers_index']= np.where((df_genes['Specie_Scaffold'].shift(-6) == df_genes['Specie_Scaffold']), df_genes['Specie_Scaffold'] + '_' + df_genes['index'].astype(str) + '_' + df_genes['6 Next Index'].astype(str), np.nan)
df_genes['8_mers_index']= np.where((df_genes['Specie_Scaffold'].shift(-7) == df_genes['Specie_Scaffold']), df_genes['Specie_Scaffold'] + '_' + df_genes['index'].astype(str) + '_' + df_genes['7 Next Index'].astype(str), np.nan)


In [None]:
df_genes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321937 entries, 0 to 321936
Data columns (total 29 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Gene_non_or      321937 non-null  object
 1   Specie_Scaffold  321937 non-null  object
 2   index            321937 non-null  int64 
 3   Next Gene        321936 non-null  object
 4   2 Next Gene      321935 non-null  object
 5   3 Next Gene      321934 non-null  object
 6   4 Next Gene      321933 non-null  object
 7   5 Next Gene      321932 non-null  object
 8   6 Next Gene      321931 non-null  object
 9   7 Next Gene      321930 non-null  object
 10  3_mers           265760 non-null  object
 11  4_mers           249787 non-null  object
 12  5_mers           236910 non-null  object
 13  6_mers           226193 non-null  object
 14  7_mers           217116 non-null  object
 15  8_mers           209270 non-null  object
 16  Next Index       321937 non-null  object
 17  2 Next Ind

In [None]:
df_3_mers.head()

NameError: name 'df_3_mers' is not defined

In [None]:
df_3_mers = df_genes[['3_mers_index', '3_mers']].rename(columns={'3_mers_index' : 'index'})
df_4_mers = df_genes[['4_mers_index', '4_mers']].rename(columns={'4_mers_index' : 'index'})
df_5_mers = df_genes[['5_mers_index', '5_mers']].rename(columns={'5_mers_index' : 'index'})
df_6_mers = df_genes[['6_mers_index', '6_mers']].rename(columns={'6_mers_index' : 'index'})
df_7_mers = df_genes[['7_mers_index', '7_mers']].rename(columns={'7_mers_index' : 'index'})
df_8_mers = df_genes[['8_mers_index', '8_mers']].rename(columns={'8_mers_index' : 'index'})

In [None]:
df_3_mers = df_3_mers.groupby('3_mers', as_index = False).agg(list) 
df_4_mers = df_4_mers.groupby('4_mers', as_index = False).agg(list) 
df_5_mers = df_5_mers.groupby('5_mers', as_index = False).agg(list) 
df_6_mers = df_6_mers.groupby('6_mers', as_index = False).agg(list) 
df_7_mers = df_7_mers.groupby('7_mers', as_index = False).agg(list) 
df_8_mers = df_8_mers.groupby('8_mers', as_index = False).agg(list) 

In [None]:
df_3_mers.head()

Unnamed: 0,3_mers,index
0,AAAS_AACS_ABCB,"[Gorilla_NC_018436.2_14203_14822.0, Human_NC_0..."
1,AAAS_ACVR_ACVRL,"[Aadvark_NW_006921834.1_12460_12510.0, Koala_N..."
2,AAAS_ADAMTSL_DHRS,[Thamnophis_NW_013657961.1_7057_7076.0]
3,AAAS_AGAP_AMHR,"[Gecko_NW_015173137.1_14651_14654.0, Pogona_NW..."
4,AAAS_AMHR_ASB,"[Alligator M_NW_017708309.1_2957_2965.0, Allig..."


In [None]:
df_3_mers.to_csv('../Data/Intermediate/Index/3_mers_index.csv', index=False)
df_4_mers.to_csv('../Data/Intermediate/Index/4_mers_index.csv', index=False)
df_5_mers.to_csv('../Data/Intermediate/Index/5_mers_index.csv', index=False)
df_6_mers.to_csv('../Data/Intermediate/Index/6_mers_index.csv', index=False)
df_7_mers.to_csv('../Data/Intermediate/Index/7_mers_index.csv', index=False)
df_8_mers.to_csv('../Data/Intermediate/Index/8_mers_index.csv', index=False)

In [None]:
df_query = df_8_mers

In [None]:
df_query['# Coincidences'] = df_query.

In [None]:
df_query

Unnamed: 0,8_mers,index,# Coincidences
0,AAAS_AACS_ABCB_ABCC_ABCD_ABTB_ACACB_ACAD,[Papio_NC_018162.1_13050_13347.0],191915
1,AAAS_AACS_ABCB_ABCC_ABCD_ACACB_ACAD_ACADS,"[Gorilla_NC_018436.2_14203_14786.0, Human_NC_0...",191915
2,AAAS_ACVR_ACVRL_ADAMTS_ADCY_AMHR_AMIGO_ANKRD,[Orca_NW_004438475.1_8912_8845.0],191915
3,AAAS_ACVR_ACVRL_AMHR_ANKRD_AQP_ARF_ASIC,[Koala_NW_018344117.1_14650_14571.0],191915
4,AAAS_ACVR_ACVRL_AMHR_ANKRD_ATF_ATG_ATP,[Aadvark_NW_006921834.1_12460_12450.0],191915
...,...,...,...
191910,zmpste_zmym_znf_znhit_zp_zpld_zrsr_zswim,[Xtropicalis_NC_030678.1_7163_6419.0],191915
191911,zmym_znf_znhit_zp_zpld_zrsr_zswim_zzef,[Xtropicalis_NC_030678.1_6803_7338.0],191915
191912,zmynd_znf_znhit_znrf_zpd_zranb_zswim_zxdc,[Xtropicalis_NC_030680.1_12273_12387.0],191915
191913,znf_znhit_znrf_zpd_zranb_zswim_zxdc_zyg,[Xtropicalis_NC_030680.1_10526_11746.0],191915


In [None]:
df_query['index'] = df_query['index'].astype(str).str.replace("\[", "")
df_query['index'] = df_query['index'].astype(str).str.replace("\]", "")

sep = df_query['index'].astype(str).str.split(", ", expand=True)
df_query = sep.merge(df_query['3_mers'], left_index=True, right_index=True, how='right')
melt = pd.melt(df_query, id_vars=['3_mers']).dropna().drop('variable',1).rename(columns = {'value': 'Origin'})

  df_query['index'] = df_query['index'].astype(str).str.replace("\[", "")
  df_query['index'] = df_query['index'].astype(str).str.replace("\]", "")


KeyError: '3_mers'

In [None]:
match.head()

Unnamed: 0,3_mers,# Appearances
74224,KDM_KIAA_KIF,37
55173,GABRA_GABRB_GABRG,37
74468,KIAA_KIF_KLF,32
122337,RNF_RPL_RPS,28
186492,si:ch_si:dkey-_si:dkeyp-,27


In [None]:
match = melt.groupby('3_mers', as_index=False).count().rename(columns={'Origin':'# Appearances'}).sort_values('# Appearances', ascending=False)
match[['Specie', 'Scaffold', 'Start', 'Stop']] = match['index'].astype(str).str.split("_", expand=True)
match = match.loc[match['# Appearances'] > min, ['Specie', 'Scaffold', '# Appearances']]

KeyError: 'index'

In [None]:
match.head()

Unnamed: 0,3_mers,# Appearances
74224,KDM_KIAA_KIF,37
55173,GABRA_GABRB_GABRG,37
74468,KIAA_KIF_KLF,32
122337,RNF_RPL_RPS,28
186492,si:ch_si:dkey-_si:dkeyp-,27
