# Sample Identification

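This notebook performs sample identification for one sequencing run. It loads the boolean read-by-STR table, computes each person's share of long reads and of STR hits, flags the person(s) whose STR count is below the mean as the child, and plots the boolean matrix.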

# Import modules

In [51]:
import pandas as pd
import os
import matplotlib.pyplot as plt

# Set variables

In [52]:
# Run configuration: edit these three values to point the notebook at another dataset.
run_number = "run3"   # run label; also the sub-directory name under the data root
chrom = "chr11"       # chromosome tag used in file names
dis = "sca"           # short tag used in file names (presumably the disease/condition -- TODO confirm)

# Derived values used to build every input/output file name below.
chrom_dis = "_".join([chrom, dis])
datadir = f"/mnt/aretian/genomics/nanopore/{run_number}"

# Load data

In [53]:
# Load the per-read table for this run / chromosome / tag into `df`.
tagged_reads_path = f'{datadir}/{run_number}_{chrom_dis}_bool_tagged_reads.csv'
df = pd.read_csv(tagged_reads_path)

In [54]:
# Preview the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,read_id,Human_STR_205253,Human_STR_205293,Human_STR_205294,Human_STR_205352,Human_STR_205354,Human_STR_205373,Human_STR_205411,Human_STR_205412,Human_STR_205413,...,Human_STR_207240,Human_STR_207241,Human_STR_207242,Human_STR_207243,Human_STR_207244,Human_STR_207245,Human_STR_207258,Human_STR_207259,Human_STR_207262,Human_STR_207263
0,run3_chr11_sca_person0-0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,run3_chr11_sca_person0-1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,run3_chr11_sca_person0-10,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,run3_chr11_sca_person0-11,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,run3_chr11_sca_person0-12,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,run3_chr11_sca_person0-13,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,run3_chr11_sca_person0-14,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,run3_chr11_sca_person0-15,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,run3_chr11_sca_person0-2,1,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,0,0,0,0
9,run3_chr11_sca_person0-3,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


## Mother-child shares

In [55]:
print("Calculating mother-child shares and STR shares.")

# Tag every read with the person it belongs to.
# read_id looks like "<run>_<chrom>_<dis>_<person>-<read index>": take the 4th
# underscore-separated field and strip the trailing "-<read index>". Splitting
# on the last "-" (rather than keeping a fixed 7 characters) keeps multi-digit
# person ids such as "person10" intact instead of collapsing them to "person1".
# NOTE: this adds a 'person' column to `df`; the STR-sum cell groups by it.
df['person'] = (
    df['read_id']
    .str.split('_').str[3]
    .str.rsplit('-', n=1).str[0]
)

# One row per person: number of long reads and that person's share of all reads.
read_counts = df['person'].value_counts()
df_out = pd.DataFrame({'n_long_reads': read_counts})
df_out['share_long_reads'] = df_out['n_long_reads'] / df_out['n_long_reads'].sum()
df_out.index = df_out.index.rename('person')

# Save (build the path once so the file written and the message always agree)
read_shares_path = f'{datadir}/{run_number}_{chrom_dis}_sample_id_read_shares.csv'
df_out.to_csv(read_shares_path)
print(f"Saved: {read_shares_path}")

Calculating mother-child shares and STR shares.
Saved: /mnt/aretian/genomics/nanopore/run3/run3_chr11_sca_sample_id_read_shares.csv


## Sum all the STRs

In [57]:
# Total STR hits per person: sum every numeric column within a person, then
# add those per-column totals together.
# numeric_only=True keeps the text column `read_id` out of the aggregation.
# With pandas >= 2.0 the default no longer drops it: the ids are concatenated
# and the row-wise sum then fails on mixed str/int values.
str_totals = df.groupby('person').sum(numeric_only=True).sum(axis=1)

sums_df = pd.DataFrame({'n_strs': str_totals})
sums_df['share_strs'] = sums_df['n_strs'] / sums_df['n_strs'].sum()

# Persons with fewer STR hits than the across-person mean get is_child = 1.
# The column is created by this assignment, so every other person is left as NaN.
below_mean = sums_df['n_strs'] < sums_df['n_strs'].mean()
sums_df.loc[below_mean, 'is_child'] = 1

# Save (build the path once so the file written and the message always agree)
str_shares_path = f'{datadir}/{run_number}_{chrom_dis}_sample_id_str_shares.csv'
sums_df.to_csv(str_shares_path)
print(f"Saved: {str_shares_path}")

Saved: /mnt/aretian/genomics/nanopore/run3/run3_chr11_sca_sample_id_str_shares.csv


## Plot the matrix

In [50]:
print('Plotting the boolean matrix.')

# Keep only the STR indicator columns. Dropping the non-STR columns by name
# (instead of slicing by position with iloc[:, 1:-1]) yields the same matrix
# whether or not the 'person' column has already been added to `df`, so the
# last STR column is never dropped by accident.
df_plot = df.drop(columns=['read_id', 'person'], errors='ignore')

fig, ax = plt.subplots(figsize=(20, 5), dpi=100)
myplot = ax.imshow(
    df_plot.to_numpy(),
    cmap=plt.cm.Blues,
    aspect='auto',          # let cells stretch to fill the wide figure
    interpolation='none',   # draw each cell as-is, without smoothing
)

fig.colorbar(myplot, ax=ax)
ax.set_title('Boolean read-by-STR matrix')
ax.set_xlabel('STRs')
ax.set_ylabel('Long Reads')

# Writing the PNG is off by default. The "Saved" message is printed only when
# the file is really written, so the cell output never claims a save that did
# not happen. Set save_plot = True to write the figure.
save_plot = False
if save_plot:
    plot_path = f"cluster_plots/{run_number}_{chrom_dis}_boolean_matrix.png"
    os.makedirs(os.path.dirname(plot_path), exist_ok=True)
    fig.savefig(plot_path, dpi=100)
    print(f"Saved: {plot_path}")