# Exploring the data
Breakdown of samples, sexes, superpopulations, whats missing etc

In [2]:
import pandas as pd

# Load your feature counts dataframes for CHM13
batch1_chm13 = pd.read_csv('/home/alextu/scratch/results/summary_stats/chm13/verkko_batch1_chm13_collapsed_summary_metrics.csv')
batch2_chm13 = pd.read_csv('/home/alextu/scratch/results/summary_stats/chm13/verkko_batch2_chm13_collapsed_summary_metrics.csv')
batch3_chm13 = pd.read_csv('/home/alextu/scratch/results/summary_stats/chm13/verkko_batch3_chm13_collapsed_summary_metrics.csv')
batch4_chm13 = pd.read_csv('/home/alextu/scratch/results/summary_stats/chm13/verkko_batch123_chm13_collapsed_quadron_summary_metrics_combined.csv')

# Combine the CHM13 dataframes
df_chm13 = pd.concat([batch1_chm13, batch2_chm13, batch3_chm13, batch4_chm13], ignore_index=True)

# Extract sample identifier and haplotype from 'Sample_Haplotype' column for CHM13
df_chm13[['Sample', 'Haplotype']] = df_chm13['Sample_Haplotype'].str.split('_', expand=True)

# Remove duplicate haplotypes for CHM13
unique_haplotypes_df_chm13 = df_chm13.drop_duplicates(subset=['Sample_Haplotype'])

# Load metadata
metadata_df = pd.read_csv('/home/alextu/scratch/igsr_sample_metadata/igsr_samples.tsv', delimiter='\t')

# Merge metadata with the unique haplotypes DataFrame for CHM13
merged_df_chm13 = pd.merge(unique_haplotypes_df_chm13, metadata_df[['Sample name', 'Sex', 'Superpopulation name', 'Superpopulation code']], left_on='Sample', right_on='Sample name', how='left')

# Count the total number of unique haplotypes for CHM13
total_haplotypes_chm13 = len(merged_df_chm13['Sample_Haplotype'].unique())
print(f"Total number of unique haplotypes for CHM13: {total_haplotypes_chm13}")

# Check and count the sex and superpopulation annotations for CHM13
sex_counts_chm13 = merged_df_chm13['Sex'].value_counts()
print("Sex counts for CHM13:\n", sex_counts_chm13)

superpop_counts_chm13 = merged_df_chm13['Superpopulation name'].value_counts()
print("Superpopulation counts for CHM13:\n", superpop_counts_chm13)

# Identify samples with missing annotations for CHM13
missing_sex_chm13 = merged_df_chm13[merged_df_chm13['Sex'].isna()]['Sample']
missing_superpop_chm13 = merged_df_chm13[merged_df_chm13['Superpopulation name'].isna()]['Sample']

print("Samples with missing Sex annotation for CHM13:")
print(missing_sex_chm13)

print("\nSamples with missing Superpopulation annotation for CHM13:")
print(missing_superpop_chm13)

# Group by 'Superpopulation name' and 'Sex', then count occurrences for CHM13
haplotype_by_superpop_chm13 = merged_df_chm13.groupby(['Superpopulation name', 'Sex']).size().unstack(fill_value=0)

# Display the resulting DataFrame for CHM13
print("\nHaplotype by Superpopulation for CHM13:\n", haplotype_by_superpop_chm13)


# Load your feature counts dataframes for HG38
batch1_hg38 = pd.read_csv('/home/alextu/scratch/results/summary_stats/hg38/verkko_batch1_hg38_collapsed_summary_metrics.csv')
batch2_hg38 = pd.read_csv('/home/alextu/scratch/results/summary_stats/hg38/verkko_batch2_hg38_collapsed_summary_metrics.csv')
batch3_hg38 = pd.read_csv('/home/alextu/scratch/results/summary_stats/hg38/verkko_batch3_hg38_collapsed_summary_metrics.csv')
batch4_hg38 = pd.read_csv('/home/alextu/scratch/results/summary_stats/hg38/verkko_batch123_hg38_collapsed_quadron_summary_metrics_combined.csv')

# Combine the HG38 dataframes
df_hg38 = pd.concat([batch1_hg38, batch2_hg38, batch3_hg38, batch4_hg38], ignore_index=True)

# Extract sample identifier and haplotype from 'Sample_Haplotype' column for HG38
df_hg38[['Sample', 'Haplotype']] = df_hg38['Sample_Haplotype'].str.split('_', expand=True)

# Remove duplicate haplotypes for HG38
unique_haplotypes_df_hg38 = df_hg38.drop_duplicates(subset=['Sample_Haplotype'])

# Merge metadata with the unique haplotypes DataFrame for HG38
merged_df_hg38 = pd.merge(unique_haplotypes_df_hg38, metadata_df[['Sample name', 'Sex', 'Superpopulation name', 'Superpopulation code']], left_on='Sample', right_on='Sample name', how='left')

# Count the total number of unique haplotypes for HG38
total_haplotypes_hg38 = len(merged_df_hg38['Sample_Haplotype'].unique())
print(f"Total number of unique haplotypes for HG38: {total_haplotypes_hg38}")

# Check and count the sex and superpopulation annotations for HG38
sex_counts_hg38 = merged_df_hg38['Sex'].value_counts()
print("Sex counts for HG38:\n", sex_counts_hg38)

superpop_counts_hg38 = merged_df_hg38['Superpopulation name'].value_counts()
print("Superpopulation counts for HG38:\n", superpop_counts_hg38)

# Identify samples with missing annotations for HG38
missing_sex_hg38 = merged_df_hg38[merged_df_hg38['Sex'].isna()]['Sample']
missing_superpop_hg38 = merged_df_hg38[merged_df_hg38['Superpopulation name'].isna()]['Sample']

print("Samples with missing Sex annotation for HG38:")
print(missing_sex_hg38)

print("\nSamples with missing Superpopulation annotation for HG38:")
print(missing_superpop_hg38)

# Group by 'Superpopulation name' and 'Sex', then count occurrences for HG38
haplotype_by_superpop_hg38 = merged_df_hg38.groupby(['Superpopulation name', 'Sex']).size().unstack(fill_value=0)

# Display the resulting DataFrame for HG38
print("\nHaplotype by Superpopulation for HG38:\n", haplotype_by_superpop_hg38)


Total number of unique haplotypes for CHM13: 130
Sex counts for CHM13:
 Sex
female    68
male      58
Name: count, dtype: int64
Superpopulation counts for CHM13:
 Superpopulation name
African Ancestry        58
East Asian Ancestry     20
American Ancestry       18
South Asian Ancestry    16
European Ancestry       14
Name: count, dtype: int64
Samples with missing Sex annotation for CHM13:
11    NA21487
28    NA21487
81    NA24385
87    NA24385
Name: Sample, dtype: object

Samples with missing Superpopulation annotation for CHM13:
11    NA21487
28    NA21487
81    NA24385
87    NA24385
Name: Sample, dtype: object

Haplotype by Superpopulation for CHM13:
 Sex                   female  male
Superpopulation name              
African Ancestry          28    30
American Ancestry         12     6
East Asian Ancestry       12     8
European Ancestry          6     8
South Asian Ancestry      10     6
Total number of unique haplotypes for HG38: 130
Sex counts for HG38:
 Sex
female    68
male  

In [7]:
# Plotting % of Genome Covered by Non-B DNA Features

import pandas as pd
import matplotlib.pyplot as plt

# Load your feature counts dataframes for CHM13
batch1_chm13 = pd.read_csv('/home/alextu/scratch/results/summary_stats/chm13/verkko_batch1_chm13_collapsed_summary_metrics.csv')
batch2_chm13 = pd.read_csv('/home/alextu/scratch/results/summary_stats/chm13/verkko_batch2_chm13_collapsed_summary_metrics.csv')
batch3_chm13 = pd.read_csv('/home/alextu/scratch/results/summary_stats/chm13/verkko_batch3_chm13_collapsed_summary_metrics.csv')
batch4_chm13 = pd.read_csv('/home/alextu/scratch/results/summary_stats/chm13/verkko_batch123_chm13_collapsed_quadron_summary_metrics_combined.csv')

# Combine the CHM13 dataframes
df_chm13 = pd.concat([batch1_chm13, batch2_chm13, batch3_chm13, batch4_chm13], ignore_index=True)

# Load your feature counts dataframes for HG38
batch1_hg38 = pd.read_csv('/home/alextu/scratch/results/summary_stats/hg38/verkko_batch1_hg38_collapsed_summary_metrics.csv')
batch2_hg38 = pd.read_csv('/home/alextu/scratch/results/summary_stats/hg38/verkko_batch2_hg38_collapsed_summary_metrics.csv')
batch3_hg38 = pd.read_csv('/home/alextu/scratch/results/summary_stats/hg38/verkko_batch3_hg38_collapsed_summary_metrics.csv')
batch4_hg38 = pd.read_csv('/home/alextu/scratch/results/summary_stats/hg38/verkko_batch123_hg38_collapsed_quadron_summary_metrics_combined.csv')

# Combine the HG38 dataframes
df_hg38 = pd.concat([batch1_hg38, batch2_hg38, batch3_hg38, batch4_hg38], ignore_index=True)

# Add a new column to distinguish between CHM13 and HG38
df_chm13['Reference'] = 'CHM13'
df_hg38['Reference'] = 'HG38'

# Combine both dataframes
df = pd.concat([df_chm13, df_hg38], ignore_index=True)

# Extract sample identifier and haplotype from 'Sample_Haplotype' column
df[['Sample', 'Haplotype']] = df['Sample_Haplotype'].str.split('_', expand=True)

# Separate autosomes and sex chromosomes
autosomes = [f'chr{i}' for i in range(1, 23)]
sex_chromosomes = ['chrX', 'chrY']

df['Chromosome_Type'] = df['Chromosome'].apply(lambda x: 'Autosomes' if x in autosomes else 'Sex Chromosomes')

# Group by Reference, Chromosome_Type, Sample_Haplotype, and Feature, then sum the Percent of Genome
grouped = df.groupby(['Reference', 'Chromosome_Type', 'Sample_Haplotype', 'Feature'])['Percent of Genome'].sum().unstack(fill_value=0)

# Plotting the stacked bar chart
fig, ax = plt.subplots(figsize=(15, 10))

# Separate the data for plotting
chm13_autosomes = grouped.loc['CHM13', 'Autosomes']
chm13_sex_chromosomes = grouped.loc['CHM13', 'Sex Chromosomes']
hg38_autosomes = grouped.loc['HG38', 'Autosomes']
hg38_sex_chromosomes = grouped.loc['HG38', 'Sex Chromosomes']

# Plot each category separately
chm13_autosomes.plot(kind='bar', stacked=True, ax=ax, position=1, width=0.2, color=plt.cm.Paired.colors, legend=False)
chm13_sex_chromosomes.plot(kind='bar', stacked=True, ax=ax, position=2, width=0.2, color=plt.cm.Paired.colors, legend=False)
hg38_autosomes.plot(kind='bar', stacked=True, ax=ax, position=3, width=0.2, color=plt.cm.Paired.colors, legend=False)
hg38_sex_chromosomes.plot(kind='bar', stacked=True, ax=ax, position=4, width=0.2, color=plt.cm.Paired.colors, legend=False)

# Set the x-ticks to show Sample_Haplotype
ax.set_xticks(range(len(chm13_autosomes)))
ax.set_xticklabels(chm13_autosomes.index, rotation=90)

# Add labels and title
ax.set_xlabel('Sample_Haplotype')
ax.set_ylabel('Percent of Genome Covered')
ax.set_title('Comparison of Percent Genome Covered by Each Feature for Each Haplotype')

# Add a legend
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, title='Feature')

plt.tight_layout()
plt.show()


KeyError: 'Chromosome'