In [1]:
import pandas as pd
import glob
import os

def read_vcf_header(file_path):
    """Return the column names found on a VCF file's ``#CHROM`` line.

    The file is scanned from the top. The first line that begins with
    ``#CHROM`` is stripped of surrounding whitespace and split on tabs.
    If no such line exists, an empty list is returned.
    """
    with open(file_path, 'r') as handle:
        # next() with a default stops at the first match and avoids StopIteration.
        header_line = next((row for row in handle if row.startswith('#CHROM')), None)
    if header_line is None:
        return []
    return header_line.strip().split('\t')

def extract_snp_data(file_path, header):
    """Extract one ``(snp_id, genotypes)`` pair per variant line of a VCF file.

    Parameters
    ----------
    file_path : str
        Path of the VCF file to read.
    header : list
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    list of tuple
        ``(snp_id, genotype_data)`` pairs in file order. ``snp_id`` is built as
        ``"chr{CHROM}_{POS}_{REF}_{ALT}"`` from columns 1, 2, 4 and 5, and
        ``genotype_data`` is the list of raw strings from the 10th column on.
    """
    snp_data = []
    with open(file_path, 'r') as file:
        for line in file:
            # Lines starting with '#' are meta/header lines. Whitespace-only
            # lines (e.g. a trailing blank line) hold no record and previously
            # crashed with IndexError when the columns were indexed.
            if line.startswith('#') or not line.strip():
                continue
            fields = line.strip().split('\t')
            chrom, pos, ref, alt = fields[0], fields[1], fields[3], fields[4]
            snp_id = f"chr{chrom}_{pos}_{ref}_{alt}"
            genotype_data = fields[9:]  # Genotype data starts from the 10th column
            snp_data.append((snp_id, genotype_data))
    return snp_data

def process_vcf_files(directory):
    """Combine the genotype columns of every ``*.vcf`` file in ``directory``.

    Sample IDs are taken from the ``#CHROM`` header of the first file only
    (everything after the ninth column); later files are assumed to list the
    same samples in the same order. When the same SNP ID occurs more than
    once, the record read last replaces the earlier one.

    Parameters
    ----------
    directory : str
        Directory searched (non-recursively) for files matching ``*.vcf``.

    Returns
    -------
    pandas.DataFrame
        One row per SNP ID and one column per sample ID. An empty frame is
        returned when the directory contains no matching files.
    """
    # glob returns paths in arbitrary, filesystem-dependent order. Sorting
    # makes the row order, and which duplicate record wins, reproducible.
    vcf_files = sorted(glob.glob(os.path.join(directory, '*.vcf')))
    all_snp_data = []
    patient_ids = None

    for file_path in vcf_files:
        if patient_ids is None:
            # The first 9 columns are the fixed VCF columns; the rest are samples.
            patient_ids = read_vcf_header(file_path)[9:]
        all_snp_data.extend(extract_snp_data(file_path, patient_ids))

    # Building the dict keeps the last genotype list seen for each SNP ID.
    data_dict = dict(all_snp_data)
    # Constructed as samples x SNPs, then flipped to SNPs x samples.
    return pd.DataFrame(data_dict, index=patient_ids).transpose()

In [6]:
# Per-column maximum of the bcftools genotype table.
# NOTE(review): df_bcftools is only assigned in a later cell (In [7]); on a
# fresh kernel this cell has to run after that one.
df_bcftools.max()

Series([], dtype: float64)

In [7]:
# Directory containing the VCF files to load; edit this path to point at your own data.
directory_path = '../../Data/bcftools_results/'
# process_vcf_files returns SNPs as rows and samples as columns; the extra
# .transpose() here flips that to samples as rows and SNPs as columns.
df_bcftools = process_vcf_files(directory_path).transpose()


# Save the combined table to a CSV file for further analysis
df_bcftools.to_csv('concatenated_snps.csv')

In [8]:
# Count the *.vcf files directly inside directory_path; a count of 0 means
# the path or the extension does not match anything on disk.
vcf_files = glob.glob(os.path.join(directory_path, '*.vcf'))
print(f"Number of VCF files: {len(vcf_files)}")

Number of VCF files: 0


In [26]:
# Get all chromosome 22 SNPs (column names starting with 'chr22_').
# NOTE(review): the variable is still named chrom_10_snps although the filter
# selects chromosome 22.
chrom_10_snps = df_bcftools.columns[df_bcftools.columns.str.startswith('chr22_')]
print(chrom_10_snps)
print(len(chrom_10_snps))

Index(['chr22_19707371', 'chr22_19766137', 'chr22_29114293', 'chr22_29121087',
       'chr22_29135543', 'chr22_29203724', 'chr22_29551872', 'chr22_38583315',
       'chr22_39343916', 'chr22_40904707', 'chr22_43433100', 'chr22_45319942',
       'chr22_45319953', 'chr22_46283297'],
      dtype='object')
14


In [4]:

# Directory scanned by extract_alleles; edit this path to point at your own data.
# NOTE(review): the folder name says TSV, but extract_alleles only reads files
# whose names end in ".vcf" — confirm the extension. This also rebinds the
# directory_path that was used for the bcftools results earlier.
directory_path = '../Grep_tsv_files/'

import os

def extract_alleles(directory):
    """Build a per-file table of genotype strings from ``.vcf`` files.

    For every entry in ``directory`` whose name ends with ``.vcf``, only the
    first line of the file is read and split on tabs. Fields from the 10th
    onwards are taken as genotypes, with every ``|`` removed
    (``"0|1"`` becomes ``"01"``). Files whose first line has fewer than ten
    fields contribute a row with no genotypes.

    Parameters
    ----------
    directory : str
        Directory whose immediate entries are scanned.

    Returns
    -------
    pandas.DataFrame
        One row per file, labelled ``"chr"`` plus the file name up to its
        first dot, with positional integer columns. Rows with fewer genotypes
        than the longest row are padded with missing values.
    """
    files = {}
    # os.listdir order is arbitrary; sort so the row order is reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".vcf"):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, "r") as file:
            fields = file.readline().strip().split("\t")
        # Slicing past the end yields [], which covers lines with < 10 fields.
        alleles = [genotype.replace("|", "") for genotype in fields[9:]]
        files["chr" + filename.split(".")[0]] = alleles

    # orient="index" builds one row per file directly and tolerates rows of
    # different lengths; DataFrame(files).T raised ValueError in that case.
    return pd.DataFrame.from_dict(files, orient="index")

# Genotype table built from the files in directory_path: one row per file.
df_grep = extract_alleles(directory_path)

In [5]:

# Select a SNP ID that is present in all of the dataframes being compared
snp_id = 'chr22_40904707'

# Look up that SNP's row in each dataframe; .loc raises KeyError when the ID
# is not a row label of the frame.
# NOTE(review): df_plink is not assigned in any cell shown in this notebook —
# confirm where it is loaded.
genotypes_bcftools = df_bcftools.loc[snp_id].values
genotypes_plink = df_plink.loc[snp_id].values
genotypes_grep = df_grep.loc[snp_id].values


# Remove the '/' and '|' separators so "0|1" and "0/1" both compare as "01".
# NOTE(review): genotypes_grep is not passed through this step; extract_alleles
# only removes '|' from its values.
genotypes_bcftools = [genotype.replace('/', '').replace('|', '') for genotype in genotypes_bcftools]
genotypes_plink = [genotype.replace('/', '').replace('|', '') for genotype in genotypes_plink]


KeyError: 'chr22_40904707'

In [68]:
# Compare the bcftools and PLINK calls for the selected SNP, position by position
genotypes_equal = not any(
    g1 != g2 for g1, g2 in zip(genotypes_bcftools, genotypes_plink)
)

if not genotypes_equal:
    print(f"The genotypes for SNP {snp_id} are not equal in both dataframes.")
    # List every sample whose two calls disagree, then print the tally
    counter = 0
    for patient_id, g1, g2 in zip(df_bcftools.columns, genotypes_bcftools, genotypes_plink):
        if g1 == g2:
            continue
        print(f"Patient {patient_id}: BCFTOOLS - {g1}, PLINK - {g2}")
        counter += 1
    print(f"Total number of patients with different genotypes: {counter}")
else:
    print(f"The genotypes for SNP {snp_id} are equal in both dataframes.")

The genotypes for SNP chr22_40904707 are not equal in both dataframes.
Patient HG00119: BCFTOOLS - 10, PLINK - 01
Patient HG00132: BCFTOOLS - 10, PLINK - 01
Patient HG00136: BCFTOOLS - 10, PLINK - 01
Patient HG00158: BCFTOOLS - 10, PLINK - 01
Patient HG00171: BCFTOOLS - 10, PLINK - 01
Patient HG00186: BCFTOOLS - 10, PLINK - 01
Patient HG00251: BCFTOOLS - 10, PLINK - 01
Patient HG00255: BCFTOOLS - 10, PLINK - 01
Patient HG00267: BCFTOOLS - 10, PLINK - 01
Patient HG00271: BCFTOOLS - 10, PLINK - 01
Patient HG00274: BCFTOOLS - 10, PLINK - 01
Patient HG00280: BCFTOOLS - 10, PLINK - 01
Patient HG00281: BCFTOOLS - 10, PLINK - 01
Patient HG00304: BCFTOOLS - 10, PLINK - 01
Patient HG00306: BCFTOOLS - 10, PLINK - 01
Patient HG00309: BCFTOOLS - 10, PLINK - 01
Patient HG00313: BCFTOOLS - 10, PLINK - 01
Patient HG00326: BCFTOOLS - 10, PLINK - 01
Patient HG00327: BCFTOOLS - 10, PLINK - 01
Patient HG00341: BCFTOOLS - 10, PLINK - 01
Patient HG00344: BCFTOOLS - 10, PLINK - 01
Patient HG00357: BCFTOOLS 

In [61]:
# Compare the bcftools and grep-derived calls for the selected SNP, position by position
genotypes_equal = not any(
    g1 != g2 for g1, g2 in zip(genotypes_bcftools, genotypes_grep)
)

if not genotypes_equal:
    print(f"The genotypes for SNP {snp_id} are not equal in both dataframes.")
    # List every sample whose two calls disagree, then print the tally
    counter = 0
    for patient_id, g1, g2 in zip(df_bcftools.columns, genotypes_bcftools, genotypes_grep):
        if g1 == g2:
            continue
        print(f"Patient {patient_id}: BCFTOOLS - {g1}, GREP - {g2}")
        counter += 1
    print(f"Total number of patients with different genotypes: {counter}")
else:
    print(f"The genotypes for SNP {snp_id} are equal in both dataframes.")

The genotypes for SNP chr22_40904707 are equal in both dataframes.


In [1]:
import re
# Regex capturing the chromosome number and the position at the start of an
# ID such as "chr22_40904707"
pattern = r"chr(\d+)_([0-9]+)"


def extract_format(s):
    """Return ``"<chromosome>_<position>"`` parsed from the start of ``s``.

    ``s`` must begin with ``chr``, one or more digits, an underscore and one
    or more digits; anything after the position is ignored. ``None`` is
    returned when ``s`` does not start that way.
    """
    found = re.match(pattern, s)
    if found is None:
        return None
    chromosome, position = found.groups()
    return f"{chromosome}_{position}"

# Apply extract_format to every index label and collect the results in a list;
# labels that do not match the pattern produce None entries.
# NOTE(review): `df` is not assigned in any visible cell (the recorded run
# raised NameError) — confirm which dataframe is meant.
formatted_list = [extract_format(index) for index in df.index]

print(formatted_list)

NameError: name 'df' is not defined

In [6]:
# Number of index labels processed; None entries from non-matching labels are counted too.
len(formatted_list)

406

In [7]:
# Load only the "SNPa" column from the PRS313 Excel workbook
PRS_313_mutations = pd.read_excel("../PRS313.xlsx", usecols=["SNPa"])

# Function to remove everything after the second underscore
def remove_after_second_underscore(s):
    """Keep only the first two underscore-separated parts of ``s``.

    Strings containing at most one underscore come back unchanged, e.g.
    ``"22_40904707_G_A"`` becomes ``"22_40904707"``.
    """
    leading_parts = s.split('_')[:2]
    return '_'.join(leading_parts)

# Overwrite SNPa in place with its first two underscore-separated parts
# (e.g. "22_40904707"); re-running this cell leaves the column unchanged.
PRS_313_mutations['SNPa'] = PRS_313_mutations['SNPa'].apply(remove_after_second_underscore)

In [8]:
# Display the trimmed SNP identifiers
PRS_313_mutations.SNPa

0      1_100880328
1       1_10566215
2      1_110198129
3      1_114445880
4      1_118141492
          ...     
308    22_39343916
309    22_40904707
310    22_43433100
311    22_45319953
312    22_46283297
Name: SNPa, Length: 313, dtype: object

In [9]:
def difference_list(list_a, list_b):
    """
    Returns elements that are in list_a but not in list_b.

    Both arguments are materialised once, so one-shot iterators work. The
    values of a pandas Series are compared, not its index labels (the ``in``
    operator on a Series tests the index).

    Parameters:
    - list_a: First iterable; order and duplicates are preserved in the result.
    - list_b: Second iterable to compare against.

    Returns:
    A new list with elements that are in list_a but not in list_b.
    """
    items = list(list_a)
    excluded = list(list_b)
    try:
        # A set gives constant-time membership checks instead of a list scan.
        lookup = set(excluded)
        return [item for item in items if item not in lookup]
    except TypeError:
        # Unhashable elements on either side: fall back to a linear scan.
        return [item for item in items if item not in excluded]

In [12]:
# PRS313 SNP keys ("chromosome_position") that do not appear in formatted_list
missing_SNPs = difference_list(PRS_313_mutations.SNPa,formatted_list)

In [13]:
# Show the missing SNPs; an empty list means every PRS313 key was found.
missing_SNPs

[]

In [11]:
# Write each missing SNP as a "chromosome position" line, one per SNP
with open('../positions/missing_positions_2.txt', 'w') as out_file:
    # Each key must split into exactly two parts; unpacking fails otherwise.
    split_keys = (snp_key.split('_') for snp_key in missing_SNPs)
    out_file.writelines(
        f"{chromosome} {position}\n" for chromosome, position in split_keys
    )