# In [None]:
# Asking user's inputs
# Sample responses:
# user_vcf_file_path = /content/WES_UG_TM_full_vep.vcf
# first_letter_unwanted_sample = T
# batch_size = 20
# LD_threshold = 0.2
# user_data_population = UG

# Inputs for Step 1
# input() already returns a str, so the answers are used as typed.
user_vcf_file_path = input("(1) What is the path of the VCF file containing your exome data? ")
print("- Path of VCF file containing user's exome data:", user_vcf_file_path)
print()

first_letter_unwanted_sample = input("(2) What is the first letter of the sample IDs for populations that you want to exclude from the analysis? ")
print("- First letter of the sample IDs for populations that user wants to exclude from the analysis:", first_letter_unwanted_sample)
print()

# Inputs for Step 2
# The answer is parsed as an integer; a non-integer answer raises ValueError.
batch_size = int(input("(3) What batch size would you like to use when processing the 1000 Genome data? (< 20 is recommended) "))
print("- Batch size for 1000 Genome data processing:", batch_size)
print()

# Inputs for Step 3
# The answer is parsed as a float; a non-numeric answer raises ValueError.
LD_threshold = float(input("(4) What is the threshold value for Linkage Disequilibrium (LD)? (0.2 is recommended) "))
print("- Threshold value for Linkage Disequilibrium (LD):", LD_threshold)
print()

# Inputs for Step 4
user_data_population = input("(5) What abbreviation would you like to use for the population concerning your exome data? ")
print("- Selected abbreviation for the population of user's exome data:", user_data_population)
print()

# Step 0: Importing dependencies
import numpy as np
import pandas as pd
import gc

# Step 1: Extracting and processing user's exome VCF data
from stepOne_user_file_processor import user_file_processor

print("Step 1: Extracting and processing user's exome VCF data")

# Hand the VCF path and the sample-ID prefix to exclude to the Step 1 processor.
df2 = user_file_processor(
  user_vcf_file_path,
  first_letter_unwanted_sample,
)

# Run a garbage-collection pass before the next step starts.
gc.collect()

# Step 2: Extracting SNPs from 1000 Genome data
from stepTwo_Thousand_Genome_SNP_extractor_v3 import Thousand_Genome_SNP_extractor

print("Step 2: Extracting SNPs from 1000 Genome data with matching POS")
# Path of the CSV file holding the user's processed exome data. It is passed
# to the Step 2 extractor and read/passed again in Step 4.
# NOTE(review): presumably this file is written by Step 1 — confirm.
user_file_path = "/content/User_Exome_Analysis.csv"

# Specifying the list of chromosomes to be examined: 1-22 as ints, then "X"
# as a str. The list is built directly instead of appending "X" from inside
# the loop on its last iteration.
# NOTE(review): the entries mix int and str, exactly as the original loop
# produced — confirm the extractor accepts both.
chr_num_list = [*range(1, 23), "X"]

# Storing the title of csv files containing extracted SNPs for each chromosome in a list
SNP_extracted_list = []

# The loop variable is deliberately not named `chr`: at module level that
# rebinds the built-in chr() for the rest of the session. An explicit loop
# (rather than a comprehension) keeps the results gathered so far in
# SNP_extracted_list if a later chromosome raises.
for chromosome in chr_num_list:
  csv_title = Thousand_Genome_SNP_extractor(chromosome, user_file_path, batch_size)
  SNP_extracted_list.append(csv_title)

# Step 3: Processing data of SNPs from 1000 Genome data
from stepThree_Thousand_Genome_data_processor import Thousand_Genome_data_processor

print("Step 3: Processing data of SNPs from 1000 Genome data")

# One CSV per file-name prefix: AFR, AMR, EAS, EUR and SAS.
# NOTE(review): presumably each file maps sample IDs to a population, as the
# variable name suggests — confirm against the Step 3 processor.
ID_to_population_file_list = [
  "/content/AFR_373509-SampleGenotypes-Homo_sapiens_Variation_Sample_rs1143634.csv",
  "/content/AMR_373510-SampleGenotypes-Homo_sapiens_Variation_Sample_rs1143634.csv",
  "/content/EAS_373518-SampleGenotypes-Homo_sapiens_Variation_Sample_rs1143634.csv",
  "/content/EUR_373520-SampleGenotypes-Homo_sapiens_Variation_Sample_rs1143634.csv",
  "/content/SAS_373535-SampleGenotypes-Homo_sapiens_Variation_Sample_rs1143634.csv",
]

# Process the per-chromosome extraction outputs using the user's LD threshold.
df_10_list = Thousand_Genome_data_processor(
  SNP_extracted_list,
  ID_to_population_file_list,
  LD_threshold,
)

# Run a garbage-collection pass before the next step starts.
gc.collect()

# Step 4: Compiling SNPs from 1000 Genome data and user's exome data
from stepFour_Thousand_Genome_data_compiler import Thousand_Genome_data_compiler

print("Step 4: Compiling SNPs from 1000 Genome data and user's exome data")

# NOTE(review): user_data_df is not referenced by any other visible code (the
# compiler below receives the file path, not this DataFrame). It is kept
# because later cells may rely on it — confirm, and drop the read if unused.
user_data_df = pd.read_csv(user_file_path)

# Combine the processed 1000 Genome data with the user's data, labelled with
# the user-chosen population abbreviation.
df_final_transposed = Thousand_Genome_data_compiler(
  user_file_path,
  df_10_list,
  user_data_population,
)

# Drop this name's reference to the Step 3 output, then run a
# garbage-collection pass before the PCA step.
del df_10_list
gc.collect()

# Step 5: Conducting PCA on the compiled data
print("Step 5: Generating 2-D and 3-D PCA plots using the compiled data")

from stepFive_Thousand_Genome_PCA import Thousand_Genome_PCA

# Run the PCA step on the compiled data, passing the user's population
# abbreviation along.
Thousand_Genome_PCA(
  df_final_transposed,
  user_data_population,
)