In [1]:
#!/usr/bin/env python3
# Python code for STRUCTURE analysis.

# PREREQS
# conda install scikit-learn -c conda-forge
# conda install -c conda-forge numpy\<2.0.0

import os

import ipyrad.analysis as ipa
import pandas as pd
import toyplot

In [None]:
# directories and file paths
SUBSETS_PATH   = "/usr/nfs/923643692/cheno_popgen/subsets"
# Python does not expand shell-style "$VAR" references inside string literals,
# so FILTER_PATH is expanded from the environment explicitly.
# os.path.expandvars leaves the reference unchanged when the variable is unset.
FULL_DATA_PATH = os.path.expandvars("$FILTER_PATH/final_outputs")

In [None]:
# The three datasets share one naming scheme; only the number that follows
# "indivmiss" (20, 40 or 60) differs between them.
_NAME_TEMPLATE = 'chenopodium_20250304b-tetraploid_indivmiss{}_locimiss50_thin10_maf0.01_LD0.8-1kbp'
# Directory holding the input .vcf.gz files (trailing slash included).
_VCF_DIR = '/usr/nfs/923643692/cheno_popgen/MAF_LD/tetraploid/'

# File stems, one per dataset.
filename20, filename40, filename60 = (
    _NAME_TEMPLATE.format(level) for level in (20, 40, 60)
)

# Full paths of the compressed VCF inputs.
dataname20, dataname40, dataname60 = (
    _VCF_DIR + stem + '.vcf.gz' for stem in (filename20, filename40, filename60)
)

def _convert_vcf(vcf_path, name):
    """Build an ipa.vcf_to_hdf5 converter for one VCF file and run it.

    Args:
        vcf_path: path of the input VCF, passed as the converter's `data`.
        name: passed as the converter's `name`.

    Returns:
        The converter object after `run(force=True)` has been called.
    """
    # init a conversion tool; workdir is the current directory
    converter = ipa.vcf_to_hdf5(
        data=vcf_path,
        workdir='./',
        name=name,
    )
    # run the converter; force=True is passed on every execution
    # (presumably so an existing output is overwritten -- confirm)
    converter.run(force=True)
    return converter


# convert the three datasets one after another, in the same order as before
converter20 = _convert_vcf(dataname20, filename20)
converter40 = _convert_vcf(dataname40, filename40)
converter60 = _convert_vcf(dataname60, filename60)

# Per-group minimum coverage passed to ipa.pca as `minmap`.
# `minmap` is meant to be a dict keyed by imap group name, e.g.
#     {group: 0.5 for group in imap}
# to require that 50% of samples have data in each group. A bare `{0.5}` is a
# set literal with no group keys, and every `imap=` argument in the PCA calls
# is commented out, so there are no groups to key on. Use None so ipa.pca
# takes its default path.
# NOTE(review): to enforce the 50% per-group rule, define imap20/40/60 and
# switch these to the dict form shown above.
minmap20 = None
minmap40 = None
minmap60 = None


# Absolute paths of the .snps.hdf5 database files read by the PCA step.
# NOTE(review): the vcf_to_hdf5 converters in this cell write to workdir './',
# so these paths presumably match only when the notebook is run from
# _HDF5_DIR -- confirm.
_HDF5_DIR = "/usr/nfs/923643692/cheno_popgen/pca/tetraploid"
_HDF5_NAME = "chenopodium_20250304b-tetraploid_indivmiss{}_locimiss50_thin10_maf0.01_LD0.8-1kbp.snps.hdf5"

hdf5_20, hdf5_40, hdf5_60 = (
    "/".join((_HDF5_DIR, _HDF5_NAME.format(level))) for level in (20, 40, 60)
)

# ipa.snps_extracter(data).names

def _run_pca(hdf5_path, minmap, csv_path, mincov=0.75, impute_method="sample"):
    """Run one ipa.pca analysis and write its PC axes to a CSV file.

    Args:
        hdf5_path: path of the .snps.hdf5 database, passed as `data`.
        minmap: forwarded unchanged as ipa.pca's `minmap` argument.
        csv_path: file the PC-axes dataframe is written to.
        mincov: forwarded as ipa.pca's `mincov` argument.
        impute_method: forwarded as ipa.pca's `impute_method` argument.

    Returns:
        A `(pca, axes)` tuple: the ipa.pca object after `run()`, and the
        DataFrame built from `pca.pcaxes[0]` indexed by `pca.names`.
    """
    # no imap is passed: the imap arguments were commented out in every call
    pca = ipa.pca(
        data=hdf5_path,
        minmap=minmap,
        mincov=mincov,
        impute_method=impute_method,
    )
    # run the PCA analysis
    pca.run()
    # store the PC axes as a dataframe, one row per sample name
    axes = pd.DataFrame(pca.pcaxes[0], index=pca.names)
    # write the PC axes to a CSV file
    axes.to_csv(csv_path)
    return pca, axes


# one PCA per dataset, run in the same order and writing the same CSV files
pca20, df20 = _run_pca(hdf5_20, minmap20, "pca_analysis_tetraploid_miss20.csv")
pca40, df40 = _run_pca(hdf5_40, minmap40, "pca_analysis_tetraploid_miss40.csv")
pca60, df60 = _run_pca(hdf5_60, minmap60, "pca_analysis_tetraploid_miss60.csv")
