# PyGnocchi Models and Operations

Gnocchi supports opertations such as merging the results of multiple analyses and saving these results to file.

In [2]:
from bdgenomics.gnocchi.core.gnocchiSession import GnocchiSession
# from bdgenomics.gnocchi.models.linearGnocchiModel import LinearGnocchiModel
# from bdgenomics.gnocchi.models.logisticGnocchiModel import LogisticGnocchiModel

In [3]:
gs = GnocchiSession(spark) # Build GnocchiSession

### Construct a LinearGnocchiModel

In [6]:
# Load in genotype and phenotype data
genotypesPath1 = "/Users/Nathaniel/bdg/gnocchi/gnocchi-cli/src/test/resources/RegressionIntegrationTestData_genotypes.vcf"
# phenotypesPath1 = "../examples/testData/tab_time_phenos_1.txt"

genos1 = gs.loadGenotypes(genotypesPath1, "", "ADDITIVE")
# phenos1 = gs.loadPhenotypes(phenotypesPath1, "IID", "pheno_1", "\t")

In [4]:
# Use a factory pattern constructor to create the LinearGnocchiModel
linearGnocchiModel1 = LinearGnocchiModel.New(spark, genos1, phenos1, 
                                             ["pheno_1", "pheno_2", "pheno_3", "pheno_4", "pheno_5"])

### Access Inner Datasets

This allows us to view inside the dataset, pull out specific samples and to interpret the results.

In [5]:
# Access the inner Variant Models
variantModels = linearGnocchiModel1.getVariantModels()
print("Example Variant Model:", variantModels.get().head().toString())

Example Variant Model: LinearVariantModel(rs6772650,LinearAssociation(2071.8798678640846,2065.308779631643,0.7208581404624811,0.5439608302763301,93,0.5877701605134307,List(61.5460254131472, 0.39211859259742265),95),pheno_1,19,174824727,C,T,ADDITIVE,0)


These datasets also expose a set of operations that can be used to build out UDFs to run atop pySpark.

In [6]:
# Access the inner QC Variants
QCVariants = linearGnocchiModel1.getQCVariants()
print("Available operations on Variants Datasets:", ", ".join(dir(QCVariants.get().toJavaRDD())))

Available operations on Variants Datasets: aggregate, cache, cartesian, checkpoint, classTag, coalesce, collect, collectAsync, collectPartitions, context, count, countApprox, countApproxDistinct, countAsync, countByValue, countByValueApprox, distinct, equals, filter, first, flatMap, flatMapToDouble, flatMapToPair, fold, foreach, foreachAsync, foreachPartition, foreachPartitionAsync, fromRDD, getCheckpointFile, getClass, getNumPartitions, getStorageLevel, glom, groupBy, hashCode, id, intersection, isCheckpointed, isEmpty, iterator, keyBy, map, mapPartitions, mapPartitionsToDouble, mapPartitionsToPair, mapPartitionsWithIndex, mapPartitionsWithIndex$default$2, mapToDouble, mapToPair, max, min, name, notify, notifyAll, partitioner, partitions, persist, pipe, randomSplit, rdd, reduce, repartition, sample, saveAsObjectFile, saveAsTextFile, setName, sortBy, subtract, take, takeAsync, takeOrdered, takeSample, toDebugString, toLocalIterator, toRDD, toString, top, treeAggregate, treeReduce, unio

### Access Model Attributes

In [7]:
print("Model Type:", linearGnocchiModel1.getModelType())
print("Phenotype:", linearGnocchiModel1.getPhenotype())
print("Covariates:", linearGnocchiModel1.getCovariates())
print("Num Samples:", linearGnocchiModel1.getNumSamples())
print("Haplotype Block Error Threshold:", linearGnocchiModel1.getHaplotypeBlockErrorThreshold())
# print("Flagged Variant Models:", linearGnocchiModel1.getFlaggedVariantModels())

Model Type: LinearRegression
Phenotype: pheno_1
Covariates: pheno_2,pheno_3,pheno_4,pheno_5
Num Samples: 10000
Haplotype Block Error Threshold: 0.1


### Construct Second LinearGnocchiModel

In [8]:
# Load in genotype and phenotype data
genotypesPath2 = "../examples/testData/time_genos_2.vcf"
phenotypesPath2 = "../examples/testData/tab_time_phenos_2.txt"

genos2 = gs.loadGenotypes(genotypesPath2)
phenos2 = gs.loadPhenotypes(phenotypesPath2, "IID", "pheno_1", "\t")

In [9]:
# Use a factory pattern constructor to create the LinearGnocchiModel
linearGnocchiModel2 = LinearGnocchiModel.New(spark, genos2, phenos2, 
                                             ["pheno_1", "pheno_2", "pheno_3", "pheno_4", "pheno_5"])

In [10]:
print("Num Samples:", linearGnocchiModel2.getNumSamples())

Num Samples: 10000


### Merge LinearGnocchiModels

In [11]:
mergedLinearGnocchiModel = linearGnocchiModel1.mergeGnocchiModel(linearGnocchiModel2)

print("Num Samples:", mergedLinearGnocchiModel.getNumSamples())

Num Samples: 20000


In [12]:
mergedVariantModels = linearGnocchiModel1.mergeVariantModels(linearGnocchiModel2.getVariantModels())

In [13]:
mergedQCVariants = linearGnocchiModel1.mergeQCVariants(linearGnocchiModel2.getQCVariants())