In [0]:
import sys
import pandas as pd
import seaborn as sns
%matplotlib inline
from array import array
import matplotlib.pyplot as plt
import numpy as np
import warnings

In [0]:
from pandas.io.pytables import PerformanceWarning

In [0]:
warnings.filterwarnings("ignore", category=PerformanceWarning) 

In [0]:
# Extend the import search path with a project directory (presumably where the
# hdfstorehelper module imported in the next cell lives -- confirm).
# NOTE(review): this is a machine-specific absolute path.
gypsy_moth_dir = "/home/cfriedline/ipynb/gypsy_moth/"
if gypsy_moth_dir not in sys.path:
    # Guard keeps repeated runs of this cell from stacking duplicate entries.
    sys.path.append(gypsy_moth_dir)

In [0]:
from hdfstorehelper import HDFStoreHelper

In [0]:
# Use seaborn's "talk" plotting context for every figure in this notebook.
sns.set_context(context="talk")

In [0]:
cd /gpfs_fs/home/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle40/

## Compute pairwise LD 

```bash
vcftools --gzvcf isect.vcf.gz.sorted.gz --interchrom-hap-r2  --out isect.vcf.gz.sorted.gz
vcftools --gzvcf isect.vcf.gz.sorted.gz --hap-r2  --out isect.vcf.gz.sorted.gz
vcftools --gzvcf isect.vcf.gz.sorted.gz --freq  --out isect.vcf.gz.sorted.gz
```

In [0]:
# File names of the two LD tables read below (presumably the outputs of the
# --hap-r2 and --interchrom-hap-r2 vcftools runs listed above -- confirm).
intra, inter = (
    "isect.vcf.gz.sorted.gz.hap.ld",
    "isect.vcf.gz.sorted.gz.interchrom.hap.ld",
)

In [0]:
# Read the whitespace-delimited LD table named by `intra`: the first line is the
# header, every following line is one record.
with open(intra) as h:  # context manager closes the handle (it used to be leaked)
    intra_data = [line.strip().split() for line in h]
intra_df = pd.DataFrame(data=intra_data[1:], columns=intra_data[0])

# Convert every column except the first from text to numbers. Assigning by
# column label replaces the columns outright; `.iloc[:, 1:] = ...` writes into
# the existing object-dtype columns on recent pandas and leaves the dtype as object.
intra_numeric_cols = list(intra_df.columns[1:])
intra_df[intra_numeric_cols] = intra_df[intra_numeric_cols].apply(pd.to_numeric)

In [0]:
# Absolute distance between the two positions of each pair, computed on whole
# columns instead of one Python call per row. pd.to_numeric makes this work
# even when POS1/POS2 are still stored as strings/objects.
intra_df['dist'] = (pd.to_numeric(intra_df['POS1']) - pd.to_numeric(intra_df['POS2'])).abs()

In [0]:
# Scatter of R^2 against distance for pairs less than 1000 positions apart.
plot_data = intra_df[intra_df['dist'] < 1000]
# x / y / data are passed by keyword: recent seaborn releases reject them as
# positional arguments, while the keyword form is accepted by old and new versions.
g = sns.regplot(x="dist", y="R^2", data=plot_data, fit_reg=False)
# NOTE(review): the title counts every pair with dist < 1000, but the x axis is
# clipped to 0-500 -- confirm that mismatch is intended.
g.set(xlim=(0,500),ylim=(0,1),title="n=%d" % len(plot_data));

In [0]:
# SNP coordinates: tab-separated file without a header row, one (contig, position) per line.
z12_pos = pd.read_csv("../notimputed/isect.vcf.gz.sorted.gz.012.pos", sep="\t", header=None)
z12_pos.columns = ["contig", "position"]
# Build a "<contig>_<position>" label per SNP; it becomes the genotype-matrix column name below.
z12_pos['contig_pos'] = [
    "%s_%s" % (contig_name, pos)
    for contig_name, pos in zip(z12_pos.contig, z12_pos.position)
]
z12_pos.head()

In [0]:
# Sample identifiers, one per line and no header; the row index is labelled "sample".
z12_indv = pd.read_csv(
    "../notimputed/isect.vcf.gz.sorted.gz.012.indv", header=None
).rename_axis("sample")
z12_indv.head()

In [0]:
# Genotype matrix: tab-separated, no header. The first column is discarded
# (presumably a per-row counter written by the tool -- confirm).
z12_data = pd.read_csv(
    "../notimputed/isect.vcf.gz.sorted.gz.012", sep="\t", header=None
).drop(columns=0)
# Label columns with the SNP ids and rows with the sample ids; this assumes both
# companion files are in the same order as the matrix.
z12_data.columns = z12_pos["contig_pos"].values
z12_data.index = z12_indv[0].values
z12_data.head()

In [0]:
def get_percent_missing(snp):
    """Return the fraction (0.0-1.0) of entries in ``snp`` that equal -1.

    ``snp`` is a pandas Series of genotype codes; -1 is treated as the
    missing-call marker. Returns 0.0 when no entry is -1, including for an
    empty Series.
    """
    n_missing = (snp == -1).sum()
    if n_missing == 0:
        return 0.0
    return n_missing / len(snp)
# One missing-call fraction per SNP (column of z12_data), kept as a
# single-column frame named "missing" and indexed by SNP label.
percent_missing = z12_data.apply(get_percent_missing).to_frame(name="missing")

In [0]:
# Index labels have the form "<contig>_<position>". Split on the LAST underscore
# so that contig names containing underscores stay intact.
snp_labels = percent_missing.index
percent_missing['contig'] = [label.rpartition("_")[0] for label in snp_labels]
percent_missing['position'] = [int(label.rpartition("_")[2]) for label in snp_labels]

In [0]:
def assign_bin(val, bins):
    """Return the index of the first boundary in ``bins`` that is greater than ``val``.

    Parameters
    ----------
    val : number
        Value to place (a SNP position in this notebook).
    bins : sequence of numbers
        Upper bin boundaries, scanned in the order given.

    Returns
    -------
    int
        ``i`` such that ``bins[i]`` is the first boundary with ``val < bins[i]``,
        or ``len(bins)`` when ``val`` is greater than or equal to every boundary
        (this includes an empty ``bins``).

    The ``len(bins)`` fallback matters for the binning loop in this notebook:
    its boundaries come from ``range(min + step, max + step, step)``, so the
    largest position sits exactly on the last boundary whenever ``max - min``
    is a multiple of ``step``. Falling off the end used to return ``None``,
    which became a NaN bin that ``groupby`` silently discarded.
    """
    for i, b in enumerate(bins):
        if val < b:
            return i
    return len(bins)
        
# Thin SNPs so that each contig contributes one SNP per read_size-wide window:
# within every window, the SNP with the lowest missing fraction is retained.
read_size = 150
spaced_snps = []
for contig, contig_snps in percent_missing.groupby('contig'):
    # Sort by missingness first so the first row of each bin is the one to keep.
    contig_snps = contig_snps.sort_values("missing", ascending=True)
    positions = contig_snps.position
    # Upper window boundaries, starting one read_size past the smallest position.
    bins = list(range(positions.min() + read_size, positions.max() + read_size, read_size))
    if len(contig_snps) == 1:
        bin_ids = 0
    else:
        bin_ids = positions.apply(assign_bin, args=(bins,))
    contig_snps['bin'] = bin_ids
    # groupby keeps the sorted row order inside each bin, so index[0] is the least-missing SNP.
    spaced_snps.extend(
        bin_snps.index[0] for bin_id, bin_snps in contig_snps.groupby('bin')
    )

In [0]:
# One row per retained SNP, flagged keep=True. The index is named "contig"
# although it holds the full "<contig>_<position>" SNP labels.
spaced_df = pd.DataFrame({"contig": spaced_snps, "keep": True}).set_index("contig")
spaced_df.head()

In [0]:
# Index-on-index left join: every SNP in percent_missing is kept, and SNPs that
# are absent from spaced_df get a missing value in the "keep" column.
keep = percent_missing.join(spaced_df, how="left")

In [0]:
# Retain only the rows whose "keep" flag is True; rows with a missing flag compare False.
keep_snps = keep.loc[keep["keep"] == True]

In [0]:
# Write the retained SNPs with header and index; note the file is tab-separated
# despite its .csv extension.
keep_snps_path = "/gpfs_fs/home/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/keep_snps.csv"
keep_snps.to_csv(keep_snps_path, sep="\t", header=True, index=True)

In [0]:
# Read the whitespace-delimited LD table named by `inter` into a list of token
# lists (the header row is element 0), printing progress every 5,000,000 lines.
inter_data = []
with open(inter) as h:  # context manager closes the handle (it used to be leaked)
    for i, line in enumerate(h):
        inter_data.append(line.split())
        if i > 0 and i % 5000000 == 0:
            print("at %d" % i)

In [0]:
# The first parsed row supplies the column names; every later row is a record.
# All values are still strings at this point.
inter_df = pd.DataFrame(inter_data[1:], columns=inter_data[0])

In [0]:
# Convert the last two columns from text to numbers. Assigning by column label
# replaces the columns outright; `.iloc[:, -2:] = ...` writes into the existing
# object-dtype columns on recent pandas and leaves the dtype as object.
inter_numeric_cols = list(inter_df.columns[-2:])
inter_df[inter_numeric_cols] = inter_df[inter_numeric_cols].apply(pd.to_numeric)

In [0]:
# Display the (rows, columns) shape of the interchrom LD table as the cell output.
inter_df.shape

In [0]:
# R^2 column of the interchrom table. This rebinds plot_data, which held a
# DataFrame in the earlier scatter-plot cell, to a Series.
plot_data = inter_df.loc[:, 'R^2']

In [0]:
# Distribution of interchrom R^2 values; the title reports how many values were plotted.
# NOTE(review): distplot is deprecated in newer seaborn releases -- switch to
# histplot/displot when the environment is upgraded.
g = sns.distplot(plot_data)
# Trailing semicolon hides the repr of set()'s return value, as in the scatter-plot cell.
g.set(title="n=%d" % len(plot_data));

In [0]:
# FIXME: this cell held only the unfinished expression "g." (a SyntaxError that
# stops Restart & Run All). The fragment was removed because its intent is unknown.

In [0]:
# Print the median, max and mean of the interchrom R^2 values, one per line.
for stat_name in ("median", "max", "mean"):
    print(stat_name, getattr(inter_df['R^2'], stat_name)())