# Haplotype clustering and networking
- This notebook sets up the utilities and generates figures for the paper.

In [1]:
%run hapclust_utils.ipynb

## setup data

In [None]:
# Open the haplotype callset read-only; the file name indicates it covers 2L:2358158-2431617.
callset = h5py.File('../data/ag1000g.phase1.AR3.1.haplotypes.specific_regions.2L_2358158_2431617.h5', mode='r')
# Region of interest on 2L, with the same coordinates as in the callset file name.
region_vgsc = SeqFeature('2L', 2358158, 2431617)
# Load the genotype calls and convert them to a haplotype array.
genotypes = allel.GenotypeArray(callset['2L/calldata/genotype'])
haplotypes = genotypes.to_haplotypes()
# Variant positions, used to translate genomic coordinates into row indices.
pos = allel.SortedIndex(callset['2L/variants/POS'])
# Restrict the haplotype array to the variants that fall within the region.
loc = pos.locate_range(region_vgsc.start, region_vgsc.end)
h_vgsc = haplotypes[loc]
# Positions of the two focal variants (referred to as 995S and 995F in this notebook).
pos_995S = 2422651
pos_995F = 2422652
# Boolean masks over haplotypes: True where the call at the focal position equals 1.
loc_995S = haplotypes[pos.locate_key(pos_995S)] == 1
loc_995F = haplotypes[pos.locate_key(pos_995F)] == 1
# Keep only the haplotype columns (axis=1) selected by each mask.
h_vgsc_995F = h_vgsc.compress(loc_995F, axis=1)
h_vgsc_995S = h_vgsc.compress(loc_995S, axis=1)
# Haplotype IDs: two per sample, the sample ID suffixed with b'a' and b'b'.
sample_ids = callset['2L']['samples'][:]
hap_ids = np.array(list(itertools.chain(*[[s + b'a', s + b'b'] for s in sample_ids])))
hap_ids_995F = hap_ids[loc_995F]
hap_ids_995S = hap_ids[loc_995S]
# Per-haplotype metadata table; its 'population' column is indexed with the same masks,
# which assumes one row per haplotype in the same order as the callset -- TODO confirm.
tbl_haplotypes = etl.fromtsv('../data/ag1000g.phase1.AR3.1.haplotypes.meta.txt')
hap_pops = np.array(tbl_haplotypes.values('population'))
hap_pops_995S = hap_pops[loc_995S]
hap_pops_995F = hap_pops[loc_995F]

# Map each population value to a color; need to use named colors for graphviz.
pop_colors = {
    'AOM': 'brown',
    'BFM': 'firebrick1',
    'GWA': 'goldenrod1',
    'GNS': 'cadetblue1',
    'BFS': 'deepskyblue',
    'CMS': 'dodgerblue3',
    'UGS': 'palegreen',
    'GAS': 'olivedrab',
    'KES': 'grey47',
    'colony': 'black'
}

# One color per selected haplotype, looked up from its population.
hap_colors_995S = np.array([pop_colors[p] for p in hap_pops_995S])
hap_colors_995F = np.array([pop_colors[p] for p in hap_pops_995F])

## hierarchical clustering

In [None]:
# Default plot:
# - cuts the tree at height 2 (so max distance within each cluster is 1),
# - highlights all clusters,
# - labels all clusters.
# The trailing semicolon suppresses the repr of the returned values.
fig_haplotypes_clustered(h_vgsc_995S, dpi=150);

In [None]:
# Change the orientation of the plot.
fig_haplotypes_clustered(h_vgsc_995S, orientation='left', dpi=150);

In [None]:
# Try a different cut height (5 here).
fig_haplotypes_clustered(h_vgsc_995S, cut_height=5, dpi=150);

In [None]:
# Highlight only clusters over a certain size by passing an integer.
fig_haplotypes_clustered(h_vgsc_995S, dpi=150, highlight_clusters=5);

In [None]:
# Manually choose which clusters to highlight by passing a list (here clusters 2 and 9).
fig_haplotypes_clustered(h_vgsc_995S, dpi=150, highlight_clusters=[2, 9]);

In [None]:
# Turn off cluster labels.
fig_haplotypes_clustered(h_vgsc_995S, dpi=150, highlight_clusters=5, label_clusters=False);

In [None]:
# Use favourite colours (and a custom alpha) for the highlighted clusters.
fig_haplotypes_clustered(
    h_vgsc_995S,
    dpi=150,
    highlight_clusters=5,
    label_clusters=False,
    highlight_colors=['green', 'darkgreen', 'green', 'lightgreen', 'green'],
    highlight_alpha=.8,
);

## return values

In [None]:
# The function returns five values: the figure, the dendrogram axes, the frequency
# axes, the cluster spans and leaf_obs (presumably per-leaf observations -- TODO confirm).
fig, ax_dend, ax_freq, cluster_spans, leaf_obs = fig_haplotypes_clustered(h_vgsc_995S, dpi=150, highlight_clusters=5, label_clusters=5);

### customising axes

In [None]:
# e.g. use the returned axes objects (ax_dend, ax_freq) to customise titles and labels.
fig, ax_dend, ax_freq, cluster_spans, leaf_obs = fig_haplotypes_clustered(h_vgsc_995S, dpi=150, label_clusters=5, highlight_clusters=5, subplot_pad=4)
ax_dend.set_title('haplotype structure (L995S)')
ax_dend.set_ylabel('distance (no. SNPs)')
# NOTE(review): 'blahblahblah' is a placeholder x-label -- replace it before using this figure.
ax_dend.set_xlabel('blahblahblah')
# The trailing semicolon suppresses the repr of the returned label object.
ax_freq.set_ylabel('haplotype frequency');

### accessing information about clusters

In [None]:
# cluster_spans is useful for accessing information about each cluster;
# it is indexed by cluster and each entry unpacks to three values (see the next cell).
cluster_spans

In [None]:
# e.g. the cluster labelled 17 in the plot
cluster_idx = 17
# Each entry unpacks to (dend_start, dend_stop, indices of the haplotypes in the cluster).
dend_start, dend_stop, cluster_hap_indices = cluster_spans[cluster_idx]

In [None]:
# Start and stop of the cluster -- presumably positions along the dendrogram axis; TODO confirm.
dend_start, dend_stop

In [None]:
# Indices of the haplotypes in the cluster.
cluster_hap_indices

In [None]:
# Number of haplotypes in the cluster.
len(cluster_hap_indices)

In [None]:
# The indices are relative to the haplotype array passed to the function
# (h_vgsc_995S) -- not relative to the full set of 1530 haplotypes!
# Extract the haplotypes of cluster 17 (columns, axis=1).
cluster_haps = h_vgsc_995S.take(cluster_hap_indices, axis=1)
cluster_haps

### outputting haplotype data in other formats

In [None]:
# Names for the haplotypes in the cluster, taken from the 995S haplotype IDs
# with the same indices used to extract the haplotypes.
cluster_hap_ids = hap_ids_995S.take(cluster_hap_indices)
# Show how many IDs were selected and what one of them looks like.
cluster_hap_ids.shape, cluster_hap_ids[0]

In [None]:
# Cast the calls to single-byte strings ('S1') and transpose, so that each row
# of `sequences` is one haplotype.
sequences = cluster_haps.astype('S1').T
sequences.shape

In [None]:
# Write the cluster out in FASTA format.
# cut_height is only used to build the file name here; it is not passed to the
# clustering call, so it must be kept in step with the cut height actually used
# (the default-plot comment earlier in this notebook says the default cut is at height 2).
cut_height = 2
fasta_fn = '../data/hapclust.995S.cut{}.cluster{}.fasta'.format(cut_height, cluster_idx)
# One record per haplotype, named by its haplotype ID; mode='w' overwrites any existing file.
allel.io.write_fasta(fasta_fn, sequences=list(sequences), names=cluster_hap_ids, mode='w', width=2000)

In [None]:
# Make a PHYLIP-format file for input to TCS.
# The FASTA file written above is read back so that names and sequences come
# from the file on disk.
fasta_pasta = pyfasta.Fasta(fasta_fn)

phylip_fn = '../data/hapclust.995S.cut{}.cluster{}.phy'.format(cut_height, cluster_idx)
# Use a context manager so the file is flushed and closed even if a write
# fails; previously the handle was never closed.
with open(phylip_fn, 'w') as thefile:
    # Header line: "<number of sequences> <sequence length>".
    thefile.write('{} {}\n'.format(sequences.shape[0], sequences.shape[1]))
    for name in fasta_pasta.keys():
        # Drop the first '-' from the record name (only the first two
        # '-'-separated parts are kept).
        name_parts = name.split('-')
        # Recode the calls as nucleotide letters: '0' -> 'g', '1' -> 'c'.
        seq = str(fasta_pasta[name]).replace('0', 'g').replace('1', 'c')
        thefile.write(name_parts[0] + name_parts[1] + '__' + seq + '\n')

### output all the clusters as .phy to compare with AM's using TCS

In [None]:
def _write_cluster_fasta_and_phy(h, hap_ids, cluster_spans, cluster_idx, path_stem):
    """Write one cluster's haplotypes to ``path_stem + '.fasta'`` and ``path_stem + '.phy'``.

    Parameters
    ----------
    h : haplotype array that was passed to ``fig_haplotypes_clustered``;
        haplotypes are selected along axis 1.
    hap_ids : array of haplotype IDs, in the same order as the columns of ``h``.
    cluster_spans : the cluster spans returned by ``fig_haplotypes_clustered``.
    cluster_idx : index of the cluster to export.
    path_stem : output path without extension.
    """
    # The third element of each span holds the indices of the cluster's haplotypes.
    _, _, cluster_hap_indices = cluster_spans[cluster_idx]
    cluster_haps = h.take(cluster_hap_indices, axis=1)
    cluster_hap_ids = hap_ids.take(cluster_hap_indices)
    # Single-byte strings, transposed so that each row is one haplotype.
    sequences = cluster_haps.astype('S1').T

    fasta_fn = path_stem + '.fasta'
    allel.io.write_fasta(fasta_fn, sequences=list(sequences), names=cluster_hap_ids, mode='w', width=2000)

    # Read the FASTA back and re-emit it as PHYLIP.
    fasta_pasta = pyfasta.Fasta(fasta_fn)
    phylip_fn = path_stem + '.phy'
    # Context manager guarantees the file is flushed and closed; previously
    # every handle opened in the loops was leaked.
    with open(phylip_fn, 'w') as phy_file:
        # Header line: "<number of sequences> <sequence length>".
        phy_file.write('{} {}\n'.format(sequences.shape[0], sequences.shape[1]))
        for name in fasta_pasta.keys():
            # Only the first two '-'-separated parts of the name are kept.
            name_parts = name.split('-')
            # Recode the calls as nucleotide letters: '0' -> 'g', '1' -> 'c'.
            seq = str(fasta_pasta[name]).replace('0', 'g').replace('1', 'c')
            phy_file.write(name_parts[0] + name_parts[1] + '__' + seq + '\n')


def output_all_phy():
    """Export selected 995S and 995F clusters as FASTA and PHYLIP files.

    Clusters the 995S haplotypes with the default cut height and the 995F
    haplotypes with ``cut_height=4`` (drawing one figure for each), then writes
    ``../data/hapclust.<variant>.cut<height>.cluster<idx>.fasta`` and ``.phy``
    for a hard-coded list of cluster indices per variant.
    """
    # 995S: default cut height; file names are labelled 'cut2'.
    _, _, _, cluster_spans, _ = fig_haplotypes_clustered(h_vgsc_995S, dpi=150, label_clusters=5, highlight_clusters=5)
    for cluster_idx in 2, 9, 12, 14, 17:
        _write_cluster_fasta_and_phy(
            h_vgsc_995S, hap_ids_995S, cluster_spans, cluster_idx,
            '../data/hapclust.995S.cut2.cluster{}'.format(cluster_idx))

    # 995F: cut at height 4.
    _, _, _, cluster_spans, _ = fig_haplotypes_clustered(h_vgsc_995F, dpi=150, label_clusters=5, cut_height=4, highlight_clusters=5)
    for cluster_idx in 4, 7, 8, 12, 16:
        _write_cluster_fasta_and_phy(
            h_vgsc_995F, hap_ids_995F, cluster_spans, cluster_idx,
            '../data/hapclust.995F.cut4.cluster{}'.format(cluster_idx))

            
        

In [None]:
# Write the FASTA and PHYLIP files for the selected 995S and 995F clusters.
output_all_phy()