# NANUQ network reconstruction (part 1) - Running treeslider

## Import modules

In [1]:
#Imports
import toytree
import toyplot.pdf
import pandas as pd
import ipyrad.analysis as ipa

## Getting trees with treeslider

In [None]:
# Path to the sequences file (.seqs.hdf5) passed to treeslider as its input data
SEQS = "../../Raw_data/full_dataset.seqs.hdf5"

In [3]:
# Read the scaffold table from the sequences file and rank the scaffolds
# from longest to shortest.
scaff = ipa.treeslider(SEQS)
scaff_table = scaff.scaffold_table.sort_values(
    by="scaffold_length",
    ascending=False,
)
display(scaff_table)

# Keep the index labels of the 100 longest scaffolds; these are passed to
# treeslider as `scaffold_idxs`.
scaff_toUse = scaff_table.head(100).index.tolist()
print(scaff_toUse)

Unnamed: 0,scaffold_name,scaffold_length
3588,Scaffold_3589;HRSCAF=4012,47665733
28086,Scaffold_28087;HRSCAF=31587,44977913
17364,Scaffold_17365;HRSCAF=19499,36021024
62490,Scaffold_62491;HRSCAF=74262,28015097
61137,Scaffold_61138;HRSCAF=69458,27022671
...,...,...
56530,Scaffold_56531;HRSCAF=63672,1000
47705,Scaffold_47706;HRSCAF=53707,1000
2963,Scaffold_2964;HRSCAF=3308,1000
28804,Scaffold_28805;HRSCAF=32399,1000


[3588, 28086, 17364, 62490, 61137, 45956, 8703, 9533, 48436, 2632, 63334, 63141, 24082, 13033, 62358, 61104, 61047, 62117, 17617, 56542, 43757, 8389, 61746, 62952, 63518, 61744, 61298, 60968, 5035, 37875, 61000, 22506, 62326, 63551, 61247, 46026, 57237, 730, 62111, 12689, 19887, 62221, 62866, 31061, 62700, 62299, 60847, 40973, 59869, 60919, 63421, 25834, 17056, 62901, 54906, 15325, 62359, 8724, 62132, 647, 63199, 62114, 63268, 63026, 12280, 62608, 21867, 62227, 59617, 29043, 63514, 22028, 61433, 60934, 41200, 29135, 42178, 39260, 40340, 62802, 62290, 126, 63349, 62877, 63548, 61616, 61757, 16723, 7796, 63111, 58278, 4505, 50949, 61161, 22581, 10687, 34786, 38883, 60844, 47880]


### Creating imap

In [4]:
# Load the samples database (CSV) from which the imap is built
fulldata = pd.read_csv("../../Raw_data/oreinotinus_samples_database.csv")

In [None]:
# Create the imap: a dict mapping each species name (column "Lastest_SP_name")
# to the list of its sample names in the assembly.

# defaultdict(list) lets us append to a key without initialising it first
from collections import defaultdict

# create empty dict
imap = defaultdict(list)
# go row by row through the samples database
for index, row in fulldata.iterrows():
    # only keep specimens flagged as part of the full dataset
    if row["full_dataset_withAyava"]:
        # append each NameInAssembly to the key Lastest_SP_name
        # NOTE(review): this line was truncated in the notebook source; the
        # column name "NameInAssembly" is reconstructed from the original
        # comment - confirm it matches the CSV header.
        imap[row["Lastest_SP_name"]].append(row["NameInAssembly"])

# show imap
imap

### Define treeslider object and parameters

In [6]:
# Set treeslider parameters: the run works on the sequences file, restricted
# to the selected scaffolds, with samples grouped through the imap.
ts = ipa.treeslider(
    name="window2mb_msnip10_mcov9_IMAPED_100biggestScaff",
    data=SEQS,
    workdir="analysis-treeslider",
    scaffold_idxs=scaff_toUse,
    window_size=2000000,
    slide_size=2000000,  # same value as window_size - presumably non-overlapping windows; confirm
    inference_method="raxml",
    inference_args={"N": 100, "T": 1},  # these show up as `-N 100` and `-T 1` in the RAxML command
    minsnps=10,
    mincov=9,
    imap=imap,
#     minmap= {i: 0 for i in imap},
    consensus_reduce=True
)

In [7]:
# Set parallelization parameters for the run: 10 cores, 1 thread
# NOTE(review): how ipyrad combines 'cores' and 'threads' is not visible here - confirm
ts.ipcluster['cores'] = 10
ts.ipcluster['threads'] = 1

In [8]:
# Show the inference (RAxML) command built from the parameters set above
ts.show_inference_command()

/home/carlos/anaconda3/bin/raxmlHPC-PTHREADS-AVX2 -f a -T 1 -m GTRGAMMA -n ... -w ... -s ... -p 54321 -N 100 -x 12345


In [None]:
# Run treeslider
# NOTE(review): force=True presumably overwrites existing results saved under
# the same name/workdir - confirm before re-running this cell.

ts.run(auto=True, force=True, show_cluster=True)

In [10]:
# Quick look at the first rows of the results table (one row per window,
# with the inferred newick tree in the `tree` column)
ts.tree_table.head()

Unnamed: 0,scaffold,start,end,sites,snps,samples,missing,tree
0,126,0,2000000,11331,320,41,0.29,"(new_sp_1:0.00470358,microphyllum:0.00441279,(acutifolium:0.00430497,((sulcatum:0.00417629,(fuscum:0.00778976,blandum:0.00324333)45:0.00184229)9:0.000563973,((disjunctum:0.00737338,(jucundum:0.00461906,lautum:0.00460504)41:0.0010021)9:0.000530903..."
1,126,2000000,4000000,11234,241,41,0.3,"(hartwegii:0.00357987,discolor:0.00230865,(stellato-tomentosum:0.00863191,((obtusatum:0.00692095,(costaricanum:0.00403002,(seemenii:0.00317119,((dumatorum:0.00352708,(subsessile:0.00334279,((hallii:0.00225204,tinoides_2:0.00504569)16:0.000760494,..."
2,126,4000000,6000000,8688,155,41,0.32,"(acutifolium:0.00293168,new_sp_1:0.00387266,(lautum:0.00569614,(new_sp_2:0.00224158,(membranaceum:0.00285351,(((disjunctum:0.00289303,fuscum:0.00460144)30:0.000816493,(blandum:0.00139228,((stellato-tomentosum:0.00453805,jucundum:0.00404081)14:0.0..."
3,126,6000000,8000000,10057,240,41,0.27,"(microphyllum:0.00305879,blandum:0.00207079,((new_sp_1:0.0023432,membranaceum:0.00285532)57:0.000803369,((fuscum:0.00502644,new_sp_2:0.00281474)25:0.0010865,(acutifolium:0.00343854,(alpinum:0.00241574,((dentatum:0.0120275,(stenocalyx:0.00327767,(..."
4,647,0,2000000,7767,207,41,0.34,"(acutifolium:0.00448521,new_sp_1:0.00635065,(microphyllum:0.00416238,((villosum:0.000993127,alpinum:1e-06)99:0.00622221,((blandum:0.00634458,(stenocalyx:0.00580542,((loeseneri:0.00499851,(tiliaefolium:0.0068288,microcarpum:0.00600745)39:0.0010616..."


In [11]:
# Count the windows (rows of the results table) that have more than 300 SNPs
len(
    ts.tree_table.loc[ts.tree_table.snps > 300, "tree"]
)

357

In [13]:
# Save the trees into a single newick file (one tree per line), useful for
# Astral and for NANUQ. Only windows with more than `minsnps` SNPs are written.
minsnps = 300
# Drop rows with any missing value (presumably windows without an inferred tree)
noNANtsTable = ts.tree_table.dropna()
# The context manager closes the file even if the write raises
with open(f"raxmlTrees_{ts.name}_{minsnps}snps.nwk", "w") as outfile:
    outfile.write("\n".join(noNANtsTable[noNANtsTable.snps > minsnps].tree.tolist()))

## Next step is described in the R script: `2_NANUQ_execution.R`