In [None]:
%reload_ext watermark
%matplotlib inline

import os
from metapool.metapool import *
from metapool import make_sample_sheet, IGMManifest
%watermark -i -v -iv -m -h -p metapool,sample_sheet,openpyxl -u

# Knight Lab metagenomics sample sheet generator 

### What is it?

This Jupyter Notebook allows you to automatically generate sample sheets for metagenomic sequencing. 


### Here's how it should work.

You'll start out with a **basic plate map** (platemap.tsv), which just links each sample to its appropriate row and column.

You can use this google sheet template to generate your plate map:

https://docs.google.com/spreadsheets/d/1xPjB6iR3brGeG4bm2un4ISSsTDxFw5yME09bKqz0XNk/edit?usp=sharing

Next you'll automatically assign dual indexes to each sample in order to produce a **sample sheet** (samplesheet.csv) that you can give directly to IGM for sequencing. 

## Step 1: read in plate map

**Enter the correct path to the plate map file**. This will serve as the plate map for relating all subsequent information.

In [None]:
# Path to the tab-separated plate map that all later steps are keyed on.
plate_map_fp = './test_data/Plate_Maps/Finrisk 33-36_plate_map.tsv'

# Report a bad path right away. The message must interpolate the path itself;
# the previous code referenced the undefined name `file`, which raised a
# NameError instead of printing the warning.
if not os.path.isfile(plate_map_fp):
    print("Problem! %s is not a path to a valid file" % plate_map_fp)

**Read in the plate map**. It should look something like this:

```
Sample	Row	Col	Blank
GLY_01_012	A	1	False
GLY_14_034	B	1	False
GLY_11_007	C	1	False
GLY_28_018	D	1	False
GLY_25_003	E	1	False
GLY_06_106	F	1	False
GLY_07_011	G	1	False
GLY_18_043	H	1	False
GLY_28_004	I	1	False
```

**Make sure there are no duplicate IDs**. If each sample doesn't have a different name, an **error** will be thrown and you won't be able to generate a sample sheet.

In [None]:
# Read the plate map inside a `with` block so the file handle is closed as
# soon as parsing finishes (the bare `open(...)` call left it open).
# NOTE(review): assumes read_plate_map_csv fully consumes the file before
# returning the DataFrame — confirm against metapool.
with open(plate_map_fp, 'r') as plate_map_file:
    plate_df = read_plate_map_csv(plate_map_file)

plate_df.head()

# Assign barcode combination

This portion of the notebook will assign a unique combination of dual indices to each sample.

As inputs, it requires:
1. A plate map dataframe (from previous step)
2. A comma-delimited (CSV) index combination file, relating index combinations, i5 and i7 index values, and i5 and i7 index locations

The workflow then:
1. reads in the index combo list
2. assigns indices per sample

## Step 2: Read in index combo list

This is a file that contains every possible i5 and i7 barcode combo on a separate line,
along with plate and well location information. It should look something like this:

```
index combo,index combo seq,i5 name,i5 sequence,i5 well,i5 plate,i7 name,i7 sequence,i7 well,i7 plate
0,ACCGACAAACGTTACC,iTru5_01_A,ACCGACAA,A1,iTru5_plate,iTru7_101_01,ACGTTACC,A1,iTru7_plate
1,AGTGGCAACTGTGTTG,iTru5_01_B,AGTGGCAA,B1,iTru5_plate,iTru7_101_02,CTGTGTTG,A2,iTru7_plate
2,CACAGACTTGAGGTGT,iTru5_01_C,CACAGACT,C1,iTru5_plate,iTru7_101_03,TGAGGTGT,A3,iTru7_plate
3,CGACACTTGATCCATG,iTru5_01_D,CGACACTT,D1,iTru5_plate,iTru7_101_04,GATCCATG,A4,iTru7_plate
4,GACTTGTGGCCTATCA,iTru5_01_E,GACTTGTG,E1,iTru5_plate,iTru7_101_05,GCCTATCA,A5,iTru7_plate
5,GTGAGACTAACAACCG,iTru5_01_F,GTGAGACT,F1,iTru5_plate,iTru7_101_06,AACAACCG,A6,iTru7_plate
6,GTTCCATGACTCGTTG,iTru5_01_G,GTTCCATG,G1,iTru5_plate,iTru7_101_07,ACTCGTTG,A7,iTru7_plate
7,TAGCTGAGCCTATGGT,iTru5_01_H,TAGCTGAG,H1,iTru5_plate,iTru7_101_08,CCTATGGT,A8,iTru7_plate
8,CTTCGCAATGTACACC,iTru5_02_A,CTTCGCAA,I1,iTru5_plate,iTru7_101_09,TGTACACC,A9,iTru7_plate
```

In [None]:
# Path to the CSV listing every i5/i7 index combination.
index_combo_fp = './test_output/iTru/temp_iTru_combos.csv'

# Report a bad path right away. The message must interpolate the path itself;
# the previous code referenced the undefined name `file`, which raised a
# NameError instead of printing the warning.
if not os.path.isfile(index_combo_fp):
    print("Problem! %s is not a path to a valid file" % index_combo_fp)

In [None]:
# Load the index combination table and preview its first rows.
index_combos = pd.read_csv(filepath_or_buffer=index_combo_fp)
index_combos.head(n=5)

## Step 3: Assign index combo

This will pick a set of index combos from the index combo list, one for each sample in the `plate_df` DataFrame.

Specify the PrimerPlate you used to assign barcodes. 

In [None]:
# Number of the PrimerPlate used to assign barcodes (1-based).
primerplate_combo = 1

# Each PrimerPlate number shifts the starting position in the combo list by
# 384 entries, so PrimerPlate 1 starts at position 0.
starting_combo = 384 * (primerplate_combo - 1)

# Request one index combo per sample, then reset the result's index before
# joining it to the plate map.
n_samples = len(plate_df['Sample'])
indices = assign_index(n_samples, index_combos, start_idx=starting_combo)
indices = indices.reset_index()

# Column-wise join; note that re-running this cell appends the index columns again.
plate_df = pd.concat([plate_df, indices], axis=1)

plate_df.head()

# Make HiSeq sample sheet

This workflow takes the pooled sample information and writes an Illumina sample sheet that can be given directly to the sequencing center. 

As inputs, this notebook requires:
1. A plate map DataFrame (from previous step)

The workflow:
1. formats sample names as bcl2fastq-compatible
2. formats sample data
3. sets values for sample sheet fields and formats sample sheet.
4. writes the sample sheet to a file

## Step 1: Format sample names to be bcl2fastq-compatible

bcl2fastq requires *only* alphanumeric, hyphen, and underscore characters. We'll replace all other characters
with underscores and add the bcl2fastq-compatible names to the DataFrame.

In [None]:
# Map every original sample name through bcl_scrub_name and store the result
# in its own column, leaving the 'Sample' column untouched.
scrubbed_ids = plate_df['Sample'].map(bcl_scrub_name)
plate_df['sample sheet Sample_ID'] = scrubbed_ids

plate_df.head()

## Step 2: format sample sheet data

This step formats the data columns appropriately for the sample sheet, using the values we've calculated previously.

The newly-created bcl2fastq-compatible names will be in the **`Sample ID`** and **`Sample Name`** columns. The
original sample names will be in the **`Description`** column.

Modify **`lanes`** to indicate which lanes this pool will be sequenced on.

**Project Name** and **Project Plate** values will be placed in the **`Sample_Project`** and **`Sample_Name`**
columns, respectively.

**`sequencer`** is important for making sure the i5 index is in the correct orientation for demultiplexing. `HiSeq4000`, `HiSeq3000`, `NextSeq`, and `MiniSeq` all require reverse-complemented i5 index sequences. If you enter one of these exact strings in for `sequencer`, it will revcomp the i5 sequence for you.

`HiSeq2500`, `MiSeq`, and `NovaSeq` will not revcomp the i5 sequence. 

In [None]:
# Project name shared by the plate map column and both metadata sections below.
project_name = 'Example_Project_1'

# Project-level values applied to every row of the plate map.
plate_df['Project Name'] = project_name
plate_df['Project Plate'] = 'Example Plate 1'

# Sequencer model and the lanes this pool will be sequenced on.
sequencer = 'HiSeq4000'
lanes = [1, 2, 3, 4]

# Per-project bioinformatics settings.
bioinformatics_entry = {
    'Sample_Project': project_name,
    'QiitaID': '1',
    'BarcodesAreRC': 'False',
    'ForwardAdapter': 'GATCGGAAGAGCACACGTCTGAACTCCAGTCAC',
    'ReverseAdapter': 'GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT',
    'HumanFiltering': 'True',
    # Knight Lab Nextera is also valid
    # (note carried over from the original cell; presumably refers to this field)
    'library_construction_protocol': 'Knight Lab KAPA HyperPlus',
    'experiment_design_description': 'Example_Project_1_Description',
    'contains_replicates': False
}

# Per-project contact details.
contact_entry = {
    'Sample_Project': project_name,
    # non-admin contacts who want to know when the sequences
    # are available in Qiita
    'Email': 'jonsan@gmail.com'
}

# Full metadata passed to make_sample_sheet: one entry per project in the
# list-valued sections, plus sheet-level fields.
metadata = {
    'Bioinformatics': [bioinformatics_entry],
    'Contact': [contact_entry],
    'Assay': 'Metagenomic',
    'SheetType': 'standard_metag',
    'SheetVersion': '100'
}

sheet = make_sample_sheet(metadata, plate_df, sequencer, lanes)

Validate the contents of the sample sheet.
This method will return False if the sample-sheet contains Errors or True if it doesn't (Warnings don't count).
Warning and/or Error messages will be displayed below.

In [None]:
# Validate the sheet and show the returned flag as the cell output.
sheet_is_valid = sheet.validate_and_scrub_sample_sheet()
sheet_is_valid

## Step 4: Write the sample sheet to file

In [None]:
# Destination path for the sample sheet .csv
sample_sheet_fp = './test_output/SampleSheets/YYYY_MM_DD_FinRisk_33-36_samplesheet.csv'

# Warn (but do not stop) when a file is already present at that path.
sample_sheet_exists = os.path.isfile(sample_sheet_fp)
if sample_sheet_exists:
    print("Warning! This file exists already.")

In [None]:
# Write the sample sheet to sample_sheet_fp; mode 'w' overwrites any existing file.
with open(sample_sheet_fp,'w') as f:
    sheet.write(f)
    
# Preview the written file with shell commands: first 30 lines, an ellipsis
# marker, then the last 15 lines.
!head -n 30 {sample_sheet_fp}
!echo ...
!tail -n 15 {sample_sheet_fp}

# Create a Manifest file for IGM

Make sure you update the correct values for the attributes below. If you need to update the defaults, type `fest.` and hit tab to see what other values can be changed. The number of samples will be determined based on the number of samples in the sample sheet, but you have to enter the value for the number of lanes, and the names of the pools (one per line).

In [None]:
# Output path for the manifest spreadsheet.
manifest_fp = './test_output/SampleSheets/YYYY_MM_DD_PI_Sequencing_Runs_Manifest_2021.xlsx'

fest = IGMManifest()

# The sample count is taken from the sample sheet; the lane count is entered by hand.
fest.number_of_samples = len(sheet)
fest.number_of_lanes = 1

# remove plate naming
# One pool name per entry; add further names to the list as needed.
pool_names = [
    'FILL IN POOL NAME',
    # 'FILL IN SECOND POOL NAME ...',
]
fest.pools = pool_names

fest.write(manifest_fp)