In [3]:
%reload_ext watermark
from metapool import KLSampleSheet, quiet_validate_and_scrub_sample_sheet
%watermark -i -v -iv -m -h -p metapool,sample_sheet,openpyxl -u

Last updated: 2023-04-21T12:06:48.586344-07:00

Python implementation: CPython
Python version       : 3.9.16
IPython version      : 8.12.0

metapool    : 0+untagged.137.g8537cf6
sample_sheet: 0.13.0
openpyxl    : 3.1.2

Compiler    : Clang 14.0.6 
OS          : Darwin
Release     : 21.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 16
Architecture: 64bit

Hostname: Applejack.lan



# Knight Lab Sample Sheet Validation

This notebook is designed to validate and troubleshoot sample sheets of externally generated plates.

The steps are as follows:

1. Parse sample sheet.
1. Check that all the required columns in the `Data` section are present.
1. Check that the `Bioinformatics` and `Contact` sections are present.
1. Validate and scrub sample identifiers so they are compliant with Illumina's `bcl2fastq` software.
    - Automatically replace non-allowed characters with underscores.
    - Flag non-unique sample identifiers.
1. Check that lane values are not empty.
1. Check that projects in the `Data`, `Bioinformatics` and `Contact` sections are all valid.
1. Validate the Qiita study identifier suffix at the end of every project name.
1. Save the parsed file in a compliant format.

**Note**: warning and error messages (text highlighted in red) will inform you of any problems that may come up.

**Enter the correct path to the sample sheet you want to validate**: replace the path to `good-sample-sheet.csv` with the location of the sheet you want to validate.

In [6]:
# Location of the sample sheet to validate; point this at your own file.
# Example sheet referred to in the instructions above:
#   '../metapool/tests/data/good-sample-sheet.csv'
sheet_path = '/Users/ccowart/Gail/2.FinRisk_12142_171211_GA_FIX_I8.csv'
sheet = KLSampleSheet(sheet_path)

# Run the quiet validation; it hands back the collected messages and a
# second object bound to `valid_sheet`.
msgs, valid_sheet = quiet_validate_and_scrub_sample_sheet(sheet)



If there are any error messages, please correct the sample sheet and re-run the cell above. Once you are happy with the results, run the cell below; if problems remain, you will see an exception below.

In [7]:
# Messages returned by the validation call above; an empty list means none
# were reported.
msgs

[]

In [8]:
# List the column names found in the sheet's samples.
# NOTE(review): presumably meant to be compared against
# _KL_SAMPLE_SHEET_DATA_COLUMNS -- confirm.
sheet.all_sample_keys

['Lane',
 'Sample_ID',
 'Sample_Name',
 'Sample_Plate',
 'Sample_Well',
 'I7_Index_ID',
 'index',
 'I5_Index_ID',
 'index2',
 'Sample_Project',
 'Well_description']

In [81]:
# Maps a column name as it may appear in the loaded sheet (all lowercase)
# to the spelling it should be renamed to. Used by the cells below both to
# detect and to rewrite these columns.
d2 = {'well_description': 'Well_description', 'sample_plate': 'Sample_Plate'}


In [82]:
# Only the first sample is inspected: every sample is assumed to carry the
# same columns in the same order.
first_store = sheet._samples[0]._store

# Each entry of the store is a (column name, value) pair; collect one message
# per column whose name has a replacement listed in d2.
warning_messages = [
    "column '%s' changed to '%s'" % (name, d2[name])
    for name, _ in first_store.values()
    if name in d2
]

# Show the messages when there are any, otherwise report that nothing matched.
print(warning_messages if warning_messages else 'all clear')

["column 'sample_plate' changed to 'Sample_Plate'", "column 'well_description' changed to 'Well_description'"]


In [83]:
if warning_messages:
    # At least one column name needs replacing, so rebuild the store of every
    # Sample object with the corrected names.
    import collections

    for sample in sheet._samples:
        # Keep the lowercased lookup keys and the values untouched; only the
        # column name half of each pair is swapped when d2 lists a replacement.
        d = collections.OrderedDict(
            (lowercased_key, (d2.get(k, k), v))
            for lowercased_key, (k, v) in sample._store.items()
        )
        sample._store = d



        

        
        
    


In [84]:
# Show the first sample's internal mapping of lowercased key ->
# (column name, value), to check the column names after the renaming cell.
sheet._samples[0]._store

OrderedDict([('lane', ('Lane', '2')),
             ('sample_id', ('Sample_ID', '401201325-5')),
             ('sample_name', ('Sample_Name', '4012013255')),
             ('sample_plate', ('Sample_Plate', 'Finrisk_77-80')),
             ('sample_well', ('Sample_Well', 'J12')),
             ('i7_index_id', ('I7_Index_ID', 'iTru7_110_07')),
             ('index', ('index', 'ACAGCAAC')),
             ('i5_index_id', ('I5_Index_ID', 'iTru5_24_E')),
             ('index2', ('index2', 'AGATACGG')),
             ('sample_project', ('Sample_Project', 'FinRisk_12142')),
             ('well_description',
              ('Well_description', 'FinRisk_12142_Finrisk_77-80_J12'))])

In [47]:
# NOTE(review): `d` is the per-sample mapping left behind by the renaming
# loop (the last sample it processed). This looks like a leftover debugging
# cell with an out-of-order execution count -- consider removing it.
d

OrderedDict([('lane', ('Lane', '2')),
             ('sample_id', ('Sample_ID', '401206491-2')),
             ('sample_name', ('Sample_Name', '4012064912')),
             ('sample_plate', ('Sample_Plate', 'Finrisk_77-80')),
             ('sample_well', ('Sample_Well', 'J14')),
             ('i7_index_id', ('I7_Index_ID', 'iTru7_111_04')),
             ('index', ('index', 'TCAGACGA')),
             ('i5_index_id', ('I5_Index_ID', 'iTru5_19_E')),
             ('index2', ('index2', 'GTATTCCG')),
             ('sample_project', ('Sample_Project', 'FinRisk_12142')),
             ('well_description',
              ('Well_description', 'FinRisk_12142_Finrisk_77-80_J14'))])

In [27]:
# NOTE(review): this cell fails -- KLSampleSheet has no `sample_header`
# attribute (see the AttributeError below). `sheet.all_sample_keys`, used
# earlier in this notebook, lists the column names; consider removing this
# cell.
sheet.sample_header

AttributeError: 'KLSampleSheet' object has no attribute 'sample_header'