## Import libraries

In [60]:
import datetime
import os

import pandas as pd
from sqlalchemy import create_engine

from igf_data.igfdb.igfTables import Base, Project, User, Platform, Sample, Experiment, Seqrun, Run
from igf_data.igfdb.igfTables import Project_attribute, Sample_attribute, Experiment_attribute, Run_attribute
from igf_data.igfdb.baseadaptor import BaseAdaptor
from igf_data.igfdb.projectadaptor import ProjectAdaptor
from igf_data.igfdb.useradaptor import UserAdaptor
from igf_data.igfdb.sampleadaptor import SampleAdaptor
from igf_data.igfdb.platformadaptor import PlatformAdaptor
from igf_data.igfdb.seqrunadaptor import SeqrunAdaptor
from igf_data.igfdb.experimentadaptor import ExperimentAdaptor
from igf_data.igfdb.runadaptor import RunAdaptor

## Define database connection

### MySQL config

In [2]:
# Connection settings for the MySQL database.
# Host, user, password and database name can be overridden through the
# IGF_DBHOST / IGF_DBUSER / IGF_DBPASS / IGF_DBNAME environment variables so
# that real credentials never need to be written into the notebook. The
# fallbacks are the values that were previously hard-coded in this cell.
dbparams = {
    'dbhost': os.environ.get('IGF_DBHOST', '0.0.0.0'),
    'dbuser': os.environ.get('IGF_DBUSER', 'igf'),
    'dbpass': os.environ.get('IGF_DBPASS', 'igf123'),
    'dbname': os.environ.get('IGF_DBNAME', 'igfdb'),
    'driver': 'mysql',
    'connector': 'pymysql',
}
# Optional extra key (presumably forwarded to the SQLAlchemy engine - TODO confirm):
#   'engine_config': {'echo': True}

### SQLite config

In [3]:
# Alternative SQLite settings; uncomment to use them instead of the MySQL configuration.
#dbparams={'dbname':'test.db'}

## Create base adaptor instance

In [4]:
# Build the base adaptor from the connection settings. Its engine and
# session class are reused by the cells that follow.
# NOTE: `session_class` is not referenced again in the visible cells, which
# read `base.session_class` directly.
base = BaseAdaptor(**dbparams)
session_class = base.get_session_class()

## Delete existing database tables and create new ones

In [38]:
# WARNING: destructive. Drops every table registered on Base's metadata from
# the connected database and then recreates them empty, so running this cell
# (including via "Run All") wipes any previously loaded records.
Base.metadata.drop_all(base.engine)
Base.metadata.create_all(base.engine)

## Sequencing platform details for facility

In [14]:
# One record per sequencing instrument available at the facility.
platform_data = [
    {
        'platform_igf_id': 'ILM4K_001',
        'model_name': 'HISEQ4000',
        'vendor_name': 'ILLUMINA',
        'software_name': 'RTA',
        'software_version': 'RTA2',
    },
    {
        'platform_igf_id': 'ILMNS_001',
        'model_name': 'NEXTSEQ',
        'vendor_name': 'ILLUMINA',
        'software_name': 'RTA',
        'software_version': 'RTA2',
    },
    {
        'platform_igf_id': 'ILMMS_001',
        'model_name': 'MISEQ',
        'vendor_name': 'ILLUMINA',
        'software_name': 'RTA',
        'software_version': 'RTA1.18.64',
    },
]

In [39]:
# Platform adaptor sharing the base adaptor's session class; open a session
# and store the platform records defined in `platform_data`.
pl = PlatformAdaptor(session_class=base.session_class)
pl.start_session()
pl.store_platform_data(data=platform_data)

### Check sequencing platforms

In [44]:
# Read back every Platform row through the adaptor's open session.
pl_data = pl.fetch_records(
    query=pl.session.query(Platform),
)

In [94]:
# Display the platform records keyed by platform_id (result is shown, not stored).
pl_data.set_index('platform_id')

Unnamed: 0_level_0,platform_igf_id,model_name,vendor_name,software_name,software_version,date_created
platform_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,ILM4K_001,HISEQ4000,ILLUMINA,RTA,RTA2,2017-07-11 15:38:27
2,ILMNS_001,NEXTSEQ,ILLUMINA,RTA,RTA2,2017-07-11 15:38:27
3,ILMMS_001,MISEQ,ILLUMINA,RTA,RTA1.18.64,2017-07-11 15:38:27


In [None]:
# Close the session that was opened with pl.start_session().
pl.close_session()

## Get Project, User and Sample Information

Read all the information from the project submission form (modification required).

## Define and store project data

In [46]:
# Project records. The recorded outputs show project_deadline, comments and
# special_requirement ending up in the project attribute table rather than
# in the project table.
data = [
    {
        'project_igf_id': 'IGFP0003',
        'project_name': 'project_C',
        'description': 'Its project C',
        'project_deadline': 'Before August 2017',
        'comments': 'Some samples are treated with drug X',
    },
    {
        'project_igf_id': 'IGFP0004',
        'project_name': 'project_D',
        'description': 'Its project D',
        'project_deadline': 'Second week of July',
        'special_requirement': 'Signal files for RNA samples',
    },
]

In [61]:
# Project adaptor sharing the base adaptor's session class; open a session
# and store the project records held in `data`.
pa = ProjectAdaptor(session_class=base.session_class)
pa.start_session()
pa.store_project_and_attribute_data(data)

### Check projects

In [48]:
# Read back every Project row through the project adaptor's session.
p_data = pa.fetch_records(
    query=pa.session.query(Project),
)

In [93]:
# Display the project records keyed by project_id (result is shown, not stored).
p_data.set_index('project_id')

Unnamed: 0_level_0,project_igf_id,project_name,start_timestamp,description,deliverable
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,IGFP0003,project_C,2017-07-11 15:41:04,Its project C,FASTQ
2,IGFP0004,project_D,2017-07-11 15:41:04,Its project D,FASTQ


In [62]:
# Read back every Project_attribute row.
# Uses the adaptor's own session (`pa.session`): the previous `p_session`
# name is not defined anywhere in the notebook and raised NameError on a
# fresh kernel.
pa_data = pa.fetch_records(query=pa.session.query(Project_attribute))

In [92]:
# Display the project attributes keyed by project_attribute_id.
pa_data.set_index('project_attribute_id')

Unnamed: 0_level_0,attribute_name,attribute_value,project_id
project_attribute_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,comments,Some samples are treated with drug X,1
2,project_deadline,Before August 2017,1
3,special_requirement,Signal files for RNA samples,2
4,project_deadline,Second week of July,2


## Define and store User data

In [50]:
# User records. NOTE: this rebinds `data`, which previously held the project
# records. The passwords are plain-text demo values; the recorded user table
# shows a hashed value and a salt instead of these strings.
data = [
    {
        'user_igf_id': 'IGFC0001',
        'name': 'User A',
        'email_id': 'usera@ic.ac.uk',
        'username': 'usera',
        'password': 'passa',
    },
    {
        'user_igf_id': 'IGFC0002',
        'name': 'User B',
        'email_id': 'userb@ic.ac.uk',
        'username': 'userb',
        'hpc_username': 'buser',
        'password': 'passb',
    },
]

### Store and check users

In [51]:
# User adaptor sharing the base adaptor's session class; open a session and
# store the user records held in `data`.
ua = UserAdaptor(session_class=base.session_class)
ua.start_session()
ua.store_user_data(data=data)

In [52]:
# Read back every User row through the user adaptor's session.
u_data = ua.fetch_records(
    query=ua.session.query(User),
)

In [91]:
# Display the user records keyed by user_id. The output includes the
# password hash and salt columns, so review it before sharing the notebook.
u_data.set_index('user_id')

Unnamed: 0_level_0,user_igf_id,name,email_id,username,hpc_username,category,status,date_created,password,encryption_salt
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,IGFC0001,User A,usera@ic.ac.uk,usera,,HPC_USER,ACTIVE,2017-07-11 15:41:11,05d5eed1b768809b852e4253595f9cf7196f8ee67a0396...,82f86726f6470615b0fe92712f1919738498c4742ccf08...
2,IGFC0002,User B,userb@ic.ac.uk,userb,buser,HPC_USER,ACTIVE,2017-07-11 15:41:11,0349abfa31a687b27b6a4cd1d9762df5b74b773bba75fe...,507f0928ce31e13b4a7a3a846efe99fed82a03bf6d9338...


In [54]:
# Close the session that was opened with ua.start_session().
ua.close_session()

## Link users to relevant projects

In [55]:
# Project-to-user links. `data_authority` is set only on some links; the
# second link omits the key entirely.
project_user_data = [
    {
        'project_igf_id': 'IGFP0003',
        'user_igf_id': 'IGFC0001',
        'data_authority': True,
    },
    {
        'project_igf_id': 'IGFP0003',
        'user_igf_id': 'IGFC0002',
    },
    {
        'project_igf_id': 'IGFP0004',
        'user_igf_id': 'IGFC0002',
        'data_authority': True,
    },
]

In [56]:
# Store the project/user links; relies on the project adaptor session opened
# earlier still being open.
pa.assign_user_to_project(data=project_user_data)

In [57]:
# Fetch the project/user information; the recorded output shows project and
# user columns together with data_authority.
pu_data=pa.get_project_user_info()

### Check project users

In [90]:
# Display the project/user links keyed by project_id. project_id repeats
# when a project has several users, so the index is not unique.
pu_data.set_index('project_id')

Unnamed: 0_level_0,project_igf_id,project_name,start_timestamp,description,deliverable,user_id,user_igf_id,name,email_id,username,hpc_username,category,status,date_created,password,encryption_salt,data_authority
project_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,IGFP0003,project_C,2017-07-11 15:41:04,Its project C,FASTQ,1,IGFC0001,User A,usera@ic.ac.uk,usera,,HPC_USER,ACTIVE,2017-07-11 15:41:11,05d5eed1b768809b852e4253595f9cf7196f8ee67a0396...,82f86726f6470615b0fe92712f1919738498c4742ccf08...,T
1,IGFP0003,project_C,2017-07-11 15:41:04,Its project C,FASTQ,2,IGFC0002,User B,userb@ic.ac.uk,userb,buser,HPC_USER,ACTIVE,2017-07-11 15:41:11,0349abfa31a687b27b6a4cd1d9762df5b74b773bba75fe...,507f0928ce31e13b4a7a3a846efe99fed82a03bf6d9338...,
2,IGFP0004,project_D,2017-07-11 15:41:04,Its project D,FASTQ,2,IGFC0002,User B,userb@ic.ac.uk,userb,buser,HPC_USER,ACTIVE,2017-07-11 15:41:11,0349abfa31a687b27b6a4cd1d9762df5b74b773bba75fe...,507f0928ce31e13b4a7a3a846efe99fed82a03bf6d9338...,T


In [64]:
# Close the session that was opened with pa.start_session().
pa.close_session()

## Define and store samples

In [65]:
# Sample records for project IGFP0003. The recorded outputs show sample_tube
# and sample_library ending up in the sample attribute table.
sample_data = [
    {
        'sample_igf_id': 'IGFS0001',
        'taxon_id': '9606',
        'scientific_name': 'Homo sapiens',
        'common_name': 'human',
        'donor_anonymized_id': 'donor_001',
        'description': 'Sample A from donor 001',
        'phenotype': 'Healthy',
        'sex': 'FEMALE',
        'project_igf_id': 'IGFP0003',
        'sample_tube': 'tube001',
        'sample_library': 'IGFS0001_20170628',
    },
    {
        'sample_igf_id': 'IGFS0002',
        'taxon_id': '9606',
        'scientific_name': 'Homo sapiens',
        'common_name': 'human',
        'donor_anonymized_id': 'donor_002',
        'description': 'Sample B from donor 002',
        'phenotype': 'Cancer',
        'sex': 'FEMALE',
        'project_igf_id': 'IGFP0003',
        'sample_tube': 'tube002',
        'sample_library': 'IGFS0002_20170628',
    },
]

In [67]:
# Sample adaptor sharing the base adaptor's session class; open a session
# and store the sample records defined in `sample_data`.
sa = SampleAdaptor(session_class=base.session_class)
sa.start_session()
sa.store_sample_and_attribute_data(data=sample_data)

### Check Samples

In [68]:
# Read back every Sample row through the sample adaptor's session.
s_data = sa.fetch_records(
    sa.session.query(Sample),
)

In [89]:
# Display the sample records keyed by sample_id (result is shown, not stored).
s_data.set_index('sample_id')

Unnamed: 0_level_0,sample_igf_id,taxon_id,scientific_name,common_name,donor_anonymized_id,description,phenotype,sex,status,biomaterial_type,cell_type,tissue_type,cell_line,date_created,project_id
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,IGFS0001,9606,Homo sapiens,human,donor_001,Sample A from donor 001,Healthy,FEMALE,ACTIVE,UNKNOWN,,,,2017-07-11 15:44:13,1
2,IGFS0002,9606,Homo sapiens,human,donor_002,Sample B from donor 002,Cancer,FEMALE,ACTIVE,UNKNOWN,,,,2017-07-11 15:44:13,1


In [70]:
# Read back every Sample_attribute row.
# Uses the adaptor's own session (`sa.session`): the previous `sa_session`
# name is not defined anywhere in the notebook and raised NameError on a
# fresh kernel.
sa_data = sa.fetch_records(sa.session.query(Sample_attribute))

In [88]:
# Display the sample attributes keyed by sample_attribute_id.
sa_data.set_index('sample_attribute_id')

Unnamed: 0_level_0,attribute_name,attribute_value,sample_id
sample_attribute_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,sample_tube,tube001,1
2,sample_library,IGFS0001_20170628,1
3,sample_tube,tube002,2
4,sample_library,IGFS0002_20170628,2


In [72]:
# Close the session that was opened with sa.start_session().
sa.close_session()

## Find new sequencing runs and load information

* Look for new sequencing runs (seqrun) in the target directory
* Steps after a new run directory is created
   1. Read the Samplesheet.csv file and get the Project and Samples details
   2. Define new Experiments and Runs based on samplesheet data
    
      Library:    Unique library information. The library for each sequencing run is considered unique,
                  e.g. the library prepared for sample_A on 07/07/2017 is sample_A_20170707
      Experiment: A unique combination of Sample + Library + Sequencing platform type
                   e.g. a. sample_A has a library sample_A_20170707 that was sequenced on HiSeq 4000 twice. All the data for sample_A from the two sequencing runs will be combined under a single experiment
                        b. Library sample_A_20170707 was sequenced twice on two different platforms; it will have two separate experiments, one for each sequencing run
       Run:       The fastq files for a sample in a particular lane are considered a single run
                  e.g. if sample_A was present only on lane_1 of a HiSeq run, it will have only one run. For a NextSeq sequencing run, it will have 4 runs
   3. Generate the fastq files and assign them to the relevant 'Run's after the demultiplexing event

## Load sequencing Run details

In [73]:
# Sequencing runs, each tied to a platform through platform_igf_id
# (ILM4K_001 and ILMMS_001 are both defined in the platform records).
seqrun_data = [
    {
        'seqrun_igf_id': '170627_K00345_0012_AHJJKTBBXX',
        'flowcell_id': 'AHJJKTBBXX',
        'platform_igf_id': 'ILM4K_001',
    },
    {
        'seqrun_igf_id': '170627_M03291_0071_000000000-D0V30',
        'flowcell_id': 'D0V30',
        'platform_igf_id': 'ILMMS_001',
    },
]

In [74]:
# Seqrun adaptor sharing the base adaptor's session class; open a session
# and store the sequencing run records defined in `seqrun_data`.
sra = SeqrunAdaptor(session_class=base.session_class)
sra.start_session()
sra.store_seqrun_data(data=seqrun_data)

### Check sequencing runs

In [111]:
# Read back every Seqrun row through the seqrun adaptor's session.
sra_data = sra.fetch_records(
    sra.session.query(Seqrun),
)

In [86]:
# Display the sequencing runs keyed by seqrun_id.
# NOTE(review): the recorded output lists platform_id 1 for both runs, although
# the second run was submitted with platform_igf_id 'ILMMS_001' (platform_id 3
# in the platform table output) - verify the platform lookup when storing seqruns.
sra_data.set_index('seqrun_id')

Unnamed: 0_level_0,seqrun_igf_id,reject_run,date_created,flowcell_id,platform_id
seqrun_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,170627_K00345_0012_AHJJKTBBXX,N,2017-07-11 15:47:49,AHJJKTBBXX,1
2,170627_M03291_0071_000000000-D0V30,N,2017-07-11 15:47:49,D0V30,1


In [79]:
# Close the session that was opened with sra.start_session().
sra.close_session()

## Define and store experiment

In [80]:
# One experiment per sample/library pair, both on the HISEQ4000 platform.
exp_data = [
    {
        'experiment_igf_id': 'IGFS0001_20170628_IH4',
        'library_name': 'IGFS0001_20170628',
        'library_strategy': 'RNA-SEQ',
        'experiment_type': 'TOTAL-RNA',
        'library_layout': 'PAIRED',
        'project_igf_id': 'IGFP0003',
        'sample_igf_id': 'IGFS0001',
        'platform_name': 'HISEQ4000',
    },
    {
        'experiment_igf_id': 'IGFS0002_20170628_IH4',
        'library_name': 'IGFS0002_20170628',
        'library_strategy': 'RNA-SEQ',
        'experiment_type': 'TOTAL-RNA',
        'library_layout': 'PAIRED',
        'project_igf_id': 'IGFP0003',
        'sample_igf_id': 'IGFS0002',
        'platform_name': 'HISEQ4000',
    },
]

In [95]:
# Experiment adaptor sharing the base adaptor's session class; open a session
# and store the experiment records defined in `exp_data`.
# NOTE(review): the store method is named store_project_and_attribute_data
# even though it is called on ExperimentAdaptor with experiment records -
# confirm this is the intended API name.
ea = ExperimentAdaptor(session_class=base.session_class)
ea.start_session()
ea.store_project_and_attribute_data(data=exp_data)

### Check experiments

In [96]:
# Read back every Experiment row through the experiment adaptor's session.
e_data = ea.fetch_records(
    ea.session.query(Experiment),
)

In [85]:
# Display the experiment records keyed by experiment_id.
e_data.set_index('experiment_id')

Unnamed: 0_level_0,experiment_igf_id,project_id,sample_id,library_name,library_source,library_strategy,experiment_type,library_layout,status,date_created,platform_name
experiment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,IGFS0001_20170628_IH4,1,1,IGFS0001_20170628,UNKNOWN,RNA-SEQ,TOTAL-RNA,PAIRED,ACTIVE,2017-07-11 15:51:01,HISEQ4000
2,IGFS0002_20170628_IH4,1,2,IGFS0002_20170628,UNKNOWN,RNA-SEQ,TOTAL-RNA,PAIRED,ACTIVE,2017-07-11 15:51:01,HISEQ4000


In [97]:
# Read back every Experiment_attribute row.
# Uses the adaptor's own session (`ea.session`): the previous `ea_session`
# name is not defined anywhere in the notebook and raised NameError on a
# fresh kernel.
ea_data = ea.fetch_records(query=ea.session.query(Experiment_attribute))

In [98]:
# Display the experiment attributes; the recorded output has no rows.
ea_data

Unnamed: 0,experiment_attribute_id,attribute_name,attribute_value,experiment_id


In [99]:
# Close the session that was opened with ea.start_session().
ea.close_session()

## Define and store run

In [100]:
# One run per experiment/lane. The recorded outputs show run_barcode ending
# up in the run attribute table.
# NOTE(review): both runs reference seqrun '170627_M03291_0071_000000000-D0V30',
# which was registered with the MiSeq platform id, while their experiments
# declare platform HISEQ4000 - confirm this pairing is intended for the demo.
run_data = [
    {
        'run_igf_id': 'IGFS0001_20170628_IH4_L1',
        'experiment_igf_id': 'IGFS0001_20170628_IH4',
        'seqrun_igf_id': '170627_M03291_0071_000000000-D0V30',
        'lane_number': '1',
        'run_barcode': 'BARCODE_X',
    },
    {
        'run_igf_id': 'IGFS0002_20170628_IH4_L2',
        'experiment_igf_id': 'IGFS0002_20170628_IH4',
        'seqrun_igf_id': '170627_M03291_0071_000000000-D0V30',
        'lane_number': '2',
        'run_barcode': 'BARCODE_Y',
    },
]

In [101]:
# Run adaptor sharing the base adaptor's session class; open a session and
# store the run records defined in `run_data`.
ra = RunAdaptor(session_class=base.session_class)
ra.start_session()
ra.store_run_and_attribute_data(data=run_data)

### Check runs

In [102]:
# Read back every Run row through the run adaptor's session.
r_data = ra.fetch_records(
    query=ra.session.query(Run),
)

In [107]:
# Display the run records keyed by run_id.
r_data.set_index('run_id')

Unnamed: 0_level_0,run_igf_id,experiment_id,seqrun_id,status,lane_number,date_created
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,IGFS0001_20170628_IH4_L1,1,2,ACTIVE,1,2017-07-11 15:58:40
2,IGFS0002_20170628_IH4_L2,2,2,ACTIVE,2,2017-07-11 15:58:40


In [104]:
# Read back every Run_attribute row through the run adaptor's session.
ra_data = ra.fetch_records(
    query=ra.session.query(Run_attribute),
)

In [109]:
# Display the run attributes keyed by run_attribute_id.
ra_data.set_index('run_attribute_id')

Unnamed: 0_level_0,attribute_name,attribute_value,run_id
run_attribute_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,run_barcode,BARCODE_X,1
2,run_barcode,BARCODE_Y,2


In [106]:
# Close the session that was opened with ra.start_session().
ra.close_session()

## Check combined data for projects

In [115]:
# Open a session on the base adaptor and fetch one row per run, carrying the
# project, sample, experiment and run identifiers. The query selects from
# Project and joins Sample, Experiment and Run in that order.
base.start_session()
all_data = base.fetch_records(
    base.session
    .query(
        Project.project_igf_id,
        Sample.sample_igf_id,
        Experiment.experiment_igf_id,
        Run.run_igf_id,
    )
    .join(Sample)
    .join(Experiment)
    .join(Run)
)

In [118]:
# Display the combined records keyed by project_igf_id; the id repeats for
# every run of a project, so the index is not unique.
all_data.set_index('project_igf_id')

Unnamed: 0_level_0,sample_igf_id,experiment_igf_id,run_igf_id
project_igf_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
IGFP0003,IGFS0001,IGFS0001_20170628_IH4,IGFS0001_20170628_IH4_L1
IGFP0003,IGFS0002,IGFS0002_20170628_IH4,IGFS0002_20170628_IH4_L2


In [117]:
# Close the session that was opened with base.start_session().
base.close_session()