#### We want to access all the cells in LIMS that have failed due to input resistance and map them back to Nathan's tSNE plot.

1. What cells are we searching for in LIMS?
2. What is needed to fit into tSNE?
3. Do I want features for LIMS or file paths?



In [1]:
import pg8000
import pandas as pd

In [2]:
#code from Agata

def _connect(user="limsreader", host="limsdb2", database="lims2", password="limsro", port=5432):
    conn = pg8000.connect(user=user, host=host, database=database, password=password, port=port)
    return conn, conn.cursor()

def _select(cursor, query):
    cursor.execute(query)
    columns = [ d[0] for d in cursor.description ]
    return [ dict(zip(columns, c)) for c in cursor.fetchall() ]

def limsquery(query, user="limsreader", host="limsdb2", database="lims2", password="limsro", port=5432):
    """A function that takes a string containing a SQL query, connects to the LIMS database and outputs the result."""
    conn, cursor = _connect(user, host, database, password, port)
    try:
        results = _select(cursor, query)
    finally:
        cursor.close()
        conn.close()
    return results


### What are the elements named in LIMS?
#### This is an easy way to search through the tables in LIMS to find out what columns are called

In [11]:
my_query = "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES"
my_result = limsquery(my_query)
#first_element = my_result[0]
#print sorted(first_element.keys())

In [14]:
my_result

[{'table_name': u'pg_type'},
 {'table_name': u'pg_roles'},
 {'table_name': u'pg_group'},
 {'table_name': u'pg_user'},
 {'table_name': u'pg_rules'},
 {'table_name': u'pg_views'},
 {'table_name': u'pg_tables'},
 {'table_name': u'pg_matviews'},
 {'table_name': u'pg_indexes'},
 {'table_name': u'pg_stats'},
 {'table_name': u'pg_settings'},
 {'table_name': u'pg_locks'},
 {'table_name': u'pg_cursors'},
 {'table_name': u'pg_available_extensions'},
 {'table_name': u'pg_available_extension_versions'},
 {'table_name': u'pg_prepared_xacts'},
 {'table_name': u'pg_prepared_statements'},
 {'table_name': u'pg_seclabels'},
 {'table_name': u'pg_timezone_abbrevs'},
 {'table_name': u'pg_timezone_names'},
 {'table_name': u'pg_stat_all_tables'},
 {'table_name': u'pg_stat_xact_all_tables'},
 {'table_name': u'pg_stat_sys_tables'},
 {'table_name': u'pg_stat_xact_sys_tables'},
 {'table_name': u'pg_stat_user_tables'},
 {'table_name': u'pg_stat_xact_user_tables'},
 {'table_name': u'pg_statio_all_tables'},
 {'tabl

In [84]:
my_query = "SELECT * FROM specimens LIMIT 10"
my_result = limsquery(my_query)
first_element = my_result[0]
print sorted(first_element.keys())

['alignment3d_id', 'barcode', 'biophysical_model_state', 'carousel_well_name', 'cell_depth', 'cell_label', 'cell_prep_id', 'cell_reporter_id', 'cortex_layer_id', 'created_at', 'created_by', 'data', 'donor_id', 'ephys_cell_plan_id', 'ephys_neural_tissue_plan_id', 'ephys_qc_result', 'ephys_roi_result_id', 'ephys_start_time_sec', 'external_specimen_name', 'facs_well_id', 'flipped_specimen_id', 'frozen_at', 'hemisphere_id', 'histology_well_name', 'id', 'location_id', 'name', 'normalization_group_id', 'operation_id', 'parent_id', 'parent_x_coord', 'parent_y_coord', 'parent_z_coord', 'patched_cell_container', 'pinned_radius', 'plane_of_section_id', 'postmortem_interval_id', 'preparation_method_id', 'priority', 'project_id', 'reference_space_id', 'rna_integrity_number', 'specimen_preparation_method_id', 'specimen_set_id', 'storage_directory', 'structure_id', 'task_flow_id', 'tissue_ph', 'tissue_processing_id', 'updated_at', 'updated_by', 'x_coord', 'y_coord']


In [124]:
my_query = "SELECT * FROM well_known_files LIMIT 10"
my_result = limsquery(my_query)
first_element = my_result[0]
print first_element.keys()

['published_at', 'workflow_state', 'created_at', 'updated_at', 'filename', 'storage_directory', 'file_source_id', 'attachable_type', 'content_type', 'well_known_file_type_id', 'attachable_id', 'id', 'size']


### This will be easiest to use if we stick it in a dataframe

In [40]:
#more code written by Agata

def get_lims_dataframe(query):
    '''Return a dataframe with lims query'''
    result = limsquery(query)
    try:
        data_df = pd.DataFrame(data=result, columns=result[0].keys())
    except IndexError:
        print "Could not find results for your query."
        data_df = pd.DataFrame()
    return data_df

This is the basics of how a SQL query looks

SELECT TableA.*, TableB.*, TableC.*, TableD.*
FROM TableA
    JOIN TableB
        ON TableB.aID = TableA.aID
    JOIN TableC
        ON TableC.cID = TableB.cID
WHERE DATE(TableC.date)=date(now()) 

This allows us to pull in info from 4 different tables since everything we want is not stored in one place, but there are overlaps to link things up. 10 digit IDs generally are your link

### We can shorten ephys_roi_results to err and specimens to s, search for all of our features and use <br>JOIN to join the search results from ephys_roi_results and specimens following the outline from above

In [36]:
lims_query = "SELECT err.id, err.recording_date, err.failed_bad_rs, \
err.failed_electrode_0, err.failed_clogged_pipette, err.failed_no_seal, \
err.failed_other, err.workflow_state, s.name, s.ephys_roi_result_id, s.project_id, proj.id, proj.code, \
f.storage_directory, f.attachable_id \
FROM ephys_roi_results err JOIN specimens s ON s.ephys_roi_result_id = err.id \
JOIN projects proj ON s.project_id = proj.id \
JOIN well_known_files f ON err.id = f.attachable_id \
WHERE (failed_bad_rs = 'TRUE' \
AND err.failed_electrode_0 = 'FALSE' \
AND err.failed_clogged_pipette = 'FALSE' \
AND err.failed_no_seal = 'FALSE' \
AND err.failed_other = 'FALSE' \
AND err.workflow_state != 'manual_passed') \
AND (proj.code = 'T301' OR proj.code = 'T301x') \
AND f.filename LIKE '%%ephys_features.json'"

lims_df = get_lims_dataframe(lims_query)
lims_df.tail()

Unnamed: 0,ephys_roi_result_id,code,recording_date,failed_bad_rs,workflow_state,name,attachable_id,failed_clogged_pipette,storage_directory,failed_other,project_id,failed_electrode_0,id,failed_no_seal
40,579497253,T301,2017-04-04 15:22:26,True,manual_failed,Vipr2-IRES2-Cre;Ai14-310516.06.02.01,579497253,False,/projects/mousecelltypes/vol1/prod997/Ephys_Ro...,False,305094322,False,305094322,False
41,579645792,T301,2017-04-05 12:56:54,True,manual_failed,Ndnf-IRES2-dgCre;Ai14-311164.05.02.01,579645792,False,/projects/mousecelltypes/vol1/prod997/Ephys_Ro...,False,305094322,False,305094322,False
42,580812829,T301,2017-04-11 09:44:52,True,manual_failed,Nkx2-1-CreERT2;Ai14 (IVSCC)-311670.04.02.01,580812829,False,/projects/mousecelltypes/vol1/prod1002/Ephys_R...,False,305094322,False,305094322,False
43,580900050,T301,2017-04-11 14:44:26,True,manual_failed,Chrna2-Cre_OE25;Pvalb-T2A-Dre;Ai66-309757.05.0...,580900050,False,/projects/mousecelltypes/vol1/prod1003/Ephys_R...,False,305094322,False,305094322,False
44,580901813,T301,2017-04-11 15:21:05,True,manual_failed,Chrna2-Cre_OE25;Pvalb-T2A-Dre;Ai66-309757.05.0...,580901813,False,/projects/mousecelltypes/vol1/prod1003/Ephys_R...,False,305094322,False,305094322,False


## Searching for the features json seems to limit us in our search so lets remove that

In [27]:
lims_query = "SELECT err.id, err.recording_date, err.failed_bad_rs, \
err.failed_electrode_0, err.failed_clogged_pipette, err.failed_no_seal, \
err.failed_other, err.workflow_state, s.name, s.ephys_roi_result_id, s.project_id, proj.id, proj.code \
FROM ephys_roi_results err JOIN specimens s ON s.ephys_roi_result_id = err.id \
JOIN projects proj ON s.project_id = proj.id \
WHERE (failed_bad_rs = 'TRUE' \
AND err.failed_electrode_0 = 'FALSE' \
AND err.failed_clogged_pipette = 'FALSE' \
AND err.failed_no_seal = 'FALSE' \
AND err.failed_other = 'FALSE' \
AND err.workflow_state != 'manual_passed') \
AND (proj.code = 'T301' OR proj.code = 'T301x')"

lims_df = get_lims_dataframe(lims_query)
lims_df.tail()

Unnamed: 0,ephys_roi_result_id,code,recording_date,failed_bad_rs,workflow_state,name,failed_clogged_pipette,failed_other,project_id,failed_electrode_0,id,failed_no_seal
67,601831626,T301,2017-07-10 21:03:11,True,manual_failed,Nos1-CreERT2;Ai14-333287.03.01.01,False,False,305094322,False,305094322,False
68,602660919,T301,2017-07-13 22:16:27,True,manual_failed,Scnn1a-Tg2-Cre;Ai14-333165.04.02.01,False,False,305094322,False,305094322,False
69,604693290,T301x,2017-07-26 22:50:46,True,manual_failed,Rbp4-Cre_KL100;Ai14-337193.04.02.01,False,False,300080300,False,300080300,False
70,605537141,T301x,2017-07-28 20:37:50,True,manual_failed,Gad2-IRES-Cre;Ai14-336420.04.02.01,False,False,300080300,False,300080300,False
71,605538385,T301x,2017-07-28 21:32:11,True,manual_failed,Gad2-IRES-Cre;Ai14-336420.02.02.01,False,False,300080300,False,300080300,False


# The donors table didn't seem to be helpful in giving us the genotype so <br>let's just strip the labtracks.id off of names and use the count function in pandas

In [37]:
lims_df['genotype'] = lims_df['name'].apply(lambda x: x[:-16])
lims_df.groupby(['genotype']).size()


genotype
Chat-IRES-Cre-neo;Ai14                 5
Chrna2-Cre_OE25;Ai14(IVSCC)            1
Chrna2-Cre_OE25;Pvalb-T2A-Dre;Ai66     2
Esr2-IRES2-Cre;Ai14                    1
Htr3a-Cre_NO152;Ai14                   1
Ndnf-IRES2-dgCre;Ai14                  3
Nkx2-1-CreERT2;Ai14 (IVSCC)            5
Nos1-CreERT2;Ai14                      1
Nos1-CreERT2;Sst-IRES-FlpO;Ai65        2
Oxtr-T2A-Cre;Ai14                      1
Pvalb-IRES-Cre;Ai14                   11
Scnn1a-Tg3-Cre;Ai14                    1
Sst-IRES-Cre;Ai14                      1
Vip-IRES-Cre;Ai14                      6
Vip-IRES-Cre;Ai14(IVSCC)               1
Vipr2-IRES2-Cre;Ai14                   3
dtype: int64

# Quick practice on finding another dataset in LIMS

In [45]:
lims_query = "SELECT err.id, err.recording_date, s.name, s.ephys_roi_result_id, proj.code \
FROM ephys_roi_results err JOIN specimens s ON s.ephys_roi_result_id = err.id \
JOIN projects proj ON s.project_id = proj.id \
AND proj.code = 'mMPATCH'"
lims_df = get_lims_dataframe(lims_query)
lims_df.tail()

Unnamed: 0,ephys_roi_result_id,code,id,recording_date,name
14,645416634,mMPATCH,645416634,,Tlx3-Cre_PL56;Ai140;Sst-IRES-FlpO;Ai65F-354156...
15,645416707,mMPATCH,645416707,,Sst-IRES-Cre;Ai140;Pvalb-2A-FlpO;Ai65F-352872....
16,645416721,mMPATCH,645416721,,Vip-IRES-Cre;Ai140;Pvalb-T2A-FlpO;Ai65F-353601...
17,645416770,mMPATCH,645416770,,Sst-IRES-Cre;Ai140;Pvalb-2A-FlpO;Ai65F-351634....
18,645416815,mMPATCH,645416815,,Vip-IRES-Cre;Ai140;Pvalb-T2A-FlpO;Ai65F-351614...


In [102]:
my_query = "SELECT * FROM rna_amplifications LIMIT 10"
my_result = limsquery(my_query)
first_element = my_result[0]
print first_element.keys()

['umi_length_nt', 'amplified_quantity_ng', 'name', 'run_date', 'percent_cdna_longer_than_400bp', 'created_at', 'updated_at', 'failed', 'ercc_diluted_pool_id', 'ercc_pool_ul', 'rna_amplification_method_id', 'cycles', 'id', 'rna_amplification_set_id']


In [103]:
my_query = "SELECT ra.amplified_quantity_ng, ra.name, ra.run_date, ra.percent_cdna_longer_than_400bp, ra.failed, \
ra.cycles, ra.id, rai.sample_id, cell.id, cell.name, cell.created_at, cell.patched_cell_container \
FROM specimens cell \
LEFT JOIN rna_amplification_inputs rai ON rai.sample_id = cell.id \
LEFT JOIN rna_amplifications ra ON ra.id = rai.rna_amplification_id \
WHERE run_date > '2017-01-01'"
my_result = limsquery(my_query)
first_element = my_result[0]
data_df = pd.DataFrame(data=my_result, columns=my_result[0].keys())
data_df.head()

Unnamed: 0,amplified_quantity_ng,name,run_date,percent_cdna_longer_than_400bp,created_at,failed,patched_cell_container,sample_id,cycles,id
0,0.0,H16.06.012.11.03.01,2017-01-04 08:00:00,0.0,2016-09-29 21:13:38.114713,True,P1S4_160929_003_A01,548408304,19,548408304
1,1.1048,H16.06.012.11.03.02,2017-01-04 08:00:00,67.414917,2016-09-29 21:38:34.754088,False,P1S4_160929_004_A01,548417970,19,548417970
2,0.3869,H16.06.012.11.03.03,2017-01-04 08:00:00,37.167227,2016-09-29 22:29:08.487196,True,P1S4_160929_005_A01,548436554,19,548436554
3,0.5355,H16.06.012.11.03.04,2017-01-04 08:00:00,42.1662,2016-09-29 22:48:46.884752,False,P1S4_160929_006_A01,548444214,19,548444214
4,0.5713,H16.06.012.11.03.05,2017-01-04 08:00:00,40.994224,2016-09-29 23:09:13.059575,True,P1S4_160929_007_A01,548452482,19,548452482


In [108]:
len(data_df[data_df['failed'] == False])

2286

In [92]:
data_df.cycles.unique()

array([19, 18, 21], dtype=int64)

In [97]:
data_df[data_df['cycles'] == 21]['created_at'].min()

Timestamp('2017-04-21 17:58:36.870815')

In [115]:
shiny.head()

Unnamed: 0.1,Unnamed: 0,cl,score,marker.num,resolution.index,h.score,markers,resolution.index.percentile,exp_component_name,ar_id,...,Post_Patch,Total_time,Ephys_time,Retraction_time,Fill.Date,Rig_Operator,Rig_number,retraction_pressure,pilot_name,End_pipetteR
0,P6S4_170808_252_A01,Meis2,0.99,19,1.0,0.99,Meis2:Meis2 Scgn Amigo2 Filip1l Grm2 Lgr6 G630...,100,PS0907_E1-50_S34,623358639,...,0,,,,,,,,,
1,P2S4_170301_057_A01,Lamp5 Rgs12_3,1.0,26,1.0,1.0,Lamp5 Rgs12_3:Sema3e Tnnt1 Ntn1 Rxfp1 Fam19a2 ...,100,msPSX170321_E1_50_S11,578474417,...,0,,,,,,,,,
2,P1S4_160406_097_A01,Lamp5 Rgs12_3,0.93,18,1.0,0.93,Lamp5 Rgs12_3:Sema3e Tnnt1 Rxfp1 Ptprt LOC1010...,100,PSX160502_E1_50_S8_and_PSX160502_E2_50_S8_and_...,523786138,...,partial nucleated,,,,,,,,,
3,P2S4_170310_057_A01,Lamp5 Rgs12_3,0.83,19,1.0,0.83,Lamp5 Rgs12_3:Sema3e Tnnt1 Rxfp1 Ptprt Gria1 L...,100,msPSX170321_E1_50_S49,578474341,...,0,,,,,,,,,
4,P2S4_170922_055_A01,Lamp5 Rgs12_3,0.81,22,1.0,0.81,Lamp5 Rgs12_3:Sema3e Tnnt1 Rxfp1 Fam19a2 Gria1...,100,SM-GE63O_S009_E1-50,645671846,...,nucleus_present,,34.0,,8/31/2017,rustym,6.0,-30.0,,


In [106]:
shiny = pd.read_csv('\\\\allen\\programs\\celltypes\\workgroups\\rnaseqanalysis\\shiny\\patch_seq\\mouse_patchseq_VISp_20171204_collapsed90\\mapping.df.with.bp.90.csv')

In [113]:
df_new = pd.merge(data_df, shiny, left_on = 'patched_cell_container', right_on = 'Unnamed: 0')

In [120]:
df_new[df_new['Retraction_time'].notnull()]

Unnamed: 0,amplified_quantity_ng_x,name,run_date,percent_cdna_longer_than_400bp_x,created_at,failed,patched_cell_container_x,sample_id,cycles,id,...,Post_Patch,Total_time,Ephys_time,Retraction_time,Fill.Date,Rig_Operator,Rig_number,retraction_pressure,pilot_name,End_pipetteR
1094,2.548507,Chrna2-Cre_OE25;Ai14-351067.04.01.01,2017-06-10 07:00:00,0.486,2017-10-02 22:00:02.974338,True,P2S4_171002_053_A01,639695961,21,639695961,...,nucleus_absent,553.0,38.0,200.0,9/25/2017,rustym,5,-32.5,Nucleated Patch - Retraction Pressure,10
1095,5.740796,Chrna2-Cre_OE25;Ai14-351067.04.01.02,2017-06-10 07:00:00,0.618,2017-10-02 22:19:32.090600,False,P2S4_171002_054_A01,639701690,21,639701690,...,nucleus_present,501.0,45.0,96.0,9/25/2017,rustym,5,-30.0,Nucleated Patch - Retraction Pressure,1000
1096,15.401426,Chrna2-Cre_OE25;Ai14-351067.04.01.03,2017-06-10 07:00:00,0.715,2017-10-02 22:49:46.515181,False,P2S4_171002_055_A01,639715846,21,639715846,...,nucleus_present,404.0,28.0,118.0,9/25/2017,rustym,5,-32.0,Nucleated Patch - Retraction Pressure,1000
1097,11.077913,Chrna2-Cre_OE25;Ai14-351067.04.01.04,2017-06-10 07:00:00,0.274,2017-10-02 23:30:06.043189,True,P2S4_171002_056_A01,639725628,21,639725628,...,nucleus_absent,735.0,42.0,381.0,9/25/2017,rustym,5,-35.0,,40
1101,12.071228,Chrna2-Cre_OE25;Ai14-351067.04.02.01,2017-06-10 07:00:00,0.430,2017-10-02 21:14:50.513959,False,P8S4_171002_354_A01,639674270,21,639674270,...,nucleus_present,569.0,14.0,131.0,9/25/2017,lindsayn,7,-30.0,,1100
1102,20.745630,Chrna2-Cre_OE25;Ai14-351067.04.02.02,2017-06-10 07:00:00,0.433,2017-10-02 21:49:45.995448,False,P8S4_171002_355_A01,639689311,21,639689311,...,nucleus_present,637.0,31.0,168.0,9/25/2017,lindsayn,7,-30.0,,1400
1103,11.645717,Chrna2-Cre_OE25;Ai14-351067.03.02.01,2017-06-10 07:00:00,0.672,2017-10-02 22:35:15.122584,False,P8S4_171002_356_A01,639708328,21,639708328,...,nucleus_present,578.0,80.0,95.0,9/25/2017,lindsayn,7,-30.0,,3000
1104,6.906475,Chrna2-Cre_OE25;Ai14-351067.03.02.02,2017-06-10 07:00:00,0.319,2017-10-02 23:05:19.216595,True,P8S4_171002_357_A01,639719157,21,639719157,...,nucleus_absent,826.0,44.0,325.0,9/25/2017,lindsayn,7,-30.0,,30
1106,11.125648,Gad2-IRES-Cre;Ai14-350672.03.01.01,2017-06-10 07:00:00,0.751,2017-10-02 21:49:56.508144,False,P9S4_171002_402_A01,639689999,21,639689999,...,nucleus_present,437.0,17.0,111.0,9/25/2017,lisak,6,-23.9,,1000
1107,3.784308,Gad2-IRES-Cre;Ai14-350672.04.01.01,2017-06-10 07:00:00,0.334,2017-10-02 22:25:28.902676,True,P9S4_171002_403_A01,639706236,21,639706236,...,nucleus_absent,385.0,27.0,80.0,9/25/2017,lisak,6,-23.2,,1000
