# Read in processed RNA-seq, save into pickle.

In [1]:
import os

# Root of the downloaded single-cell datasets. The location can be overridden with the
# SC_DATA_ROOT environment variable; when it is unset, the original cluster path is used.
# os.path.join(..., '') guarantees a trailing separator, which later cells rely on when
# they build paths by plain string concatenation (e.g. root_directory + 'GSE52564/').
root_directory = os.path.join(
    os.environ.get('SC_DATA_ROOT', '/cellar/users/hsher/Data/ideker/sc/'), ''
)

# Show what is available under the data root.
os.listdir(root_directory)

['GPL17021.soft',
 'SRP061902',
 'GPL13112.soft',
 'GSE67835',
 'GSE71585-GPL17021_series_matrix.txt',
 'GSE52564',
 'GPL15520.soft',
 'GSE67835-GPL18573_series_matrix.txt',
 'GSE71585-GPL13112_series_matrix.txt',
 'SRP033200',
 'GPL18573.soft',
 'GSE71585',
 'pickle',
 'GSE73721-GPL18573_series_matrix.txt',
 'GPL16417.soft',
 'GSE60361',
 'GSE52564_series_matrix.txt',
 'GSE71585-GPL16417_series_matrix.txt',
 'GSE60361_series_matrix.txt',
 'GSE67835-GPL15520_series_matrix.txt',
 'GSE73721-GPL19057_series_matrix.txt',
 'SRP064454',
 'SRR1033783_2.fastq.gz',
 'GSE73721',
 'SRP057196',
 'GPL19057.soft',
 'SRP045452']

# start from raw data.
1. step one: untar/gunzip all files (manually)
2. step two: read in all data into scanpy
3. follow single cell protocol
- filter read per cell
- filter high mitochondrial content
- normalize
4. concat into the same anndata per species
- visualize to check
- calculate cluster (Adjusted Rand Index)
5. map mouse genome to human and revisualize per subspace
- are they comparable

output: scanpy: anndata

problems: 
1. different species problem
- first separate the human and mouse data
    - combining data from different datasets simply by normalization will not eliminate the batch effect? (I wonder why the paper does it.)

validation:
- visualize cell cluster: is the clustering consistent with data
    - check batch effect: see if there's clustering per dataset

# Reading Dataset!
## Dataset 1

In [2]:
# Dataset 1 (GSE52564): collect the per-sample spreadsheets, i.e. every entry in the
# dataset directory whose name contains 'xls'.
data_1 = root_directory + 'GSE52564/'
filelist = [entry for entry in os.listdir(data_1) if 'xls' in entry]
filelist

['GSM1269919_WC3.xls',
 'GSM1269903_Astrocyte1.xls',
 'GSM1269918_WC2.xls',
 'GSM1269910_NFO2.xls',
 'GSM1269907_OPC1.xls',
 'GSM1269906_Neuron2.xls',
 'GSM1269905_Neuron1.xls',
 'GSM1269916_Endothelial2.xls',
 'GSM1269913_Microglia1.xls',
 'GSM1269917_WC1.xls',
 'GSM1269915_Endothelial1.xls',
 'GSM1269914_Microglia2.xls',
 'GSM1269912_MO2.xls',
 'GSM1269909_NFO1.xls',
 'GSM1269908_OPC2.xls',
 'GSM1269911_MO1.xls',
 'GSM1269904_Astrocyte2.xls']

In [3]:
import pandas as pd

# Load every spreadsheet with its first column as the row index, then place the
# tables side by side (axis=1) while keeping each file's own column labels.
dflist = [pd.read_excel(data_1 + xls_name, index_col=0) for xls_name in filelist]
all_df = pd.concat(dflist, axis=1, ignore_index=False)

In [4]:
# Preview the first rows of the combined GSE52564 table.
all_df.head()

Unnamed: 0_level_0,WB3,A1,WB2,GC2,OPC1,N2,N1,Endo2,MGL1,WB1,Endo1,MGL2,MOG2,GC1,OPC2,MOG1,A2
gene.symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0610005C13Rik,0.1,0.200201,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
0610007C21Rik,42.267463,47.529076,32.229991,68.258139,57.415641,34.770467,33.378741,67.321897,75.383231,35.938745,59.257508,101.407792,52.536814,46.986842,84.325226,54.736686,41.781448
0610007L01Rik,9.941628,10.075592,9.339216,17.118498,12.323579,12.362272,11.723832,13.021746,20.214993,10.277692,13.904486,16.412027,7.248484,10.841421,14.148773,4.96029,11.83084
0610007P08Rik,3.723082,7.917463,2.943898,3.989445,6.060253,2.801128,1.278118,4.89565,0.519796,3.377649,4.525034,0.37521,3.264023,8.097296,2.945927,2.475218,6.419092
0610007P14Rik,40.772129,49.567579,35.67537,72.237064,43.987477,23.64729,25.025453,11.275608,8.628644,40.701673,9.75984,11.996426,66.573948,63.73033,64.589854,60.966421,47.814948


In [5]:
# Dimensions of the combined table as (rows, columns).
all_df.shape

(22462, 17)

In [6]:
# Column labels of the combined table.
all_df.columns

Index(['WB3', 'A1', 'WB2', 'GC2', 'OPC1', 'N2', 'N1', 'Endo2', 'MGL1', 'WB1',
       'Endo1', 'MGL2', 'MOG2', 'GC1', 'OPC2', 'MOG1', 'A2'],
      dtype='object')

In [7]:
# Column totals (one per sample). They differ between samples (about 2.1e5 to 3.9e5 in
# the output), so the columns are not scaled to a common total.
# TODO: confirm the unit of these values (possibly FPKM).
all_df.sum(axis = 0)

WB3      230264.349011
A1       218449.186997
WB2      230166.432822
GC2      271592.272830
OPC1     240692.287928
N2       218287.490066
N1       205621.040949
Endo2    314334.363371
MGL1     367933.201803
WB1      231959.317925
Endo1    294964.022667
MGL2     389838.008769
MOG2     298078.528164
GC1      263362.938329
OPC2     248809.707267
MOG1     328210.333156
A2       216786.270158
dtype: float64

In [8]:
# Persist the combined GSE52564 table in the 'pickle/' folder under root_directory.
all_df.to_pickle(root_directory+'pickle/GSE52564.pickle')

## Dataset 2
age and sex in data

In [9]:
# Dataset 2 (GSE60361): tab-separated expression table; the header row supplies the
# column labels and the first column supplies the row index.
data_2 = root_directory+'GSE60361/GSE60361_C1-3005-Expression.txt'
all_data_2 = pd.read_csv(data_2, sep = '\t', header = 0, index_col = 0)
# TODO: confirm these values are read counts (the preview shows integers).
# Open question: how to label the cells. The paper assigns labels from its clustering,
# but the clustering result is not provided with the data.

In [10]:
# Preview the first rows of the GSE60361 expression table.
all_data_2.head()

Unnamed: 0_level_0,1772071015_C02,1772071017_G12,1772071017_A05,1772071014_B06,1772067065_H06,1772071017_E02,1772067065_B07,1772067060_B09,1772071014_E04,1772071015_D04,...,1772066110_D12,1772071017_A07,1772063071_G10,1772058148_C03,1772063061_D09,1772067059_B04,1772066097_D04,1772063068_D01,1772066098_A12,1772058148_F03
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Tspan12,0,0,0,3,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Tshz1,3,1,0,2,2,2,2,1,0,2,...,0,0,0,0,0,0,0,0,0,1
Fnbp1l,3,1,6,4,1,2,1,0,5,2,...,0,0,0,0,0,0,0,0,0,0
Adamts15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cldn12,1,1,1,0,0,0,0,0,2,3,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Save under the dataset's accession, GSE60361. The file name previously had two digits
# transposed ('GSE60631'), which matched neither the dataset directory nor the source
# file name; any loader that still points at the old name must be updated.
all_data_2.to_pickle(root_directory+'pickle/GSE60361.pickle')

## Dataset 3
[Original Paper](https://www.pnas.org/content/112/23/7285/tab-figures-data)
We used two complementary approaches to classify each single cell to one of the major brain cell types: astrocytes, oligodendrocytes, oligodendrocyte precursor cells (OPCs), neurons, microglia, and vascular cells. First, we used an unbiased approach to sort all 466 individual cells into distinct groups defined by the entirety of their molecular signatures as described in SI Appendix, SI Methods. This approach resulted in the identification of 10 distinct cell groups (unbiased groups) (SI Appendix, Fig. S2). A landscape of all of the cells colored by cell cluster, can be seen in Fig. 1A and SI Appendix, Table S2. Clusters 1–8 consist of adult brain cells, whereas clusters 9 and 10 consist of fetal brain cells.
Based upon the presence of typically associated cell-type–specific markers in the top 20 enriched genes for each unbiased group (SI Appendix, Table S3) we were able to easily identify all but two of the clusters as specific groups consisting of OPCs, oligodendrocytes, astrocytes, microglia, neurons, endothelial cells, replicating neuronal progenitors, and quiescent newly born neurons. While these groups contained relatively uniform identities, the enriched genes of two remaining clusters contained genes characteristic of more than one distinct cell type; the interpretation of this finding is discussed below.

In [12]:
# Dataset 3 (GSE67835): list the per-cell files (names containing 'csv') and count them.
data_3 = root_directory + 'GSE67835/'
filelist = [entry for entry in os.listdir(data_3) if 'csv' in entry]
len(filelist)

466

In [13]:
# One file per cell; the file name identifies the cell and is used as the column label.
# The files carry a .csv extension but their content is tab-separated, hence sep='\t'.
# The author notes that many of these are healthy cortex cells.
# NOTE: the row labels come through with a trailing space (e.g. 'A1BG '), as the index
# output further down shows.
df_list_3 = [
    pd.read_csv(data_3 + cell_file, sep='\t', header=None, names=[cell_file])
    for cell_file in filelist
]
all_data_3 = pd.concat(df_list_3, axis=1, ignore_index=False)


In [14]:
# Preview the first rows of the combined GSE67835 table (one column per cell file).
all_data_3.head()

Unnamed: 0,GSM1658121_nochipID5.C44.csv,GSM1658106_nochipID3.C72.csv,GSM1658143_nochipID8.C29.csv,GSM1658023_nochipID2.C22.csv,GSM1658155_nochipID8.C49.csv,GSM1657957_1772078236.C70.csv,GSM1658213_nochipID10.C37.csv,GSM1658330_nochipID13.C72.csv,GSM1657895_1772078217.C61.csv,GSM1657873_1772078217.C06.csv,...,GSM1658099_nochipID3.C54.csv,GSM1658229_nochipID11.C02.csv,GSM1658123_nochipID5.C54.csv,GSM1658118_nochipID5.C18.csv,GSM1657871_1772078217.C03.csv,GSM1658358_nochipID4.C63.csv,GSM1658094_nochipID3.C37.csv,GSM1658012_nochipID2.C10.csv,GSM1658328_nochipID13.C67.csv,GSM1658240_nochipID11.C22.csv
1/2-SBSRNA4,0,0,0,0,0,0,0,0,2,0,...,0,72,0,0,0,0,0,0,0,0
A1BG,0,0,0,0,7,0,0,0,0,0,...,0,0,15,0,0,0,0,0,0,0
A1BG-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1CF,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2LD1,0,0,0,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Column totals per cell file; the totals are integers (dtype int64), consistent with
# raw counts.
all_data_3.sum()

GSM1658121_nochipID5.C44.csv     1056873
GSM1658106_nochipID3.C72.csv     2138110
GSM1658143_nochipID8.C29.csv     2490339
GSM1658023_nochipID2.C22.csv     3101242
GSM1658155_nochipID8.C49.csv     1972138
                                  ...   
GSM1658358_nochipID4.C63.csv     1786056
GSM1658094_nochipID3.C37.csv     2098165
GSM1658012_nochipID2.C10.csv     1642874
GSM1658328_nochipID13.C67.csv    1735585
GSM1658240_nochipID11.C22.csv    1645206
Length: 466, dtype: int64

In [16]:
# Row labels. The last three rows are not genes: 'no_feature ', 'ambiguous ' and
# 'alignment_not_unique '. Note that every label shown carries a trailing space.
all_data_3.index

Index(['1/2-SBSRNA4 ', 'A1BG ', 'A1BG-AS1 ', 'A1CF ', 'A2LD1 ', 'A2M ',
       'A2ML1 ', 'A2MP1 ', 'A4GALT ', 'A4GNT ',
       ...
       'ZXDC ', 'ZYG11A ', 'ZYG11B ', 'ZYX ', 'ZZEF1 ', 'ZZZ3 ', 'tAKR ',
       'no_feature ', 'ambiguous ', 'alignment_not_unique '],
      dtype='object', length=22088)

In [17]:
# Persist the combined GSE67835 table in the 'pickle/' folder under root_directory.
all_data_3.to_pickle(root_directory+'pickle/GSE67835.pickle')

## Dataset 4


In [18]:
# Dataset 4 (GSE71585): paths to the RefSeq count matrix and the clustering results.
data_4 = root_directory + 'GSE71585/GSE71585_RefSeq_counts.csv'
data_4_cluster = root_directory + 'GSE71585/GSE71585_Clustering_Results.csv'

# Count matrix: the header row gives the column labels, the first column the row index.
all_data_4 = pd.read_csv(data_4, index_col=0, header=0)

In [19]:
# Preview the first rows. Some values are fractional (e.g. 2287.02) even though the
# file is named "counts" -- TODO: find out why (presumably estimated counts; confirm).
all_data_4.head()

Unnamed: 0_level_0,Calb2_tdTpositive_cell_1,Calb2_tdTpositive_cell_2,Calb2_tdTpositive_cell_3,Calb2_tdTpositive_cell_4,Calb2_tdTpositive_cell_5,Calb2_tdTpositive_cell_6,Calb2_tdTpositive_cell_7,Calb2_tdTpositive_cell_8,Calb2_tdTpositive_cell_9,Calb2_tdTpositive_cell_10,...,CAV_VISp_Contra_tdTpos_cell_5,Rbp4_CTX_10pg_1,Rbp4_CTX_10pg_2,Rbp4_CTX_10pg_3,Rbp4_CTX_10pg_4,Rbp4_CTX_10pg_5,Rbp4_CTX_10pg_6,Rbp4_CTX_250ng_1,Rbp4_CTX_250ng_2,Trib2_CTX_250ng_1
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0610005C13Rik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,9.0,5.0
0610007C21Rik,992.0,2287.02,491.78,1932.0,1425.0,130.03,2110.02,955.0,326.0,933.0,...,352.06,401.0,1750.99,1463.0,3439.0,502.95,1486.36,956.4,1054.94,944.03
0610007L01Rik,2.57,177.0,0.0,1.0,2.0,3.0,3040.99,101.0,0.0,1042.0,...,367.0,242.0,1.0,69.0,0.0,53.0,40.15,1251.0,1356.2,1147.0
0610007N19Rik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,704.01,0.0,27.0,37.0,35.0
0610007P08Rik,0.0,0.0,0.0,0.0,0.0,0.0,17.03,0.0,0.0,0.0,...,3.0,11.0,0.0,0.0,0.0,0.0,0.0,574.35,570.86,590.84


In [20]:
# Load the GSE71585 clustering-results table, indexed by its first column
# (sample_title in the output), and preview the first rows.
all_data_4_cluster = pd.read_csv(data_4_cluster, index_col = 0)
all_data_4_cluster.head()

Unnamed: 0_level_0,mouse_line,cre_driver_1,cre_driver_2,cre_reporter,dissection,tdTomato,pass_qc_checks,broad_type,core_intermediate,primary_type,secondary_type,aibs_vignette_id
sample_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Calb2_tdTpositive_cell_46,Calb2,Calb2-IRES-Cre,,RCL-tdT (Ai14),upper,positive,Y,Astrocyte,core,Astro Gja1,,A1631_VU
Calb2_tdTpositive_cell_48,Calb2,Calb2-IRES-Cre,,RCL-tdT (Ai14),upper,positive,Y,Astrocyte,core,Astro Gja1,,A1633_VU
Calb2_tdTpositive_cell_50,Calb2,Calb2-IRES-Cre,,RCL-tdT (Ai14),lower,positive,Y,Astrocyte,core,Astro Gja1,,A1635_VL
Calb2_tdTpositive_cell_53,Calb2,Calb2-IRES-Cre,,RCL-tdT (Ai14),lower,positive,Y,Astrocyte,core,Astro Gja1,,A1638_VL
Calb2_tdTpositive_cell_58,Calb2,Calb2-IRES-Cre,,RCL-tdT (Ai14),lower,positive,Y,Astrocyte,core,Astro Gja1,,A1643_VL


In [21]:
# Column totals per cell. They are non-integer and vary widely (about 3e6 to 2.7e7 in
# the rows shown). TODO: work out what unit these values are in.
all_data_4.sum()

Calb2_tdTpositive_cell_1    14030202.14
Calb2_tdTpositive_cell_2     4803366.44
Calb2_tdTpositive_cell_3     3073418.37
Calb2_tdTpositive_cell_4    12192899.59
Calb2_tdTpositive_cell_5    13477232.54
                               ...     
Rbp4_CTX_10pg_5             13166764.03
Rbp4_CTX_10pg_6             10340431.59
Rbp4_CTX_250ng_1            25486493.33
Rbp4_CTX_250ng_2            26684731.91
Trib2_CTX_250ng_1           24775226.25
Length: 1809, dtype: float64

In [22]:
# Persist the GSE71585 count matrix in the 'pickle/' folder under root_directory.
# Only the count matrix is saved here; the clustering table is not.
all_data_4.to_pickle(root_directory+'pickle/GSE71585.pickle')

## Dataset 5


In [23]:
# Dataset 5 (GSE73721): combined human and mouse table; the header row gives the column
# labels and the first column gives the row index.
data_5 = root_directory+'GSE73721/GSE73721_Human_and_mouse_table.csv'
all_data_5 = pd.read_csv(data_5, header = 0, index_col = 0)
# TODO: confirm whether these values are normalized counts.

In [24]:
# Preview the first rows of the GSE73721 table (human and mouse samples as columns).
all_data_5.head()

Unnamed: 0_level_0,59yo tumor periphery astro,59yo tumor core astro,64yo tumor core astro,65yo tumor core astro,21yo Hippocampus astro,22yo Hippocampus astro,53yo A Hippocampus astro,53yo B Hippocampus astro,Fetal ctx 1 astro,Fetal ctx 2 astro,...,47 yo ctx endo,13yo ctx endo,45yo whole cortex,63yo whole cortex,25yo whole cortex,53yo whole cortex,Mouse 1 month astro,Mouse 4 month astro,Mouse 7 month astro,Mouse 9 month astro
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aamp,0.1,1.1,0.7,1.5,1.4,3.4,2.1,0.7,2.8,0.1,...,0.2,1.0,1.2,0.5,1.2,1.4,5.3,5.5,9.6,4.4
Abat,11.0,17.2,12.3,37.0,16.1,12.3,10.2,9.4,8.5,17.4,...,1.2,2.5,11.6,6.7,14.5,9.7,54.9,50.9,70.3,56.8
Abca1,2.1,35.2,11.2,27.5,8.8,10.1,6.5,4.8,0.9,1.5,...,1.3,1.5,1.1,1.6,1.7,1.8,9.6,6.4,13.6,9.0
Abca2,0.5,0.6,0.6,0.3,7.7,0.2,8.2,8.5,1.7,0.4,...,0.1,0.4,0.9,0.1,0.5,0.2,3.2,5.9,6.4,5.4
Abcc5,2.1,3.2,2.3,5.7,3.7,5.7,4.1,5.1,2.4,7.6,...,0.6,1.1,3.8,1.7,5.9,1.5,5.3,2.3,10.3,3.0


In [25]:
# Column totals per sample. TODO: confirm the unit (possibly FPKM).
# Author's note: samples are from epilepsy patients -- TODO: confirm against the GEO record.
all_data_5.sum()

59yo tumor periphery astro     50527.1
59yo tumor core astro         103290.4
64yo tumor core astro          73933.8
65yo tumor core astro          74880.2
21yo Hippocampus astro         94227.1
22yo Hippocampus astro         85454.0
53yo A Hippocampus astro       98480.4
53yo B Hippocampus astro       79043.4
Fetal ctx 1 astro              61928.6
Fetal ctx 2 astro              84277.1
Fetal ctx 3 astro              74515.0
Fetal ctx 4 astro              98840.4
Fetal ctx 5 astro              58122.7
Fetal ctx 6 astro              73975.4
8yo ctx astro                  89389.9
13yo ctx astro                 85544.1
16yo ctx astro                 86464.6
21yo ctx astro                 81246.4
22yo ctx astro                 99001.4
35yo ctx astro                 96832.6
47yo ctx astro                 71864.7
51yo ctx astro                 90287.2
53yo ctx astro                 74999.9
60yo ctx astro                 65163.4
63yo ctx 1 astro               80508.2
63yo ctx 2 astro         

In [26]:
# Persist the GSE73721 table in the 'pickle/' folder under root_directory.
all_data_5.to_pickle(root_directory+'pickle/GSE73721.pickle')

# Result: Data characteristic
The datasets were processed by different pipelines.
Some "cell type" labels are data-driven (not based on immune markers/FACS).