## TODO:
- make column1 of the GSM DGE matrix files for the single cell
data match column1 of the cell to cluster assignment file
- transpose the GSM DGE matrix files so that the gene names are now column 1
- join replicates (samp1+samp2, 1dpa1+1dpa2, 2dpa1+2dpa2, 4dpa1+4dpa2) on the gene name column
- join all files together

In [1]:
import pandas as pd
import os

### Initialize samples and files
Below are the sample names and the file names as arrays to be used later in the script.

In [None]:
samples = ['samp1',
           'samp2',
           '1dpa1',
           '1dpa2',
           '2dpa1',
           '2dpa2',
           '4dpa1',
           '4dpa2']
files = ['GSM4095393_samp1_DGEmatrix.csv',
         'GSM4095394_samp2_DGEmatrix.csv',
         'GSM4095395_1dpa1_DGEmatrix.csv',
         'GSM4095396_1dpa2_DGEmatrix.csv',
         'GSM4095397_2dpa1_DGEmatrix.csv',
         'GSM4095398_2dpa2_DGEmatrix.csv',
         'GSM4095399_4dpa1_DGEmatrix.csv',
         'GSM4095400_4dpa2_DGEmatrix.csv']

### `clean_matrix` function
This funtion iterates over the rows of the dataframe and parses column 1 to get rid of
the "-1" at the end, leaving the 16bp barcode. Then, it concatenates the sample identifier to the beginning.
Next, outside the for loop, the dataframe is transposed. The transposition sets the index [0, 1, 2, ...]
as column names, and the gene names as the indices. The next 3 lines set row 1 as the header
so that the barcodes become the column names. Then, the last 2 lines before returning the final
cleaned matrix resets the indices so that the gene names become their own column called "genes" and
the indices are back to [0, 1, 2, ...].

In [None]:
def clean_matrix(exp, sample):
    for index, row in exp.iterrows():
        col1 = row['Barcodes']
        col1 = col1[:16]
        col1 = sample+col1
        exp.at[index,'Barcodes'] = col1
    exp = exp.transpose()
    new_header = exp.iloc[0] #grab the first row for the header
    exp = exp[1:] #take the data less the header row
    exp.columns = new_header
    exp.reset_index(inplace=True)
    exp = exp.rename(columns = {'index':'genes'})
    return exp

### PreInjury samples
The two preinjury files are processed and joined, then exported to a `csv.gz` file.
The two matrices are joined in such a way that if a gene name is in one sample but
not the other, the columns for that sample are filled with a `NaN` value.

In [None]:
file = files[0]
sample = samples[0]
filepath = os.path.join('.', 'Hou_raw_matrices', file)
samp1 = pd.read_csv(filepath)
samp1 = clean_matrix(samp1, sample)

file = files[1]
sample = samples[1]
filepath = os.path.join('.', 'Hou_raw_matrices', file)
samp2 = pd.read_csv(filepath)
samp2 = clean_matrix(samp2, sample)

preinjury = pd.merge(samp1, samp2, left_on='genes', right_on='genes', how='outer')
merged = os.path.join('.', 'Hou_expression_matrices', 'preinjury_merged.csv.gz')
preinjury.to_csv(merged,
           index=False,
           compression="gzip")

### 1dpa samples
The two 1dpa files are processed and joined, then exported to a `csv.gz` file.
The two matrices are joined in such a way that if a gene name is in one sample but
not the other, the columns for that sample are filled with a `NaN` value.

In [None]:
file = files[2]
sample = samples[2]
filepath = os.path.join('.', 'Hou_raw_matrices', file)
samp1 = pd.read_csv(filepath)
samp1 = clean_matrix(samp1, sample)

file = files[3]
sample = samples[3]
filepath = os.path.join('.', 'Hou_raw_matrices', file)
samp2 = pd.read_csv(filepath)
samp2 = clean_matrix(samp2, sample)

onedpa = pd.merge(samp1, samp2, left_on='genes', right_on='genes', how='outer')
merged = os.path.join('.', 'Hou_expression_matrices', '_1dpa_merged.csv.gz')
onedpa.to_csv(merged,
           index=False,
           compression="gzip")

### 2dpa samples
The two 2dpa files are processed and joined, then exported to a `csv.gz` file.
The two matrices are joined in such a way that if a gene name is in one sample but
not the other, the columns for that sample are filled with a `NaN` value.

In [None]:
file = files[4]
sample = samples[4]
filepath = os.path.join('.', 'Hou_raw_matrices', file)
samp1 = pd.read_csv(filepath)
samp1 = clean_matrix(samp1, sample)

file = files[5]
sample = samples[5]
filepath = os.path.join('.', 'Hou_raw_matrices', file)
samp2 = pd.read_csv(filepath)
samp2 = clean_matrix(samp2, sample)

twodpa = pd.merge(samp1, samp2, left_on='genes', right_on='genes', how='outer')
merged = os.path.join('.', 'Hou_expression_matrices', '_2dpa_merged.csv.gz')
twodpa.to_csv(merged,
           index=False,
           compression="gzip")

### 4dpa samples
The two 4dpa files are processed and joined, then exported to a `csv.gz` file.
The two matrices are joined in such a way that if a gene name is in one sample but
not the other, the columns for that sample are filled with a `NaN` value.

In [None]:
file = files[6]
sample = samples[6]
filepath = os.path.join('.', 'Hou_raw_matrices', file)
samp1 = pd.read_csv(filepath)
samp1 = clean_matrix(samp1, sample)

file = files[7]
sample = samples[7]
filepath = os.path.join('.', 'Hou_raw_matrices', file)
samp2 = pd.read_csv(filepath)
samp2 = clean_matrix(samp2, sample)

fourdpa = pd.merge(samp1, samp2, left_on='genes', right_on='genes', how='outer')
merged = os.path.join('.', 'Hou_expression_matrices', '_4dpa_merged.csv.gz')
fourdpa.to_csv(merged,
           index=False,
           compression="gzip")


### Merge all samples

First preinjury and 1dpa are merged. Then 2dpa and 4dpa are merged. Then 0+1 and 2+4 are merged making a final merged dataframe of all samples that is then exported to a `csv.gz` file. 

In [None]:
_0and1 = pd.merge(preinjury, onedpa, left_on='genes', right_on='genes', how='outer')
_2and4 = pd.merge(twodpa, fourdpa, left_on='genes', right_on='genes', how='outer')
full_merge = pd.merge(_0and1, _2and4, left_on='genes', right_on='genes', how='outer')
merged = os.path.join('.', 'Hou_expression_matrices', 'full_merged.csv.gz')
full_merge.to_csv(merged,
           index=False,
           compression="gzip")

### Modifying `tximport` output
We will be using the gene counts file from `tximport`, and it needs to match the formatting of the files above. The file is a tab delimited `.txt` file. The first code block reads in the file and gives the first column the same name as the first colummn in the expression matrix from above. In the second code block, the file is made into a `.csv.gz` to match the files created above. 

In [None]:
filepath = os.path.join('.', 'Danio_rerio.GRCz10.91.gene.counts.txt')
gene_counts = pd.read_csv(filepath, sep='\t')
gene_counts = gene_counts.rename(columns = {'Unnamed: 0':'genes'})
gene_counts.iloc[:3, :4]

In [None]:
gene_count_file = os.path.join('.', 'Hartig_gene_counts.csv.gz')
gene_counts.to_csv(gene_count_file,
           index=False,
           compression="gzip")

### Making the Hou count file match the Hou design file
In order for RNA Sieve to run, the count file has to match the design file to make the anndata object. It was discovered that the design file (`GSE137971_cells.csv.gz`) is missing 237 samples from the count file (`Hou_shared_gene_counts.csv.gz`). 

It should further be noted that the final gene count files that are being used in the RNA Sieve notebook were created at the command line with the following commands: 

```
zcat Hartig_gene_counts.csv.gz Hou_expression_matrices/full_merged.csv.gz | cut -f 1 -d ',' |perl -ne "s/gene/#gene/;print" | sort | uniq -c | perl -ane "if(@F[0]>1){print \"@F[1]\n\"}" | perl -ne "s/#ge/ge/;print" > shared_genes.txt

zcat Hartig_gene_counts.csv.gz | perl -ne "s/,/\t/g;print" | join shared_genes.txt - | perl -ne "s/ /,/g;print" > Hartig_shared_gene_counts.csv

zcat Hou_expression_matrices/full_merged.csv.gz | perl -ne "s/ge/#ge/;s/,/\t/g;print" | sort | perl -ne "s/#ge/ge/;print" | join shared_genes.txt - | perl -ne "s/ /,/g;print" > Hou_shared_gene_counts.csv
```

In [2]:
# adding a label to the first column & sorting to see if it will fix the anndata mismatching issue
design = pd.read_csv("GSE137971_cells.csv.gz", compression="gzip")
design = design.rename(columns = {'Unnamed: 0':'samples'})
design = design.sort_values(design.columns[0])
design

Unnamed: 0,samples,nCount_RNA,nFeature_RNA,Stage,S.Score,G2M.Score,major.cl,manualPhase,UMAP_1,UMAP_2
0,1dpa1AAACCTGAGGGAAACA,6168,1292,1dpa,-0.045273,0.156326,Intermediate Epithelial,Unselected,-4.033391,5.024199
1,1dpa1AAACCTGCACACTGCG,6336,1388,1dpa,-0.034187,0.274646,Intermediate Epithelial,Unselected,-6.115250,1.632328
2,1dpa1AAACGGGGTATGAATG,9770,2034,1dpa,-0.050019,0.296948,Mesenchymal,Unselected,5.821854,-8.472561
3,1dpa1AAAGATGAGCGCCTCA,8086,1782,1dpa,-0.041866,0.239046,Basal Epithelial,Unselected,7.921741,5.167769
4,1dpa1AAAGTAGAGGACACCA,5539,1382,1dpa,0.009987,0.036322,Intermediate Epithelial,Unselected,-6.163578,1.299275
...,...,...,...,...,...,...,...,...,...,...
18546,samp2TTTGGTTCATTCACTT,6871,1322,Preinjury,-0.004101,-0.008782,Intermediate Epithelial,G1,-1.582347,0.674932
18547,samp2TTTGTCACACCGGAAA,4421,762,Preinjury,-0.013394,-0.065488,Intermediate Epithelial,G1,-4.194117,-0.139754
18548,samp2TTTGTCACAGACAAAT,20003,2280,Preinjury,0.232262,0.082392,Mesenchymal,S,4.920971,-8.737699
18549,samp2TTTGTCAGTTATGTGC,13829,1791,Preinjury,-0.019583,0.023622,Intermediate Epithelial,Unselected,-2.348434,5.227718


In [3]:
design_file = os.path.join('.', 'sorted_labeled_GSE137971_cells.csv.gz')
design.to_csv(design_file,
           index=False,
           compression="gzip")

In [4]:
# grab the first column of the design file to use as reference 
design = design.iloc[:,:1]

In [5]:
# read in the count file
counts = pd.read_csv("Hou_shared_gene_counts.csv.gz", compression="gzip")

In [6]:
# transpose the count file and set the header line and the index appropriately 
counts = counts.transpose()
new_header = counts.iloc[0] #grab the first row for the header
counts = counts[1:] #take the data less the header row
counts.columns = new_header
counts.reset_index(inplace=True)
counts = counts.rename(columns = {'index':'samples'})

In [7]:
# performing a merge on just the sample names from the design file and the full count file 
# will result in a dataframe that only contains the count daata from the samples that exist in the design file
joined = pd.merge(counts, design, left_on='samples', right_on='samples')
# sort the joined to match the design 
joined = joined.sort_values(joined.columns[0])
joined = joined.iloc[:, :-9]
joined

Unnamed: 0,samples,ENSDARG00000000001,ENSDARG00000000002,ENSDARG00000000018,ENSDARG00000000019,ENSDARG00000000068,ENSDARG00000000069,ENSDARG00000000086,ENSDARG00000000103,ENSDARG00000000142,...,ENSDARG00000109171,ENSDARG00000109173,ENSDARG00000109175,ENSDARG00000109179,ENSDARG00000109181,ENSDARG00000109183,ENSDARG00000109184,ENSDARG00000109185,ENSDARG00000109187,ENSDARG00000109188
4253,1dpa1AAACCTGAGGGAAACA,0,0,0,0,0,0,0,0,0,...,0,0,,,0,,0,0,0,0
4254,1dpa1AAACCTGCACACTGCG,0,0,0,0,0,0,0,0,0,...,0,0,,,0,,0,0,0,0
4255,1dpa1AAACGGGGTATGAATG,0,1,0,0,0,0,0,0,0,...,0,0,,,0,,0,0,0,0
4256,1dpa1AAAGATGAGCGCCTCA,0,0,0,0,0,5,0,0,0,...,0,0,,,0,,0,0,0,0
4257,1dpa1AAAGTAGAGGACACCA,0,0,0,0,0,0,0,0,0,...,0,0,,,0,,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4248,samp2TTTGGTTCATTCACTT,0,0,0,0,0,0,0,0,0,...,0,0,,,0,,0,0,0,0
4249,samp2TTTGTCACACCGGAAA,0,0,0,0,0,0,0,0,0,...,0,0,,,0,,0,0,0,0
4250,samp2TTTGTCACAGACAAAT,0,0,0,1,0,0,0,0,0,...,0,0,,,0,,0,0,0,0
4251,samp2TTTGTCAGTTATGTGC,0,0,0,1,0,0,0,0,0,...,0,0,,,0,,0,0,0,0


In [8]:
# export the count file that now matches the design file without transposing
joined_file = os.path.join('.', 'sorted_long_matched_Hou_shared_gene_counts.csv.gz')
joined.to_csv(joined_file,
           index=False,
           compression="gzip")

In [9]:
# transpose the final joined dataframe and set the header line and the index appropriately so that the input will work with rna sieve
joined = joined.transpose()
new_header = joined.iloc[0] #grab the first row for the header
joined = joined[1:] #take the data less the header row
joined.columns = new_header
joined.reset_index(inplace=True)
joined = joined.rename(columns = {'index':'genes'})
joined

samples,genes,1dpa1AAACCTGAGGGAAACA,1dpa1AAACCTGCACACTGCG,1dpa1AAACGGGGTATGAATG,1dpa1AAAGATGAGCGCCTCA,1dpa1AAAGTAGAGGACACCA,1dpa1AAAGTAGGTGTCGCTG,1dpa1AACGTTGGTGACGCCT,1dpa1AACTTTCAGCATGGCA,1dpa1AAGACCTGTTCGTCTC,...,samp2TTTCCTCGTCTAGTCA,samp2TTTCCTCGTTCATGGT,samp2TTTCCTCTCAGTTAGC,samp2TTTGCGCCACGCCAGT,samp2TTTGCGCTCACCGTAA,samp2TTTGGTTCATTCACTT,samp2TTTGTCACACCGGAAA,samp2TTTGTCACAGACAAAT,samp2TTTGTCAGTTATGTGC,samp2TTTGTCAGTTGTGGCC
0,ENSDARG00000000001,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,ENSDARG00000000002,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ENSDARG00000000018,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ENSDARG00000000019,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
4,ENSDARG00000000068,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24319,ENSDARG00000109183,,,,,,,,,,...,,,,,,,,,,
24320,ENSDARG00000109184,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24321,ENSDARG00000109185,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24322,ENSDARG00000109187,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# export the count file that now matches the design file
joined_file = os.path.join('.', 'sorted_matched_Hou_shared_gene_counts.csv.gz')
joined.to_csv(joined_file,
           index=False,
           compression="gzip")

In [2]:
# fill NaN values in both count files
hartig_count = pd.read_csv("Hartig_shared_gene_counts.csv.gz", compression="gzip")
hartig_count = hartig_count.fillna(0)
hartig_file = os.path.join('.', 'fill_Hartig_shared_gene_counts.csv.gz')
hartig_count.to_csv(hartig_file,
           index=False,
           compression="gzip")

In [3]:
hou_count = pd.read_csv("sorted_matched_Hou_shared_gene_counts.csv.gz", compression="gzip")
hou_count = hou_count.fillna(0)
hou_file = os.path.join('.', 'fill_sorted_matched_Hou_shared_gene_counts.csv.gz')
hou_count.to_csv(hou_file,
           index=False,
           compression="gzip")