In [1]:
import os
from pathlib import Path

import pandas as pd

# GEO series record: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE138711

In [3]:
# Load the run metadata table (one row per sequencing run; see the `Run` column).
# NOTE(review): the variable is called GSE45148, but the GEO link in the import
# cell points to GSE138711 -- presumably a leftover name; confirm before renaming.
run_table_path = 'IGF2BP3_RIP.txt'
GSE45148 = pd.read_csv(run_table_path)

In [6]:
# Count how many runs belong to each assay type.
GSE45148.loc[:, 'Assay Type'].value_counts()

RNA-Seq    71
RIP-Seq    36
Name: Assay Type, dtype: int64

In [8]:
# Inspect every metadata field of the first RIP-Seq run.
rip_rows = GSE45148[GSE45148['Assay Type'].eq('RIP-Seq')]
rip_rows.iloc[0]

Run                                                        SRR10257798
Assay Type                                                     RIP-Seq
AvgSpotLen                                                         102
Bases                                                       1612548396
BioProject                                                 PRJNA576884
BioSample                                                 SAMN13012187
Bytes                                                        589591252
Center Name                                                        GEO
Consent                                                         public
DATASTORE filetype                                    fastq,run.zq,sra
DATASTORE provider                                          s3,gs,ncbi
DATASTORE region                        gs.US,ncbi.public,s3.us-east-1
Experiment                                                  SRX6975694
GEO_Accession (exp)                                         GSM4116587
Instru

In [9]:
# Number of RIP-Seq runs per treatment condition.
is_rip_seq = GSE45148['Assay Type'].eq('RIP-Seq')
GSE45148.loc[is_rip_seq, 'Treatment'].value_counts()

TetON pLKO shNTC + 1ug/mL Doxycycline       18
TetON pLKO shCDR1as + 1ug/mL Doxycycline    18
Name: Treatment, dtype: int64

In [10]:
# Number of RIP-Seq runs per GSM sample accession.
is_rip_seq = GSE45148['Assay Type'].eq('RIP-Seq')
GSE45148.loc[is_rip_seq, 'Sample Name'].value_counts()

GSM4116587    3
GSM4116588    3
GSM4116589    3
GSM4116590    3
GSM4116591    3
GSM4116592    3
GSM4116593    3
GSM4116594    3
GSM4116595    3
GSM4116596    3
GSM4116597    3
GSM4116598    3
Name: Sample Name, dtype: int64

GSM4116587 	WM278 shNTC_Input_1
GSM4116588 	WM278 shNTC_Input_2
GSM4116589 	WM278 shNTC_Input_3
GSM4116593 	WM278 shNTC_IGF2BP3_RIP_1
GSM4116594 	WM278 shNTC_IGF2BP3_RIP_2
GSM4116595 	WM278 shNTC_IGF2BP3_RIP_3

In [13]:
# GSM accessions to keep: the WM278 shNTC input and IGF2BP3 RIP replicates.
gsm_wanted = [
    'GSM4116587', 'GSM4116588', 'GSM4116589',  # shNTC_Input_1..3
    'GSM4116593', 'GSM4116594', 'GSM4116595',  # shNTC_IGF2BP3_RIP_1..3
]

# Space-separated list of the run accessions that belong to those samples.
wanted_rows = GSE45148['Sample Name'].isin(gsm_wanted)
' '.join(GSE45148.loc[wanted_rows, 'Run'])

'SRR10257798 SRR10257799 SRR10257800 SRR10257801 SRR10257802 SRR10257803 SRR10257804 SRR10257805 SRR10257806 SRR10257816 SRR10257817 SRR10257818 SRR10257819 SRR10257820 SRR10257821 SRR10257822 SRR10257823 SRR10257824'

In [19]:
# Human-readable library name for each GSM accession of interest.
# This dict is the single source of truth for which samples are processed below.
mapper = {'GSM4116587': 'shNTC_Input_1',
          'GSM4116588': 'shNTC_Input_2',
          'GSM4116589': 'shNTC_Input_3',
          'GSM4116593': 'shNTC_IGF2BP3_RIP_1',
          'GSM4116594': 'shNTC_IGF2BP3_RIP_2',
          'GSM4116595': 'shNTC_IGF2BP3_RIP_3'
         }

# Directory holding the per-run FASTQ files; merged files are written here too.
srr_dir = Path('/projects/ps-yeolab5/hsher/public_circ_rip/')

# Each sample has several runs. Print (do not execute) the shell commands that
# concatenate the per-run gzipped FASTQs into one file per sample and per mate.
# Rows are selected by the keys of `mapper`, so every group is guaranteed to
# have a label and `mapper[gsm]` cannot raise KeyError.
selected_runs = GSE45148.loc[GSE45148['Sample Name'].isin(list(mapper))]
for gsm, runs in selected_runs.groupby(by='Sample Name'):
    label = mapper[gsm]
    for mate in (1, 2):
        # Both mates iterate the runs in the same order, keeping pairs in sync.
        inputs = ' '.join(str(srr_dir/f'{srr}_pass_{mate}.fastq.gz') for srr in runs['Run'])
        print(f'cat {inputs} > {srr_dir/label}.fq{mate}.gz')
    
    

cat /projects/ps-yeolab5/hsher/public_circ_rip/SRR10257798_pass_1.fastq.gz /projects/ps-yeolab5/hsher/public_circ_rip/SRR10257799_pass_1.fastq.gz /projects/ps-yeolab5/hsher/public_circ_rip/SRR10257800_pass_1.fastq.gz > /projects/ps-yeolab5/hsher/public_circ_rip/shNTC_Input_1.fq1.gz
cat /projects/ps-yeolab5/hsher/public_circ_rip/SRR10257798_pass_2.fastq.gz /projects/ps-yeolab5/hsher/public_circ_rip/SRR10257799_pass_2.fastq.gz /projects/ps-yeolab5/hsher/public_circ_rip/SRR10257800_pass_2.fastq.gz > /projects/ps-yeolab5/hsher/public_circ_rip/shNTC_Input_1.fq2.gz
cat /projects/ps-yeolab5/hsher/public_circ_rip/SRR10257801_pass_1.fastq.gz /projects/ps-yeolab5/hsher/public_circ_rip/SRR10257802_pass_1.fastq.gz /projects/ps-yeolab5/hsher/public_circ_rip/SRR10257803_pass_1.fastq.gz > /projects/ps-yeolab5/hsher/public_circ_rip/shNTC_Input_2.fq1.gz
cat /projects/ps-yeolab5/hsher/public_circ_rip/SRR10257801_pass_2.fastq.gz /projects/ps-yeolab5/hsher/public_circ_rip/SRR10257802_pass_2.fastq.gz /proj

In [21]:
# One manifest row per merged library: mate-1 path, mate-2 path, sample label.
manifest = pd.DataFrame(
    [
        [srr_dir/f'{label}.fq1.gz', srr_dir/f'{label}.fq2.gz', label]
        for label in mapper.values()
    ],
    columns=['fastq1', 'fastq2', 'Sample'],
)

In [23]:
# Check that every value in the manifest is an existing file on disk.
# `Sample` holds labels rather than paths, so that column is expected to be False.
manifest.apply(lambda column: column.map(os.path.isfile))

Unnamed: 0,fastq1,fastq2,Sample
0,True,True,False
1,True,True,False
2,True,True,False
3,True,True,False
4,True,True,False
5,True,True,False


In [24]:
# Sanity check: the mate-1 and mate-2 paths should differ for every sample (all False).
manifest['fastq1'] == manifest['fastq2']

0    False
1    False
2    False
3    False
4    False
5    False
dtype: bool

In [26]:
# Add a constant RNase flag column (False for every library).
manifest = manifest.assign(RNase=False)

In [27]:
# Write the manifest to disk. With default arguments, to_csv also writes the
# row index as the first (unnamed) column.
manifest_csv = 'downloaded_IGF2BP3_RIP.csv'
manifest.to_csv(manifest_csv)

In [None]:
# Display the final manifest. The original cell held the undefined name
# `manie`, which would raise NameError on a full re-run.
manifest