In [1]:
import pandas as pd
import subprocess

# Load BAM into DataFrame
sam_output = subprocess.run(["samtools", "view", "intersect.bam"], capture_output=True, text=True)
sam_lines = sam_output.stdout.strip().split("\n")

# Convert SAM to DataFrame
columns = ["QNAME", "FLAG", "RNAME", "POS", "MAPQ", "CIGAR", "RNEXT", "PNEXT", "TLEN", "SEQ", "QUAL"]
data = [line.split("\t")[:11] for line in sam_lines]
df = pd.DataFrame(data, columns=columns)

# Convert FLAG to numeric
df["FLAG"] = pd.to_numeric(df["FLAG"])

# Assign read type
df["READ_TYPE"] = df["FLAG"].apply(lambda x: "READ_1" if x & 64 else ("READ_2" if x & 128 else "UNPAIRED"))
df["RP_INDEX"] = df["SEQ"].apply(lambda x: x[:8])


# Filtering condition
filtered_df = df[(df["RNEXT"] != "=") | (df["TLEN"] > 0)]

# Save filtered reads
filtered_df.to_csv("filtered_reads.sam", sep="\t", index=False, header=False)

# Save output
df.to_csv("reads_with_type.sam", sep="\t", index=False, header=False)

%history


In [3]:
import pandas as pd

df = pd.read_csv("/gpfs/home/asun/jin_lab/yap/pipeline0_bt2_local_alignment/split_s10/intersect.sam", sep="\t", header=None)

In [None]:
for i in range(len(df)):
    if df.iloc[i, 13] != null:
        df.iloc[i, 11] = "READ_1"
    elif df.iloc[i, 1] & 128:
        df.iloc[i, 11] = "READ_2"
    else:
        df.iloc[i, 11] = "UNPAIRED"

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,lh00134:653:22MKYCLT4:3:1102:23021:5428,177,Foxg1_g1,1,22,66S72M12S,chr8,30412233,0,TTACCGTAACTTGAAAGTATTTCGATTTCTTGGCTTTATATATCTT...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIIIIII...,READ_2,TTACCGTA,
1,lh00134:653:22MKYCLT4:3:1107:31103:16231,99,Foxg1_g2,1,9,27S82M41S,=,1,-180,TACTGCTCTCATTTCCCGGCCAATGCAGAGTTACAACGGGACCACG...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,TACTGCTC,I1
2,lh00134:653:22MKYCLT4:3:1107:31103:16231,147,Foxg1_g2,1,9,57S82M11S,=,1,180,CTCTTTCCCTACACGACGCTCTTCCGATCTTACTGCTCTCATTTCC...,IIIIIIII9*9IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_2,CTCTTTCC,
3,lh00134:653:22MKYCLT4:3:1121:13337:8917,83,Foxg1_g1,1,9,60S73M17S,=,1,201,CCCAGATCTAGACACTCGTGACTGGAGTTCAGACGTGTGCTCTTCC...,9II99IIIIIIIIIII9IIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,CCCAGATC,
4,lh00134:653:22MKYCLT4:3:1121:13337:8917,163,Foxg1_g1,1,9,9S73M68S,=,1,-201,GGTATGGGGAGCGCGTTGTAGCTGAACGGGTTTTAGAGCTAGAAAT...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_2,GGTATGGG,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,lh00134:653:22MKYCLT4:3:2473:7707:26236,153,Foxg1_g1,1,9,74S52M24S,=,1,0,ATCATATGCTTACCGTAACTTGAAAGTATTTCGATTTCTAGGCTTT...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIII...,READ_2,ATCATATG,
190,lh00134:653:22MKYCLT4:3:2476:29622:10528,83,Foxg1_g1,1,9,53S73M24S,=,1,172,CAGACGTGTGCTCTTCCGATCTGAGGAATACTTGTGGAAAGGACGA...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,CAGACGTG,
191,lh00134:653:22MKYCLT4:3:2476:29622:10528,163,Foxg1_g1,1,9,31S73M46S,=,1,-172,GAGGAATACTTGTGGAAAGGACGAAACACCGAGCGCGTTGTAGCTG...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_2,GAGGAATA,
192,lh00134:653:22MKYCLT4:3:2484:35018:20351,83,Foxg1_g1,1,9,17S73M60S,=,1,163,GCTCTTCCGATCTGGGGAGCGCGTTGTAGCTGAACGGGTTTTAGAG...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,GCTCTTCC,


In [8]:
# Filter rows where column 13 is not NaN and column 11 equals "READ_1"
result = df.loc[df[13].notna() & (df[11] == "READ_1"), 0]

result.tolist()

['lh00134:653:22MKYCLT4:3:1107:31103:16231',
 'lh00134:653:22MKYCLT4:3:1130:28635:15180',
 'lh00134:653:22MKYCLT4:3:1152:6331:18333',
 'lh00134:653:22MKYCLT4:3:1171:19825:28450',
 'lh00134:653:22MKYCLT4:3:1175:24874:20379',
 'lh00134:653:22MKYCLT4:3:1206:46514:12910',
 'lh00134:653:22MKYCLT4:3:1271:21597:1757',
 'lh00134:653:22MKYCLT4:3:1306:39605:26783',
 'lh00134:653:22MKYCLT4:3:1335:32494:10150',
 'lh00134:653:22MKYCLT4:3:1375:9389:10262',
 'lh00134:653:22MKYCLT4:3:1398:23992:13247',
 'lh00134:653:22MKYCLT4:3:2116:27673:10290',
 'lh00134:653:22MKYCLT4:3:2146:16614:10837',
 'lh00134:653:22MKYCLT4:3:2205:22204:18754',
 'lh00134:653:22MKYCLT4:3:2216:41943:22705',
 'lh00134:653:22MKYCLT4:3:2218:51304:23868',
 'lh00134:653:22MKYCLT4:3:2241:34630:13737',
 'lh00134:653:22MKYCLT4:3:2242:35124:14116',
 'lh00134:653:22MKYCLT4:3:2245:26030:27511',
 'lh00134:653:22MKYCLT4:3:2273:37575:3844',
 'lh00134:653:22MKYCLT4:3:2292:23603:9940',
 'lh00134:653:22MKYCLT4:3:2313:44654:23616',
 'lh00134:653:2

In [None]:
len(result)



26