In [None]:
import pandas as pd
import subprocess

# Load BAM into DataFrame
# check=True makes a failing samtools call raise CalledProcessError right here,
# instead of letting an empty stdout flow into the parsing code below.
sam_output = subprocess.run(
    ["samtools", "view", "intersect.bam"],
    capture_output=True,
    text=True,
    check=True,
)
# Skip blank lines so that empty output produces an empty DataFrame rather than
# a single one-field row, which pd.DataFrame rejects for an 11-column layout.
sam_lines = [line for line in sam_output.stdout.split("\n") if line.strip()]

# Convert SAM to DataFrame
# Only the first 11 tab-separated fields of each line are kept; later fields are dropped.
columns = ["QNAME", "FLAG", "RNAME", "POS", "MAPQ", "CIGAR", "RNEXT", "PNEXT", "TLEN", "SEQ", "QUAL"]
data = [line.split("\t")[:11] for line in sam_lines]
df = pd.DataFrame(data, columns=columns)

# Convert FLAG to numeric
# Every field was parsed as text; FLAG must be an integer for the bit tests below.
df["FLAG"] = pd.to_numeric(df["FLAG"])


# Assign read type
def classify_read_type(flag):
    """Map a FLAG value to a read label.

    Returns "READ_1" when bit 64 is set, otherwise "READ_2" when bit 128 is
    set, otherwise "UNPAIRED". Bit 64 takes precedence if both are set.
    """
    if flag & 64:
        return "READ_1"
    if flag & 128:
        return "READ_2"
    return "UNPAIRED"


df["READ_TYPE"] = df["FLAG"].apply(classify_read_type)


# Filtering condition
# Keep rows where RNEXT is anything other than "=" or where TLEN is positive.
# TLEN was split out of SAM text, so it holds strings; comparing those with 0
# raises TypeError. Convert to numbers for the comparison only, leaving the
# column in df untouched so the files written below are unchanged.
tlen_numeric = pd.to_numeric(df["TLEN"])
filtered_df = df[(df["RNEXT"] != "=") | (tlen_numeric > 0)]

# Save filtered reads
# Tab-separated, without the index and without a header row.
filtered_df.to_csv("filtered_reads.sam", sep="\t", index=False, header=False)

# Save output
# The full (unfiltered) table, including the READ_TYPE column, in the same format.
df.to_csv("reads_with_type.sam", sep="\t", index=False, header=False)

%history


In [None]:
import pandas as pd

# Names for the 14 tab-separated columns of intersect.sam.
intersect_columns = [
    "QNAME", "FLAG", "RNAME", "POS", "MAPQ", "CIGAR", "RNEXT", "PNEXT", "TLEN", "SEQ", "QUAL",
    "READ_TYPE", "RP_INDEX", "WELL",
]
intersect_path = "/gpfs/home/asun/jin_lab/yap/pipeline0_bt2_local_alignment/split_s10/intersect.sam"

# header=None: the first line is read as data, so the columns are labelled afterwards.
df = pd.read_csv(intersect_path, header=None, sep="\t")
df.columns = intersect_columns


In [15]:
def strand_label(flag):
    """Return "RC" when bit 16 of `flag` is set, otherwise "FORWARD"."""
    if flag & 16:
        return "RC"
    return "FORWARD"


# Add an RC column derived from FLAG, then display the frame.
df["RC"] = df["FLAG"].map(strand_label)
df


Unnamed: 0,QNAME,FLAG,RNAME,POS,MAPQ,CIGAR,RNEXT,PNEXT,TLEN,SEQ,QUAL,READ_TYPE,RP_INDEX,WELL,RC
0,lh00134:653:22MKYCLT4:3:1102:23021:5428,177,Foxg1_g1,1,22,66S72M12S,chr8,30412233,0,TTACCGTAACTTGAAAGTATTTCGATTTCTTGGCTTTATATATCTT...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIIIIII...,READ_2,TTACCGTA,,RC
1,lh00134:653:22MKYCLT4:3:1107:31103:16231,99,Foxg1_g2,1,9,27S82M41S,=,1,-180,TACTGCTCTCATTTCCCGGCCAATGCAGAGTTACAACGGGACCACG...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,TACTGCTC,I1,FORWARD
2,lh00134:653:22MKYCLT4:3:1107:31103:16231,147,Foxg1_g2,1,9,57S82M11S,=,1,180,CTCTTTCCCTACACGACGCTCTTCCGATCTTACTGCTCTCATTTCC...,IIIIIIII9*9IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_2,CTCTTTCC,,RC
3,lh00134:653:22MKYCLT4:3:1121:13337:8917,83,Foxg1_g1,1,9,60S73M17S,=,1,201,CCCAGATCTAGACACTCGTGACTGGAGTTCAGACGTGTGCTCTTCC...,9II99IIIIIIIIIII9IIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,CCCAGATC,,RC
4,lh00134:653:22MKYCLT4:3:1121:13337:8917,163,Foxg1_g1,1,9,9S73M68S,=,1,-201,GGTATGGGGAGCGCGTTGTAGCTGAACGGGTTTTAGAGCTAGAAAT...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_2,GGTATGGG,,FORWARD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,lh00134:653:22MKYCLT4:3:2473:7707:26236,153,Foxg1_g1,1,9,74S52M24S,=,1,0,ATCATATGCTTACCGTAACTTGAAAGTATTTCGATTTCTAGGCTTT...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIII...,READ_2,ATCATATG,,RC
190,lh00134:653:22MKYCLT4:3:2476:29622:10528,83,Foxg1_g1,1,9,53S73M24S,=,1,172,CAGACGTGTGCTCTTCCGATCTGAGGAATACTTGTGGAAAGGACGA...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,CAGACGTG,,RC
191,lh00134:653:22MKYCLT4:3:2476:29622:10528,163,Foxg1_g1,1,9,31S73M46S,=,1,-172,GAGGAATACTTGTGGAAAGGACGAAACACCGAGCGCGTTGTAGCTG...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_2,GAGGAATA,,FORWARD
192,lh00134:653:22MKYCLT4:3:2484:35018:20351,83,Foxg1_g1,1,9,17S73M60S,=,1,163,GCTCTTCCGATCTGGGGAGCGCGTTGTAGCTGAACGGGTTTTAGAG...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,GCTCTTCC,,RC


In [17]:
from Bio.Seq import Seq

# Row-wise helper: reverse complement SEQ only for rows marked "RC"
def conditional_reverse_complement(row):
    """Return the row's sequence, reverse-complemented when row["RC"] is "RC".

    `row` must support lookup by the keys "RC" and "SEQ". For any other value
    of row["RC"] the "SEQ" value is returned unchanged.
    """
    is_reverse = row["RC"] == "RC"
    sequence = row["SEQ"]
    if not is_reverse:
        return sequence  # Any label other than "RC": pass the sequence through
    return str(Seq(sequence).reverse_complement())

# Build the TRUE_SEQ column by running conditional_reverse_complement on every row
true_sequences = df.apply(conditional_reverse_complement, axis="columns")
df["TRUE_SEQ"] = true_sequences

In [24]:
# Load the index table and rename "RP Index" -> "RP_INDEX" and "Position" -> "WELL"
# so its column names line up with the ones used in df.
rp_indices = pd.read_csv("/gpfs/home/asun/jin_lab/yap/raw_data/384RPIndexes.csv").rename(
    columns={"Position": "WELL", "RP Index": "RP_INDEX"}
)
rp_indices

Unnamed: 0,WELL,RP_INDEX
0,A1,ACGATCAG
1,A3,TCGAGAGT
2,A5,CTAGCTCA
3,A7,ATCGTCTC
4,A9,TCGACAAG
...,...,...
379,P16,CCTAGAGA
380,P18,TACTAGCG
381,P20,CGTCCATT
382,P22,TCGCTATC


In [25]:
# RP_INDEX is the first 8 characters of TRUE_SEQ (overwrites the RP_INDEX read from file).
df["RP_INDEX"] = df["TRUE_SEQ"].map(lambda seq: seq[:8])

# Left join onto the index table by RP_INDEX. Both frames carry a WELL column,
# so the result has WELL_x (the one already in df) and WELL_y (from rp_indices).
df = pd.merge(df, rp_indices, how="left", on="RP_INDEX")
df

Unnamed: 0,QNAME,FLAG,RNAME,POS,MAPQ,CIGAR,RNEXT,PNEXT,TLEN,SEQ,QUAL,READ_TYPE,RP_INDEX,WELL_x,RC,TRUE_SEQ,WELL_y
0,lh00134:653:22MKYCLT4:3:1102:23021:5428,177,Foxg1_g1,1,22,66S72M12S,chr8,30412233,0,TTACCGTAACTTGAAAGTATTTCGATTTCTTGGCTTTATATATCTT...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIIIIII...,READ_2,GAGGGAAA,,RC,GAGGGAAATATTAAGTTGATAATGGATTAGTTTTATTTTAATTTGC...,
1,lh00134:653:22MKYCLT4:3:1107:31103:16231,99,Foxg1_g2,1,9,27S82M41S,=,1,-180,TACTGCTCTCATTTCCCGGCCAATGCAGAGTTACAACGGGACCACG...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,TACTGCTC,I1,FORWARD,TACTGCTCTCATTTCCCGGCCAATGCAGAGTTACAACGGGACCACG...,I1
2,lh00134:653:22MKYCLT4:3:1107:31103:16231,147,Foxg1_g2,1,9,57S82M11S,=,1,180,CTCTTTCCCTACACGACGCTCTTCCGATCTTACTGCTCTCATTTCC...,IIIIIIII9*9IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_2,GGGAGGAT,,RC,GGGAGGATATTAAGTTGATAATGGATTAGTTTTATTTAAACTTGCT...,
3,lh00134:653:22MKYCLT4:3:1121:13337:8917,83,Foxg1_g1,1,9,60S73M17S,=,1,201,CCCAGATCTAGACACTCGTGACTGGAGTTCAGACGTGTGCTCTTCC...,9II99IIIIIIIIIII9IIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,GCGCATAT,,RC,GCGCATATCCACTTTTTCAAGTTGATAACGGACTAGCCTTATTTTA...,O23
4,lh00134:653:22MKYCLT4:3:1121:13337:8917,163,Foxg1_g1,1,9,9S73M68S,=,1,-201,GGTATGGGGAGCGCGTTGTAGCTGAACGGGTTTTAGAGCTAGAAAT...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_2,GGTATGGG,,FORWARD,GGTATGGGGAGCGCGTTGTAGCTGAACGGGTTTTAGAGCTAGAAAT...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,lh00134:653:22MKYCLT4:3:2473:7707:26236,153,Foxg1_g1,1,9,74S52M24S,=,1,0,ATCATATGCTTACCGTAACTTGAAAGTATTTCGATTTCTAGGCTTT...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIII...,READ_2,GGGGAAGA,,RC,GGGGAAGAGGTTAATGGATTAGTTTTATTTTAACTTGCTATTTCTA...,
190,lh00134:653:22MKYCLT4:3:2476:29622:10528,83,Foxg1_g1,1,9,53S73M24S,=,1,172,CAGACGTGTGCTCTTCCGATCTGAGGAATACTTGTGGAAAGGACGA...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,ACAACAGC,,RC,ACAACAGCTATCAACCCAGAGTACCAAGTTGATAACGGACTAGCCT...,J17
191,lh00134:653:22MKYCLT4:3:2476:29622:10528,163,Foxg1_g1,1,9,31S73M46S,=,1,-172,GAGGAATACTTGTGGAAAGGACGAAACACCGAGCGCGTTGTAGCTG...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_2,GAGGAATA,,FORWARD,GAGGAATACTTGTGGAAAGGACGAAACACCGAGCGCGTTGTAGCTG...,
192,lh00134:653:22MKYCLT4:3:2484:35018:20351,83,Foxg1_g1,1,9,17S73M60S,=,1,163,GCTCTTCCGATCTGGGGAGCGCGTTGTAGCTGAACGGGTTTTAGAG...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,GCGCATAT,,RC,GCGCATATTAACCATTCAGCTACAACGCGCTCCCCAAGCAGTGGTA...,O23


In [28]:

# Filtering condition: keep a row when RNEXT is not "=" or when it is a READ_1 row
rnext_differs = df["RNEXT"].ne("=")
read_1_rows = df["READ_TYPE"].eq("READ_1")
filtered_df = df[rnext_differs | read_1_rows]
filtered_df

Unnamed: 0,QNAME,FLAG,RNAME,POS,MAPQ,CIGAR,RNEXT,PNEXT,TLEN,SEQ,QUAL,READ_TYPE,RP_INDEX,WELL_x,RC,TRUE_SEQ,WELL_y
0,lh00134:653:22MKYCLT4:3:1102:23021:5428,177,Foxg1_g1,1,22,66S72M12S,chr8,30412233,0,TTACCGTAACTTGAAAGTATTTCGATTTCTTGGCTTTATATATCTT...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIIIIII...,READ_2,GAGGGAAA,,RC,GAGGGAAATATTAAGTTGATAATGGATTAGTTTTATTTTAATTTGC...,
1,lh00134:653:22MKYCLT4:3:1107:31103:16231,99,Foxg1_g2,1,9,27S82M41S,=,1,-180,TACTGCTCTCATTTCCCGGCCAATGCAGAGTTACAACGGGACCACG...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,TACTGCTC,I1,FORWARD,TACTGCTCTCATTTCCCGGCCAATGCAGAGTTACAACGGGACCACG...,I1
3,lh00134:653:22MKYCLT4:3:1121:13337:8917,83,Foxg1_g1,1,9,60S73M17S,=,1,201,CCCAGATCTAGACACTCGTGACTGGAGTTCAGACGTGTGCTCTTCC...,9II99IIIIIIIIIII9IIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,GCGCATAT,,RC,GCGCATATCCACTTTTTCAAGTTGATAACGGACTAGCCTTATTTTA...,O23
5,lh00134:653:22MKYCLT4:3:1130:28635:15180,99,Foxg1_g1,1,22,40S44M66S,=,1,-211,ACAACAGCTTCATATATCTTGTGGAAAGGACGAAACACCGAGCGCG...,IIIIIII9IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,ACAACAGC,J17,FORWARD,ACAACAGCTTCATATATCTTGTGGAAAGGACGAAACACCGAGCGCG...,J17
7,lh00134:653:22MKYCLT4:3:1132:8677:2359,83,Dnmt3a_g1,3,9,31S71M48S,=,3,170,GACGTGTGCTCTTCCGATCTGGGAGGAAGGATAGAACTCAAAGAAG...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,TACTGCTC,,RC,TACTGCTCCCCCTCTTCTTTGAGTTCTATGTGGTATCAAGCAGAGT...,I1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,lh00134:653:22MKYCLT4:3:2459:22244:5848,99,Foxg1_g1,1,9,40S72M38S,=,1,-174,ACAACAGCTTTATTTATCTTGTGGAAAGGACGAAACACCGAGCGCG...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,ACAACAGC,J17,FORWARD,ACAACAGCTTTATTTATCTTGTGGAAAGGACGAAACACCGAGCGCG...,J17
185,lh00134:653:22MKYCLT4:3:2463:35132:24695,83,Foxg1_g1,1,16,29S73M48S,=,1,158,TCCGATCTGGAGAGAGGAGCGAAACACTGAGCGCGTTGTAGCTGAA...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,ACAACAGC,,RC,ACAACAGCTTCACCAATGCCTCTGCGCACCGACTCGGTGCCACTTT...,J17
187,lh00134:653:22MKYCLT4:3:2472:30326:21080,99,Safe_g1,2,9,16S71M63S,=,2,-202,ACAAGACGACACACCACTAGAAACCTAAAAATCTAGTTTTAGAGCT...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,ACAAGACG,M22,FORWARD,ACAAGACGACACACCACTAGAAACCTAAAAATCTAGTTTTAGAGCT...,M22
190,lh00134:653:22MKYCLT4:3:2476:29622:10528,83,Foxg1_g1,1,9,53S73M24S,=,1,172,CAGACGTGTGCTCTTCCGATCTGAGGAATACTTGTGGAAAGGACGA...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...,READ_1,ACAACAGC,,RC,ACAACAGCTATCAACCCAGAGTACCAAGTTGATAACGGACTAGCCT...,J17


In [37]:
# Keep only the RNAME and WELL_y columns and write them as a tab-separated
# file with a header row and no index column.
final_df = filtered_df.loc[:, ["RNAME", "WELL_y"]]
final_df.to_csv("local_align_extra.tsv", header=True, index=False, sep="\t")

In [29]:
# Count rows of filtered_df whose 'WELL_y' value is NaN
well_y_missing = filtered_df["WELL_y"].isnull()
nan_count = well_y_missing.sum()

print("Number of rows with NaN in 'WELL_y':", nan_count)

Number of rows with NaN in 'WELL_y': 18


In [30]:
# Count rows of filtered_df whose 'WELL_x' value is NaN
well_x_missing = filtered_df["WELL_x"].isnull()
nan_count = well_x_missing.sum()

print("Number of rows with NaN in 'WELL_x':", nan_count)

Number of rows with NaN in 'WELL_x': 73


In [31]:
# Number of distinct QNAME values in filtered_df (NaN is not counted)
qnames = filtered_df["QNAME"]
unique_count = qnames.nunique(dropna=True)
print(f"Number of unique QNAME values: {unique_count}")

Number of unique QNAME values: 99


In [34]:
# QNAME of every READ_1 row in filtered_df that has a non-NaN WELL_y value
has_well = filtered_df["WELL_y"].notna()
from_read_1 = filtered_df["READ_TYPE"] == "READ_1"
result = filtered_df.loc[has_well & from_read_1, "QNAME"]

result.tolist()

['lh00134:653:22MKYCLT4:3:1107:31103:16231',
 'lh00134:653:22MKYCLT4:3:1121:13337:8917',
 'lh00134:653:22MKYCLT4:3:1130:28635:15180',
 'lh00134:653:22MKYCLT4:3:1132:8677:2359',
 'lh00134:653:22MKYCLT4:3:1133:3257:17408',
 'lh00134:653:22MKYCLT4:3:1136:30067:17913',
 'lh00134:653:22MKYCLT4:3:1152:6331:18333',
 'lh00134:653:22MKYCLT4:3:1156:18830:6997',
 'lh00134:653:22MKYCLT4:3:1156:18847:6997',
 'lh00134:653:22MKYCLT4:3:1171:19825:28450',
 'lh00134:653:22MKYCLT4:3:1175:24874:20379',
 'lh00134:653:22MKYCLT4:3:1187:4616:19706',
 'lh00134:653:22MKYCLT4:3:1193:12925:26390',
 'lh00134:653:22MKYCLT4:3:1206:46514:12910',
 'lh00134:653:22MKYCLT4:3:1208:12342:1448',
 'lh00134:653:22MKYCLT4:3:1210:21039:12672',
 'lh00134:653:22MKYCLT4:3:1225:51393:3844',
 'lh00134:653:22MKYCLT4:3:1228:7108:6227',
 'lh00134:653:22MKYCLT4:3:1233:40495:5316',
 'lh00134:653:22MKYCLT4:3:1254:29566:12700',
 'lh00134:653:22MKYCLT4:3:1256:37987:5540',
 'lh00134:653:22MKYCLT4:3:1271:21597:1757',
 'lh00134:653:22MKYCLT4:3

In [35]:
# Write the selected read names to a text file, one name per line
with open("readnames_with_well_rc_adj.txt", "w") as out_file:
    out_file.writelines(str(item) + "\n" for item in result.tolist())


In [11]:
import os
# Show the directory that relative paths in this notebook resolve against
current_dir = os.getcwd()
print(current_dir)

/gpfs/group/jin/asun/yap
