# Testing Notebook for ETL

Imports functions from `oas_onboarding/src/etl` and tests them against sample OAS data files.

In [1]:
import sys
import pandas as pd
from Bio.Align import PairwiseAligner

# Put the local ETL source directory first on the module search path.
# NOTE(review): absolute, user-specific path, so this cell only works on the author's machine;
# presumably this is what lets the later `from process import *` resolve — confirm.
sys.path.insert(0, '/home/cameronhu/oas_onboarding/src/etl')
# Echo the search path so the insertion can be checked in the cell output.
print(sys.path)

['/home/cameronhu/oas_onboarding/src/etl', '/home/cameronhu/miniforge3/lib/python312.zip', '/home/cameronhu/miniforge3/lib/python3.12', '/home/cameronhu/miniforge3/lib/python3.12/lib-dynload', '', '/home/cameronhu/miniforge3/lib/python3.12/site-packages']


In [2]:
# Star import: parse_metadata and parse_sequence_antibody_data, used below, are not defined or
# imported anywhere else in this notebook, so presumably they come from here — confirm.
from process import *

# Show every column when displaying DataFrames (no column truncation).
pd.set_option('display.max_columns', None)

# Unpaired Data

In [3]:
# Sample unpaired heavy-chain OAS file (gzipped CSV) used throughout this section.
heavy_data_file = '/export/share/cameronhu/oas/unpaired/unpaired_human/unpaired_human_heavy/1279049_1_Heavy_Bulk.csv.gz'
# parse_metadata is a project helper; the output below shows it yields a dict of
# run-level metadata that includes a 'metadata_uid' entry.
heavy_metadata = parse_metadata(heavy_data_file)

heavy_metadata

{'Run': 1279049,
 'Link': 'https://doi.org/10.1038/s41586-022-05371-z',
 'Author': 'Jaffe et al., 2022',
 'Species': 'human',
 'Age': 35,
 'BSource': 'PBMC',
 'BType': 'Naive-B-Cells',
 'Vaccine': 'None',
 'Disease': 'SARS-COV-2',
 'Subject': 'Donor-2',
 'Longitudinal': 'no',
 'Unique sequences': 2,
 'Total sequences': 2,
 'Isotype': 'Bulk',
 'Chain': 'Heavy',
 'metadata_uid': '5db6a544-653e-4091-99c5-04ac2ed74f52'}

In [4]:
# Raw reading of OAS data file
# header=1 takes the file's second line as the column names and skips the first line
# (presumably the metadata line that parse_metadata reads — confirm).
heavy_sequence_raw = pd.read_csv(heavy_data_file, header=1)

# Raw column count, used as the baseline for the processed frame below.
heavy_sequence_raw.shape[1]

97

In [5]:
# parse_sequence_antibody_data is a project helper that returns a pair of frames
# (named sequence and antibody frames here); only the first is inspected in this cell.
heavy_sequence_df, heavy_antibody_df = parse_sequence_antibody_data(heavy_data_file, heavy_metadata)
heavy_sequence_df.head()

Unnamed: 0,sequence,locus,stop_codon,vj_in_frame,v_frameshift,productive,rev_comp,complete_vdj,v_call,d_call,j_call,sequence_alignment,germline_alignment,sequence_alignment_aa,germline_alignment_aa,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,v_sequence_alignment,v_sequence_alignment_aa,v_germline_alignment,v_germline_alignment_aa,d_sequence_alignment,d_sequence_alignment_aa,d_germline_alignment,d_germline_alignment_aa,j_sequence_alignment,j_sequence_alignment_aa,j_germline_alignment,j_germline_alignment_aa,fwr1,fwr1_aa,cdr1,cdr1_aa,fwr2,fwr2_aa,cdr2,cdr2_aa,fwr3,fwr3_aa,fwr4,fwr4_aa,cdr3,cdr3_aa,junction,junction_length,junction_aa,junction_aa_length,v_score,d_score,j_score,v_cigar,d_cigar,j_cigar,v_support,d_support,j_support,v_identity,d_identity,j_identity,v_sequence_start,v_sequence_end,v_germline_start,v_germline_end,d_sequence_start,d_sequence_end,d_germline_start,d_germline_end,j_sequence_start,j_sequence_end,j_germline_start,j_germline_end,fwr1_start,fwr1_end,cdr1_start,cdr1_end,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length,c_region,ANARCI_numbering,ANARCI_status,Redundancy,antibody_uid,Isotype,sequence_uid
0,AGCTCTGAGAGAGGAGCCCAGCCCTGGGATTTTCAGGTGTTTTCAT...,H,F,T,F,T,F,T,IGHV3-23*01,IGHD3-22*01,IGHJ4*02,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,1,291,296.0,318.0,326,370,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,ATTACTATGATAGTAGTGGTTAT,YYDSSGY,ATTACTATGATAGTAGTGGTTAT,YYDSSGY,ACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,FDYWGQGTLVTVSS,ACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,FDYWGQGTLVTVSS,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQPGGSLRLSCAAS,GGATTCACCTTTAGCAGCTATGCC,GFTFSSYA,ATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCT...,MSWVRQAPGKGLEWVSA,ATTAGTGGTAGTGGTGGTAGCACA,ISGSGGST,TACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACA...,YYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYC,TGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA,WGQGTLVTVSS,GCGCGAAATTACTATGATAGTAGTGGTTATATTACTAACTTTGACTAC,ARNYYDSSGYITNFDY,TGTGCGCGAAATTACTATGATAGTAGTGGTTATATTACTAACTTTG...,54.0,CARNYYDSSGYITNFDYW,18.0,455.247,44.909,87.208,136S291M92S5N,431S2N23M65S6N,461S3N45M13S,5.461e-130,7.025e-09,2.822e-21,100.0,100.0,100.0,137,427,1,291,432.0,454.0,3.0,25.0,462,506,4,48,137,211,212,235,236,286,287,310,311,424,473.0,505.0,425.0,472.0,CGAA,4,ATTACTA,7.0,CACCCACCAAGGC,"{'fwh1': {'1 ': 'E', '2 ': 'V', '3 ': 'Q', '4 ...","|Deletions: 10, 73||||",1,1e884061-00f4-4ddb-afc5-7bcb6a1a1c0a,Bulk,5e6d8403-cf1b-4f82-aa9e-023dae997140
1,ATGCTTTCTGAGAGTCATGGATCTCATGTGCAAGAAAATGAAGCAC...,H,F,T,F,T,F,T,IGHV4-39*01,,IGHJ6*02,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,1,298,,,310,366,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKG...,,,,,TACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCA...,YYYYGMDVWGQGTTVTVSS,TACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCA...,YYYYGMDVWGQGTTVTVSS,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGG...,QLQLQESGPGLVKPSETLSLTCTVS,GGTGGCTCCATCAGCAGTAGTAGTTACTAC,GGSISSSSYY,TGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTG...,WGWIRQPPGKGLEWIGS,ATCTATTATAGTGGGAGCACC,IYYSGST,TACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCCGTAGACA...,YYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYC,TGGGGCCAAGGGACCACGGTCACCGTCTCCTCA,WGQGTTVTVSS,GCGAGACTCGCAGAAAGATACTACTACTACGGTATGGACGTC,ARLAERYYYYGMDV,TGTGCGAGACTCGCAGAAAGATACTACTACTACGGTATGGACGTCTGG,48.0,CARLAERYYYYGMDVW,16.0,466.153,,110.281,94S298M69S1N,,403S5N57M1S,2.508e-133,,2.834e-28,100.0,,100.0,95,392,1,298,,,,,404,460,6,62,95,169,170,199,200,250,251,271,272,385,428.0,460.0,386.0,427.0,TCGCAGAAAGA,11,,,G,"{'fwh1': {'1 ': 'Q', '2 ': 'L', '3 ': 'Q', '4 ...","|Deletions: 10, 73||||",1,70f0059b-a57f-4815-a1c8-ce2a11135b93,Bulk,0eb4ffa0-493d-41da-8d52-949caa94003d


In [6]:
# The number of columns in the unpaired data after assigning UIDs should be 97 + 3 = 100.
# Added columns are antibody_uid, sequence_uid and Isotype.
heavy_sequence_df.shape[1]

100

# Paired Data

In [5]:
# Sample paired (heavy + light) OAS file (gzipped CSV) used throughout this section.
paired_data_file = '/export/share/cameronhu/oas/paired/paired_human/1_S1__1_Paired_All.csv.gz'
# Same project helper as for the unpaired file. Note in the output below that 'Run' is a
# string here (an int for the unpaired file) and that there is no 'Total sequences' key.
paired_metadata = parse_metadata(paired_data_file)

paired_metadata

{'Run': '1_S1_',
 'Link': 'https://doi.org/10.1038/s41590-022-01230-1',
 'Author': 'Phad et al., 2022',
 'Species': 'human',
 'Age': 'no',
 'BSource': 'PBMC',
 'BType': 'Plasma-B-Cells',
 'Vaccine': 'None',
 'Disease': 'None',
 'Subject': 'Donor-2',
 'Longitudinal': 'Jul-Year-2020',
 'Unique sequences': 4668,
 'Isotype': 'All',
 'Chain': 'Paired',
 'metadata_uid': '83e9ef96-21fc-40de-a0ca-9c80370bbfc5'}

In [6]:
# header=1 takes the file's second line as the column names and skips the first line
# (presumably the metadata line — confirm).
paired_data_raw = pd.read_csv(paired_data_file, header=1)
# Raw column count, used as the baseline for the processed paired frame below.
paired_data_raw.shape[1]

198

In [7]:
# Same project helper as used for the unpaired file; returns a pair of frames,
# of which only the first is inspected in this cell.
paired_df, antibody_df = parse_sequence_antibody_data(paired_data_file, paired_metadata)
paired_df.head()

Unnamed: 0,sequence_id_heavy,sequence_heavy,locus_heavy,stop_codon_heavy,vj_in_frame_heavy,v_frameshift_heavy,productive_heavy,rev_comp_heavy,complete_vdj_heavy,v_call_heavy,d_call_heavy,j_call_heavy,sequence_alignment_heavy,germline_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,v_alignment_start_heavy,v_alignment_end_heavy,d_alignment_start_heavy,d_alignment_end_heavy,j_alignment_start_heavy,j_alignment_end_heavy,v_sequence_alignment_heavy,v_sequence_alignment_aa_heavy,v_germline_alignment_heavy,v_germline_alignment_aa_heavy,d_sequence_alignment_heavy,d_sequence_alignment_aa_heavy,d_germline_alignment_heavy,d_germline_alignment_aa_heavy,j_sequence_alignment_heavy,j_sequence_alignment_aa_heavy,j_germline_alignment_heavy,j_germline_alignment_aa_heavy,fwr1_heavy,fwr1_aa_heavy,cdr1_heavy,cdr1_aa_heavy,fwr2_heavy,fwr2_aa_heavy,cdr2_heavy,cdr2_aa_heavy,fwr3_heavy,fwr3_aa_heavy,fwr4_heavy,fwr4_aa_heavy,cdr3_heavy,cdr3_aa_heavy,junction_heavy,junction_length_heavy,junction_aa_heavy,junction_aa_length_heavy,v_score_heavy,d_score_heavy,j_score_heavy,v_cigar_heavy,d_cigar_heavy,j_cigar_heavy,v_support_heavy,d_support_heavy,j_support_heavy,v_identity_heavy,d_identity_heavy,j_identity_heavy,v_sequence_start_heavy,v_sequence_end_heavy,v_germline_start_heavy,v_germline_end_heavy,d_sequence_start_heavy,d_sequence_end_heavy,d_germline_start_heavy,d_germline_end_heavy,j_sequence_start_heavy,j_sequence_end_heavy,j_germline_start_heavy,j_germline_end_heavy,fwr1_start_heavy,fwr1_end_heavy,cdr1_start_heavy,cdr1_end_heavy,fwr2_start_heavy,fwr2_end_heavy,cdr2_start_heavy,cdr2_end_heavy,fwr3_start_heavy,fwr3_end_heavy,fwr4_start_heavy,fwr4_end_heavy,cdr3_start_heavy,cdr3_end_heavy,np1_heavy,np1_length_heavy,np2_heavy,np2_length_heavy,c_region_heavy,Isotype_heavy,Redundancy_heavy,ANARCI_numbering_heavy,ANARCI_status_heavy,sequence_id_light,sequence_light,locus_light,stop_codon_light,vj_in_frame_light,v_frameshift_light,productive_light,rev_comp_light,complete_vdj_l
ight,v_call_light,d_call_light,j_call_light,sequence_alignment_light,germline_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,v_alignment_start_light,v_alignment_end_light,d_alignment_start_light,d_alignment_end_light,j_alignment_start_light,j_alignment_end_light,v_sequence_alignment_light,v_sequence_alignment_aa_light,v_germline_alignment_light,v_germline_alignment_aa_light,d_sequence_alignment_light,d_sequence_alignment_aa_light,d_germline_alignment_light,d_germline_alignment_aa_light,j_sequence_alignment_light,j_sequence_alignment_aa_light,j_germline_alignment_light,j_germline_alignment_aa_light,fwr1_light,fwr1_aa_light,cdr1_light,cdr1_aa_light,fwr2_light,fwr2_aa_light,cdr2_light,cdr2_aa_light,fwr3_light,fwr3_aa_light,fwr4_light,fwr4_aa_light,cdr3_light,cdr3_aa_light,junction_light,junction_length_light,junction_aa_light,junction_aa_length_light,v_score_light,d_score_light,j_score_light,v_cigar_light,d_cigar_light,j_cigar_light,v_support_light,d_support_light,j_support_light,v_identity_light,d_identity_light,j_identity_light,v_sequence_start_light,v_sequence_end_light,v_germline_start_light,v_germline_end_light,d_sequence_start_light,d_sequence_end_light,d_germline_start_light,d_germline_end_light,j_sequence_start_light,j_sequence_end_light,j_germline_start_light,j_germline_end_light,fwr1_start_light,fwr1_end_light,cdr1_start_light,cdr1_end_light,fwr2_start_light,fwr2_end_light,cdr2_start_light,cdr2_end_light,fwr3_start_light,fwr3_end_light,fwr4_start_light,fwr4_end_light,cdr3_start_light,cdr3_end_light,np1_light,np1_length_light,np2_light,np2_length_light,c_region_light,Isotype_light,Redundancy_light,ANARCI_numbering_light,ANARCI_status_light,antibody_uid,sequence_uid
0,AAACCTGAGTCAATAG-1_contig_1,AGCTCTGGGAGAGGAGCTCCAGCCTTGGGATTCCCAGCTGTCTCCA...,H,F,T,F,T,F,T,IGHV3-73*02,IGHD2-2*01,IGHJ4*02,GAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTGGTCCAGCCTGGGG...,GAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTGGTCCAGCCTGGGG...,EVQLVESGGGLVQPGGSLKLSCAASGFTFSGSAMHWVRQASGKGLE...,EVQLVESGGGLVQPGGSLKLSCAASGFTFSGSAMHWVRQASGKGLE...,1,302,303.0,323.0,332,373,GAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTGGTCCAGCCTGGGG...,EVQLVESGGGLVQPGGSLKLSCAASGFTFSGSAMHWVRQASGKGLE...,GAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTGGTCCAGCCTGGGG...,EVQLVESGGGLVQPGGSLKLSCAASGFTFSGSAMHWVRQASGKGLE...,TTGTAGTAGTACCAGCTGCTA,CSSTSC,TTGTAGTAGTACCAGCTGCTA,CSSTSC,TTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,DYWGQGTLVTVSS,TTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,DYWGQGTLVTVSS,GAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTGGTCCAGCCTGGGG...,EVQLVESGGGLVQPGGSLKLSCAAS,GGGTTCACCTTCAGTGGCTCTGCT,GFTFSGSA,ATGCACTGGGTCCGCCAGGCTTCCGGGAAAGGGCTGGAGTGGGTTG...,MHWVRQASGKGLEWVGR,ATTAGAAGCAAAGCTAACAGTTACGCGACA,IRSKANSYAT,GCATATGCTGCGTCGGTGAAAGGCAGGTTCACCATCTCCAGAGATG...,AYAASVKGRFTISRDDSKNTAYLQMNSLKTEDTAVYYC,TGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA,WGQGTLVTVSS,ACTAGACATTGTAGTAGTACCAGCTGCTACTCCGTGGTTGACTAC,TRHCSSTSCYSVVDY,TGTACTAGACATTGTAGTAGTACCAGCTGCTACTCCGTGGTTGACT...,51.0,CTRHCSSTSCYSVVDYW,17.0,472.386,41.064,81.44,137S302M231S,439S6N21M210S4N,468S6N42M160S,4.92e-135,1.309e-07,1.9949999999999998e-19,100.0,100.0,100.0,138,439,1,302,440.0,460.0,7.0,27.0,469,510,7,48,138,212,213,236,237,287,288,317,318,431,477.0,509.0,432.0,476.0,,0,CTCCGTGG,8.0,CATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCAC...,IGHA,1,"{'fwh1': {'1 ': 'E', '2 ': 'V', '3 ': 'Q', '4 ...","|Deletions: 10, 
73||||",AAACCTGAGTCAATAG-1_contig_2,AGCTTCAGCTGTGGTAGAGAAGACAGGATTCAGGACAATCTCCAGC...,L,F,T,F,T,F,T,IGLV1-47*01,,IGLJ2*01,CAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,CAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNYVYWYQQLPGTAPK...,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNYVYWYQQLPGTAPK...,1,296,,,298,334,CAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNYVYWYQQLPGTAPK...,CAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNYVYWYQQLPGTAPK...,,,,,GTGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAG,VVFGGGTKLTVL,GTGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAG,VVFGGGTKLTVL,CAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGC...,QSVLTQPPSASGTPGQRVTISCSGS,AGCTCCAACATCGGAAGTAATTAT,SSNIGSNY,GTATACTGGTACCAGCAGCTCCCAGGAACGGCCCCCAAACTCCTCA...,VYWYQQLPGTAPKLLIY,AGGAATAAT,RNN,CAGCGGCCCTCAGGGGTCCCTGACCGATTCTCTGGCTCCAAGTCTG...,QRPSGVPDRFSGSKSGTSASLAISGLRSEDEADYYC,TTCGGCGGAGGGACCAAGCTGACCGTCCTA,FGGGTKLTVL,GCAGCATGGGATGACAGCCTGAGTGGTCCCGTGGTA,AAWDDSLSGPVV,TGTGCAGCATGGGATGACAGCCTGAGTGGTCCCGTGGTATTC,42.0,CAAWDDSLSGPVVF,14.0,463.037,,71.827,103S296M249S,,400S1N37M211S,3.0969999999999997e-132,,1.51e-16,100.0,,100.0,104,399,1,296,,,,,401,437,2,38,104,178,179,202,203,253,254,262,263,370,407.0,436.0,371.0,406.0,C,1,,,GTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTC...,Bulk,1,"{'fwl1': {'1 ': 'Q', '2 ': 'S', '3 ': 'V', '4 ...",|||||,7b1b379b-17fe-49b4-a884-1244a8df3626,d41e6c64-4537-45dd-91af-bc87b12c7ed9
1,AAACCTGCAACAACCT-1_contig_2,AGCTCTGAGAGCGGAGCCCCAGCCCCAGAATTCCCAGGTGTTTTCA...,H,F,T,F,T,F,T,IGHV3-72*01,IGHD6-19*01,IGHJ4*02,GAGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGAG...,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGAG...,EVQLVESGGGLVQPGGSLRLSCAASGFTFSDHYMEWVRQAPGKGLE...,EVQLVESGGGLVQPGGSLRLSCAASGFTFSDHYMDWVRQAPGKGLE...,1,301,310.0,315.0,320,361,GAGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGAG...,EVQLVESGGGLVQPGGSLRLSCAASGFTFSDHYMEWVRQAPGKGLE...,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGAG...,EVQLVESGGGLVQPGGSLRLSCAASGFTFSDHYMDWVRQAPGKGLE...,GGCTGG,GW,GGCTGG,GW,TTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,DYWGQGTLVTVSS,TTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,DYWGQGTLVTVSS,GAGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGAG...,EVQLVESGGGLVQPGGSLRLSCAAS,GGATTCACCTTCAGTGACCATTAC,GFTFSDHY,ATGGAGTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTG...,MEWVRQAPGKGLEWVGR,TCTAGAAACAAAGCTAACAGTTACACCACT,SRNKANSYTT,GCGTACGCCGCGTCTGTGCAAGGCAGATTCACCATCTCAAGAGATA...,AYAASVQGRFTISRDNSKNSLYLQMNSLKTEDTAVYYC,TGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA,WGQGTLVTVSS,GCTAGAGGGGGGCAAGGCTGGCCCCTTGACTAC,ARGGQGWPLDY,TGTGCTAGAGGGGGGCAAGGCTGGCCCCTTGACTACTGG,39.0,CARGGQGWPLDYW,13.0,442.782,12.223,81.44,137S301M131S1N,446S12N6M117S3N,456S6N42M71S,3.401e-126,53.3,1.6889999999999997e-19,97.01,100.0,100.0,138,438,1,301,447.0,452.0,13.0,18.0,457,498,7,48,138,212,213,236,237,287,288,317,318,431,465.0,497.0,432.0,464.0,GGGGGCAA,8,CCCC,4.0,GGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAA...,IGHM,1,"{'fwh1': {'1 ': 'E', '2 ': 'V', '3 ': 'Q', '4 ...","|Deletions: 10, 
73||||",AAACCTGCAACAACCT-1_contig_1,TGTGCTTCTATTTTCTTATATGGGGAGGAGTCAGTCTCAGTCAGGA...,K,F,T,F,T,F,T,IGKV1-39*01,,IGKJ1*01,GACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAG...,GACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAG...,DIQMTQSPSSLSASVGDRVTITCRASQSIGTYLNWYQQKPGKAPKL...,DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKL...,1,283,,,286,322,GACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAG...,DIQMTQSPSSLSASVGDRVTITCRASQSIGTYLNWYQQKPGKAPKL...,GACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAG...,DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKL...,,,,,TGGACGTTCGGCCAAGGGACCAGGGTGGACATCAAAC,WTFGQGTRVDIK,TGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAAC,WTFGQGTKVEIK,GACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAG...,DIQMTQSPSSLSASVGDRVTITCRAS,CAGAGCATTGGCACCTAT,QSIGTY,TTAAATTGGTATCAGCAGAAACCTGGGAAAGCCCCTAAACTCCTGA...,LNWYQQKPGKAPKLLIY,GCTGCATCC,AAS,AGTTTGCAAAGTGGGGTCCCATCAAGGTTCAGTGGCAGTGGATCTG...,SLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYC,TTCGGCCAAGGGACCAGGGTGGACATCAAA,FGQGTRVDIK,CAACAGAGTTACAGTACCCTTTGGACG,QQSYSTLWT,TGTCAACAGAGTTACAGTACCCTTTGGACGTTC,33.0,CQQSYSTLWTF,11.0,427.201,,60.291,118S283M175S4N,,403S1N37M136S,1.6790000000000001e-121,,3.978e-13,98.233,,94.595,119,401,1,283,,,,,404,440,2,38,119,196,197,214,215,265,266,274,275,382,410.0,439.0,383.0,409.0,TT,2,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1,"{'fwk1': {'1 ': 'D', '2 ': 'I', '3 ': 'Q', '4 ...",|||||,97dd0674-0377-4f4f-bac5-17ef119b74a7,ee22c25a-c8eb-4527-bd5c-4b8257e3d5e7
2,AAACCTGCACGTCTCT-1_contig_2,AGCTCTGAGAGAGGAGCCCAGCCCTGGGATTTTCAGGTGTGTTCAC...,H,F,T,F,T,F,T,IGHV3-23*01,IGHD6-13*01,IGHJ5*02,GAGGTGCAGTTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSVAMTWVRQAPGKGLE...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,1,295,297.0,304.0,320,361,GAGGTGCAGTTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSVAMTWVRQAPGKGLE...,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,GGGTATAG,GI,GGGTATAG,GI,TCGACCCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,DPWGQGTLVTVSS,TCGACCCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,DPWGQGTLVTVSS,GAGGTGCAGTTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQPGGSLRLSCAAS,GGATTCACCTTTAGCAGCGTTGCC,GFTFSSVA,ATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCG...,MTWVRQAPGKGLEWVAS,ATTGGTGAAAGTGGTGATACA,IGESGDT,TACAACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACA...,YNADSVKGRFTISRDNSKDTLFLHMHSLGVEDTALYYC,TGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA,WGQGTLVTVSS,GTGAAAGGGGGTATAGGGAGGGTCCAGGGAATCGACCCC,VKGGIGRVQGIDP,TGTGTGAAAGGGGGTATAGGGAGGGTCCAGGGAATCGACCCCTGG,45.0,CVKGGIGRVQGIDPW,15.0,377.341,16.069,81.44,136S168M3D124M248S1N,429S8M239S13N,452S9N42M182S,2.0290000000000002e-106,4.418,2.013e-19,91.525,100.0,100.0,137,428,1,295,430.0,437.0,1.0,8.0,453,494,10,51,137,211,212,235,236,286,287,307,308,421,461.0,493.0,422.0,460.0,G,1,GGAGGGTCCAGGGAA,15.0,CCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAG...,IGHG,1,"{'fwh1': {'1 ': 'E', '2 ': 'V', '3 ': 'Q', '4 ...","|Deletions: 10, 
73||||",AAACCTGCACGTCTCT-1_contig_1,TGAGCGCAGAAGGCAGGACTCGAGACAATCTTCATCATGACCTGCT...,L,F,T,F,T,F,T,IGLV1-51*01,,IGLJ3*02,CAGTCTGTGTTGACGCAGCCGCCCTCAGTGTCTGCGGCCCCGGGAC...,CAGTCTGTGTTGACGCAGCCGCCCTCAGTGTCTGCGGCCCCAGGAC...,QSVLTQPPSVSAAPGQKVTISCSGASSNIGSKTVSWYQQLPGTAPR...,QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNYVSWYQQLPGTAPK...,1,296,,,297,331,CAGTCTGTGTTGACGCAGCCGCCCTCAGTGTCTGCGGCCCCGGGAC...,QSVLTQPPSVSAAPGQKVTISCSGASSNIGSKTVSWYQQLPGTAPR...,CAGTCTGTGTTGACGCAGCCGCCCTCAGTGTCTGCGGCCCCAGGAC...,QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNYVSWYQQLPGTAPK...,,,,,GGTGTTCGGCGGAGGGACCAAGGTGACCGTCCTAG,VFGGGTKVTVL,GGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,VFGGGTKLTVL,CAGTCTGTGTTGACGCAGCCGCCCTCAGTGTCTGCGGCCCCGGGAC...,QSVLTQPPSVSAAPGQKVTISCSGA,AGCTCCAACATTGGGAGTAAGACT,SSNIGSKT,GTATCGTGGTACCAGCAACTCCCAGGAACAGCCCCCAGACTCCTCA...,VSWYQQLPGTAPRLLIY,GACAATAAT,DNN,AAGCGACCCTCAGGGATTCCTGACCGATTCTCTGGCTCCAAGTCTG...,KRPSGIPDRFSGSKSGTSATLDITGLQTGDEADYYC,TTCGGCGGAGGGACCAAGGTGACCGTCCTA,FGGGTKVTVL,GGAGCATGGGATATCAGGCTCAATGGTGGGGTG,GAWDIRLNGGV,TGCGGAGCATGGGATATCAGGCTCAATGGTGGGGTGTTC,39.0,CGAWDIRLNGGVF,13.0,406.945,,62.213,93S296M246S,,389S3N35M211S,2.329e-115,,1.159e-13,93.919,,97.143,94,389,1,296,,,,,390,424,4,38,94,168,169,192,193,243,244,252,253,360,394.0,423.0,361.0,393.0,,0,,,GTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTC...,Bulk,1,"{'fwl1': {'1 ': 'Q', '2 ': 'S', '3 ': 'V', '4 ...",|||||,4453dcc7-2b5f-4fbd-a816-3f5fcdb6e225,e423ab9e-ee27-48ce-a091-63a7d9052060
3,AAACCTGGTCGACTAT-1_contig_2,GCTCTGAGAGAGGAGCCCAGCCCTGGGATTTTCAGGTGTTTTCATT...,H,F,T,F,T,F,T,IGHV3-23*01,IGHD5-12*01,IGHJ4*02,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCGGGGGG...,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQRGGSLRLSCAASRFTFSSYAMTWVRQAPGKGLE...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,1,296,299.0,313.0,321,364,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCGGGGGG...,EVQLLESGGGLVQRGGSLRLSCAASRFTFSSYAMTWVRQAPGKGLE...,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGG...,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,GATATAGTGGCCACG,YSGH,GATATAGTGGCTACG,YSGY,CTTTGACTACTGGGGCCAGGGAACCCTAGTCACCGTCTCCTCAG,FDYWGQGTLVTVSS,CTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,FDYWGQGTLVTVSS,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCGGGGGG...,EVQLLESGGGLVQRGGSLRLSCAAS,AGATTCACCTTTAGCAGCTATGCC,RFTFSSYA,ATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCT...,MTWVRQAPGKGLEWVSS,GTTAGTGCTAGTGGTGGTAGCACC,VSASGGST,TACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGGGACA...,YYADSVKGRFTISRDNSKNTLFLQMNSLRGEDTAVYFC,TGGGGCCAGGGAACCCTAGTCACCGTCTCCTCA,WGQGTLVTVSS,GCGAGAGATAGATATAGTGGCCACGGCGGACTCTTTGACTAC,ARDRYSGHGGLFDY,TGTGCGAGAGATAGATATAGTGGCCACGGCGGACTCTTTGACTACTGG,48.0,CARDRYSGHGGLFDYW,16.0,419.41,23.76,79.518,135S296M71S,433S3N15M54S5N,455S4N44M3S,3.235e-119,0.0158,5.634999999999999e-19,95.27,93.333,97.727,136,431,1,296,434.0,448.0,4.0,18.0,456,499,5,48,136,210,211,234,235,285,286,309,310,423,466.0,498.0,424.0,465.0,TA,2,GCGGACT,7.0,CAT,Bulk,1,"{'fwh1': {'1 ': 'E', '2 ': 'V', '3 ': 'Q', '4 ...","|Deletions: 10, 
73||||",AAACCTGGTCGACTAT-1_contig_1,AATCTAGGTGATGGTGAGACAAGAGGACACAGGGGTTAAATTCTGT...,K,F,T,F,T,F,T,IGKV3-20*01,,IGKJ1*01,GAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAG...,GAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAG...,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKAGQAPR...,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPR...,1,287,,,288,325,GAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAG...,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKAGQAPR...,GAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAG...,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPR...,,,,,GTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAAC,WTFGQGTKVEIK,GTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAAC,WTFGQGTKVEIK,GAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAG...,EIVLTQSPGTLSLSPGERATLSCRAS,CAGAGTGTTAGCAGCAGCTAC,QSVSSSY,TTAGCCTGGTACCAGCAGAAAGCTGGCCAGGCTCCCAGGCTCCTCA...,LAWYQQKAGQAPRLLIY,GAAGCATCC,EAS,AGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGGAGTGGGTCTG...,SRATGIPDRFSGSGSGADFTLAISRLEPEDFAVYYC,TTCGGCCAAGGGACCAAGGTGGAAATCAAA,FGQGTKVEIK,CAACAGTATGGTAGTTCACCGTGGACG,QQYGSSPWT,TGTCAACAGTATGGTAGTTCACCGTGGACGTTC,33.0,CQQYGSSPWTF,11.0,414.736,,73.749,344S287M174S3N,,631S38M136S,1.347e-117,,4.9650000000000003e-17,96.167,,100.0,345,631,1,287,,,,,632,669,1,38,345,422,423,443,444,494,495,503,504,611,639.0,668.0,612.0,638.0,,0,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1,"{'fwk1': {'1 ': 'E', '2 ': 'I', '3 ': 'V', '4 ...",|||||,1c8c15a5-1d25-4ab3-9f56-9db700d43d5b,9cf559de-e470-4882-979f-0cdb953a6fea
4,AAACCTGGTCGGCTCA-1_contig_2,CGAGCCCAGCACTGGAAGTCGCCGGTGTTTCCATTCGGTGATCATC...,H,F,T,F,T,F,T,IGHV3-30*04,IGHD5-12*01,IGHJ4*01,CAGGTTCACCTGGTGGAGTCTGGGGGGCGCGAGGTCCAGCCAGGGG...,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGA...,QVHLVESGGREVQPGAPLRLSCAASGFTFALSPMGWVRQAPGQGLE...,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYAMHWVRQAPGKGLE...,1,296,305.0,310.0,313,355,CAGGTTCACCTGGTGGAGTCTGGGGGGCGCGAGGTCCAGCCAGGGG...,QVHLVESGGREVQPGAPLRLSCAASGFTFALSPMGWVRQAPGQGLE...,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGA...,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYAMHWVRQAPGKGLE...,GTGGCT,G,GTGGCT,G,TTTGACTACTGGGGCCACGGAACCCTGGTCACCTTCTCTTCAG,FDYWGHGTLVTFSS,TTTGACTACTGGGGCCAAGGAACCCTGGTCACCGTCTCCTCAG,FDYWGQGTLVTVSS,CAGGTTCACCTGGTGGAGTCTGGGGGGCGCGAGGTCCAGCCAGGGG...,QVHLVESGGREVQPGAPLRLSCAAS,GGATTCACCTTCGCTCTCTCTCCT,GFTFALSP,ATGGGCTGGGTCCGCCAGGCTCCAGGCCAGGGGCTGGAGTGGGTGG...,MGWVRQAPGQGLEWVAF,ATTTCGTCTAATGCAAAGACTGAC,ISSNAKTD,AGCTATGCAGACTCCGTGAGGGGCCGATTCACCATCTCCAGAGACA...,SYADSVRGRFTISRDNYKNTLYLQMNSLSVEDTAVYYC,TGGGGCCACGGAACCCTGGTCACCTTCTCTTCA,WGHGTLVTFSS,GCGAGAGATGCCGAGGGTGGCTCGTTTGACTAC,ARDAEGGSFDY,TGTGCGAGAGATGCCGAGGGTGGCTCGTTTGACTACTGG,39.0,CARDAEGGSFDYW,13.0,335.272,12.223,66.059,124S296M161S,428S9N6M147S8N,436S5N43M102S,7.987e-94,54.45,7.365e-15,86.149,100.0,93.023,125,420,1,296,429.0,434.0,10.0,15.0,437,479,6,48,125,199,200,223,224,274,275,298,299,412,446.0,478.0,413.0,445.0,TGCCGAGG,8,CG,2.0,CACCCACCAAGGCTCCGGATGTGTTCCCCATCATATCAGGGTGCAG...,IGHD,1,"{'fwh1': {'1 ': 'Q', '2 ': 'V', '3 ': 'H', '4 ...","|Deletions: 10, 
73||||",AAACCTGGTCGGCTCA-1_contig_1,AGTCTGGGCCTAAGGAAGCAGCACTGGTGGTGCCTCAGCCATGGCC...,L,F,T,F,T,F,T,IGLV3-21*04,,IGLJ2*01,TCCTATGTGCTGACTCAGCCACCCTCAGTGTCAGTGGCCCCAGGAA...,TCCTATGTGCTGACTCAGCCACCCTCAGTGTCAGTGGCCCCAGGAA...,SYVLTQPPSVSVAPGKTATITCGGDNIGREGVHWYQQRPGQAPVRV...,SYVLTQPPSVSVAPGKTARITCGGNNIGSKSVHWYQQKPGQAPVLV...,1,273,,,294,324,TCCTATGTGCTGACTCAGCCACCCTCAGTGTCAGTGGCCCCAGGAA...,SYVLTQPPSVSVAPGKTATITCGGDNIGREGVHWYQQRPGQAPVRV...,TCCTATGTGCTGACTCAGCCACCCTCAGTGTCAGTGGCCCCAGGAA...,SYVLTQPPSVSVAPGKTARITCGGNNIGSKSVHWYQQKPGQAPVLV...,,,,,ATTCGGCGGAGGGACCCACCTGACCGTCCTA,FGGGTHLTVL,ATTCGGCGGAGGGACCAAGCTGACCGTCCTA,FGGGTKLTVL,TCCTATGTGCTGACTCAGCCACCCTCAGTGTCAGTGGCCCCAGGAA...,SYVLTQPPSVSVAPGKTATITCGGD,AACATTGGACGTGAAGGT,NIGREG,GTGCACTGGTACCAGCAGAGGCCAGGCCAGGCCCCTGTGCGGGTCA...,VHWYQQRPGQAPVRVMY,TATAGTAGC,YSS,GACCGGCCCTCAGGGATCCCTGACCGATTCTCTGGCTCCAAGTCTG...,DRPSGIPDRFSGSKSGNMATLTISRVEAGDEADYYC,TTCGGCGGAGGGACCCACCTGACCGTCCTA,FGGGTHLTVL,CAGGTGTGGGATGCTGTTCTTGAGAGTCGGTCA,QVWDAVLESRS,TGTCAGGTGTGGGATGCTGTTCTTGAGAGTCGGTCATTC,39.0,CQVWDAVLESRSF,13.0,374.225,,48.755,97S273M263S17N,,390S6N31M212S1N,1.643e-105,,1.301e-09,93.773,,93.548,98,370,1,273,,,,,391,421,7,37,98,172,173,190,191,241,242,250,251,358,392.0,421.0,359.0,391.0,GCTGTTCTTGAGAGTCGGTC,20,,,AGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCT...,Bulk,1,"{'fwl1': {'1 ': 'S', '2 ': 'Y', '3 ': 'V', '4 ...",|||||,fe48eb90-779a-4d69-834c-69e7672b05ef,6a7195a2-3a3b-43d8-9b29-c443abd8b547


In [8]:
# NOTE(review): the stated goal was for the processed paired frame to have the same number
# of columns as the processed unpaired frame, but the output below is 200: the 198 raw
# paired columns plus antibody_uid and sequence_uid. The paired columns have evidently not
# been collapsed to the unpaired layout yet.
len(paired_df.columns)

200

# Difference Between Paired and Unpaired Columns

There are at least two columns that don't exist in the Unpaired df that do exist in the Paired df. Raw unpaired df has 97 columns. Doubling that for the paired should yield 194 columns. However, there are 198 columns in the raw Paired df. 

One of those columns is the "Isotype" column, as that can't be stored in the metadata because there are two chains and thus two isotypes. This will be remedied as we want each sequence to also contain Isotype data.

Trying to find the missing column:

In [7]:
# These paths repeat the ones assigned in the Unpaired/Paired sections above
# (presumably so this cell can run without them — confirm).
heavy_data_file = '/export/share/cameronhu/oas/unpaired/unpaired_human/unpaired_human_heavy/1279049_1_Heavy_Bulk.csv.gz'
paired_data_file = '/export/share/cameronhu/oas/paired/paired_human/1_S1__1_Paired_All.csv.gz'

# Raw (unprocessed) frames; header=1 takes the file's second line as the column names.
heavy_data_raw = pd.read_csv(heavy_data_file, header=1)
paired_data_raw = pd.read_csv(paired_data_file, header=1)

def get_unique_columns_from_paired(paired_df):
    """
    Return the set of base column names of a Paired dataframe.

    One trailing '_heavy' or '_light' chain suffix is removed from each column
    name, so 'v_call_heavy' and 'v_call_light' both collapse to 'v_call'.
    Only a trailing suffix is stripped: a name that merely contains '_heavy'
    or '_light' elsewhere, or that has no chain suffix, is kept unchanged.

    Parameters
    ----------
    paired_df : object exposing ``.columns`` as an iterable of string labels
        (e.g. a pandas DataFrame).

    Returns
    -------
    set of str
        The de-duplicated base column names.
    """
    chain_suffixes = ('_heavy', '_light')

    def strip_chain_suffix(col):
        # Slice rather than str.replace so that only the end of the name is affected.
        for suffix in chain_suffixes:
            if col.endswith(suffix):
                return col[:-len(suffix)]
        return col

    return {strip_chain_suffix(col) for col in paired_df.columns}

def compare_columns(unpaired_df, paired_df):
    """
    Report which Paired column names have no counterpart in the Unpaired dataframe.

    The Paired names are first passed through get_unique_columns_from_paired,
    and the result is compared against the Unpaired column names.

    Returns the set of names found for Paired but absent from Unpaired.
    """
    # Materialise the Unpaired names first, then subtract them from the Paired base names.
    unpaired_names = frozenset(unpaired_df.columns)
    return get_unique_columns_from_paired(paired_df).difference(unpaired_names)

# Example usage: list the Paired-only column names for the two raw frames loaded above.
missing_columns = compare_columns(heavy_data_raw, paired_data_raw)

print("Columns missing from Unpaired but present in Paired:")
# missing_columns is a set of strings, whose iteration order can differ between kernel
# sessions; sort so that re-running the notebook prints the names in a stable order.
for col in sorted(missing_columns):
    print(col)


Columns missing from Unpaired but present in Paired:
sequence_id
Isotype


# Analysis of Sequence Data

This section examines the difference between the 'sequence' and 'sequence_alignment' fields. I presume 'sequence' is the raw sequence from the FASTA, and 'sequence_alignment' is the nucleotide (NT) output from IgBLAST.

In [None]:
# Depends on heavy_sequence_df built in the Unpaired Data section above.
# Take the 'sequence' value at index label 0, print its length, then display the full string.
sequence = heavy_sequence_df['sequence'][0]
print(len(sequence))
sequence


519


'AGCTCTGAGAGAGGAGCCCAGCCCTGGGATTTTCAGGTGTTTTCATTTGGTGATCAGGACTGAACAGAGAGAACTCACCATGGAGTTTGGGCTGAGCTGGCTTTTTCTTGTGGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGCTATTAGTGGTAGTGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTATATTACTGTGCGCGAAATTACTATGATAGTAGTGGTTATATTACTAACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCACCCACCAAGGC'

In [None]:
# Same row as `sequence` above, but the 'sequence_alignment' field; print its length,
# then display the full string for comparison.
sequence_alignment = heavy_sequence_df['sequence_alignment'][0]
print(len(sequence_alignment))
sequence_alignment

370


'GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGCTATTAGTGGTAGTGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTATATTACTGTGCGCGAAATTACTATGATAGTAGTGGTTATATTACTAACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG'

In [None]:
aligner = PairwiseAligner()
# Local mode: look for the best-scoring matching region instead of forcing an end-to-end alignment.
aligner.mode = 'local'
# Gap penalties used inside the aligned region.
# NOTE(review): the unaligned 5' flank of the target is presumably left unpenalised by local
# mode itself rather than by these two scores — confirm against the Biopython documentation.
aligner.open_gap_score = -2.0        # Penalty for opening a gap
aligner.extend_gap_score = -0.5      # Penalty for extending a gap

# target = raw 'sequence', query = 'sequence_alignment' (both taken from the cells above).
alignments = aligner.align(sequence, sequence_alignment)
# Show the first alignment and its score.
print(alignments[0])
print(alignments[0].score)

target          136 GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGACTC
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query             0 GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGACTC

target          196 TCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCT
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query            60 TCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCT

target          256 CCAGGGAAGGGGCTGGAGTGGGTCTCAGCTATTAGTGGTAGTGGTGGTAGCACATACTAC
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           120 CCAGGGAAGGGGCTGGAGTGGGTCTCAGCTATTAGTGGTAGTGGTGGTAGCACATACTAC

target          316 GCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTAT
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           180 GCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTAT

target          376 CTGC

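The 'sequence_alignment' values from IgBLAST are indeed a subsequence of the initial 'sequence' column: in the alignment above, the query matches the raw sequence starting at target position 136. Raw sequences may include leader peptides not included in the final form of the variable chain.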