# Comp 3550 bioinformatics project

## Introduction

some stuff

# Program

## Libary setup

In [22]:

# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import SeqIO
import csv, os


## Functions

### Functions to convert Fasta Files to CSV

In [23]:
# Functions to convert fasta files to csv

def convertFastaToCsv(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
        csv_writer = csv.writer(outfile)
        csv_writer.writerow(['PrimerPair', 'Direction', 'Sequence'])
        
        for line in infile:
            if line.strip() == '':
                continue
            if line.startswith('>'):
                parts = line[1:].strip().split('_')
                index = parts[1] if len(parts) > 2 else '1'
                directionLetter = parts[2].split(" ")[0] if len(parts) > 2 else parts[1].split(" ")[0]
                if directionLetter == 'F':
                    direction = 'forward'
                else:
                    direction = 'reverse'
            else:
                sequence = line.strip()
                csv_writer.writerow([index, direction, sequence])


In [24]:
def convertFastaFolderToCsv(input_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(input_folder):
        if filename.endswith('.fas'):
            input_file = os.path.join(input_folder, filename)
            output_file = os.path.join(output_folder, f'{filename}.csv')
            
            # Check if output file already exists
            if os.path.exists(output_file):
                output_file = os.path.join(output_folder, f'{filename}.csv')
                continue
            
            convertFastaToCsv(input_file, output_file)

### Functions for manipulating DataFrames to have the same shape

#### Manipulating Geneious DataFrames

##### Group Dataframe

In [25]:
def reshapeGroupGeneious(dataframe):
    df = dataframe
    
    # Create a mapping of primers to their paired primers
    primer_pairs = {}
    for index, row in df.iterrows():
        primer_pairs[row['Name']] = row['Paired Primer']
        primer_pairs[row['Paired Primer']] = row['Name']

    # Assign primer pair index
    df['PrimerPair'] = None
    current_index = 1
    for primer in primer_pairs:
        if df.loc[df['Name'] == primer, 'PrimerPair'].isnull().all():
            df.loc[df['Name'] == primer, 'PrimerPair'] = current_index
            df.loc[df['Name'] == primer_pairs[primer], 'PrimerPair'] = current_index
            current_index += 1

    
    df = df.rename(columns={
        'Sequence (with extension)': 'Sequence',
        })
    df = df[['PrimerPair',  'Direction', 'Sequence', 'Length (with extension)', 'Paired Primer','Name', 'Minimum', 'Maximum', '# Local Off-target Sites', '%GC', 'Hairpin Tm', 'Self Dimer Tm', 'Tm']]
    return df
    
    

##### Species Dataframe

#### Manipulating PrimerQuest DataFrames

##### Group Dataframe

In [26]:
def reshapeGroupPrimerQuest(dataframe):
    df = dataframe
    # Extract primer pair number from AssaySet
    df['PrimerPair'] = df['AssaySet'].str.extract(r'Assay Set (\d+)').astype(int)

    # Rename Type to Direction and convert to lowercase
    df = df.rename(columns={'Type': 'Direction'})
    df['Direction'] = df['Direction'].str.replace(' Primer', '').str.lower()

    # Drop the old AssaySet column
    df = df.drop(columns=['AssaySet'])

    # Reorder columns
    df = df[['PrimerPair', 'Direction', 'Sequence', 'Start', 'Length', 'Tm', 'GC Percent', 'Amplicon']]
    
    return df


##### Species Dataframe

#### Manipulating PrimerBlast DataFrames

##### Group Dataframe

In [27]:
def reshapeGroupPrimerBlast(dataframe):
    

    # Assuming your DataFrame is named df
    df = dataframe

    # Melt the DataFrame to long format
    df_long = pd.melt(df, id_vars=['Primer pair #', 'Product length'], 
                    value_vars=[col for col in df.columns if col.startswith('Forward') or col.startswith('Reverse')],
                    var_name='Attribute', value_name='Value')

    # Extract primer direction and attribute name
    df_long['Direction'] = df_long['Attribute'].apply(lambda x: 'Forward' if 'Forward' in x else 'Reverse')
    df_long['Attribute'] = df_long['Attribute'].apply(lambda x: x.replace('Forward primer ', '').replace('Reverse primer ', ''))

    # Pivot the DataFrame back to wide format
    df_wide = df_long.pivot_table(index=['Primer pair #', 'Product length', 'Direction'], 
                                columns='Attribute', values='Value', aggfunc='first').reset_index()

    # Reorder columns
    df_wide = df_wide[['Primer pair #', 'Direction', 'Sequence (5\'->3\')', 'Template strand', 'Length', 'Start', 'Stop', 'Tm', 'GC%', 'Self complementarity', 'Self 3\' complementarity', 'Product length']]

    df_wide = df_wide.rename(columns={
        'Sequence (5\'->3\')': 'Sequence',
        'Primer pair #': 'PrimerPair'
    })
    
    df_wide['Direction'] = df_wide['Direction'].str.lower()

    return df_wide

##### Species Dataframe

#### Manipulating Primer3 DataFrames

##### Group Dataframe

##### Species Dataframe

### Function to make Blast Requests

## Main Program

### Data Setup


#### Convert to CSV Files for use with Pandas

In [28]:
foldersToConvert = ['data/group/primer3','data/species/primer3']

for folder in foldersToConvert:
    convertFastaFolderToCsv(folder, folder)

#### Initialize Group DataFrames

In [29]:
# Setup dataframes
groupGeneous = pd.read_csv('data/group/geneious/Entero grp-specific primers (GENEIOUS).csv')
groupPrimer3 = pd.read_csv('data/group/primer3/EnterobactericaeaP3.fas.csv')
groupPrimerBlast = pd.read_csv('data/group/primerBlast/Entero grp-specific primers (BLAST).csv')
groupPrimerQuest = pd.read_csv('Data/group/primerQuest/PrimerQuest family-specific.csv')

# Store dataframes in dictionary for dynamic usage and access later
groupData = {
    'Geneious': groupGeneous,
    'Primer3': groupPrimer3,
    'PrimerBlast': groupPrimerBlast,
    'PrimerQuest': groupPrimerQuest
}

##### Investigate shape of data frames


In [30]:
groupData['Geneious'].head()

Unnamed: 0,Name,Minimum,Maximum,Direction,Sequence (with extension),Length (with extension),Paired Primer,# Local Off-target Sites,%GC,Hairpin Tm,Self Dimer Tm,Tm
0,934 F,934,953,forward,GGGCTTGACATACACCGGAA,20,"1,130 R",0,55.0,,9.5,60.0
1,916 F,916,935,forward,ACGCGAAGAACCTTACCTGG,20,"1,131 R",0,55.0,,15.9,60.0
2,427 F,427,446,forward,ACGGTACCTGCAGAAGAAGC,20,953 R,0,55.0,,17.4,60.0
3,275 F,275,294,forward,CCACACTGGGACTGAGACAC,20,"1,182 R",0,60.0,32.7,,60.0
4,"1,182 R",1163,1182,reverse,TAGCATGTGTGAAGCCCTGG,20,275 F,0,55.0,,,60.0


In [31]:
groupData['Primer3'].head()

Unnamed: 0,PrimerPair,Direction,Sequence
0,1,forward,TGCCTGATGGAGGGGGATAA
1,1,reverse,GAGACTCAAGCCTGCCAGTT
2,2,forward,GGAACTGAGACACGGTCCAG
3,2,reverse,TTTAACCTTGCGGCCGTACT
4,3,forward,TGGTAGTCCACGCCGTAAAC


In [32]:
groupData['PrimerBlast'].head()

Unnamed: 0,Primer pair #,Forward primer Sequence (5'->3'),Forward primer Template strand,Forward primer Length,Forward primer Start,Forward primer Stop,Forward primer Tm,Forward primer GC%,Forward primer Self complementarity,Forward primer Self 3' complementarity,Reverse primer Sequence (5'->3'),Reverse primer Template strand,Reverse primer Length,Reverse primer Start,Reverse primer Stop,Reverse primer Tm,Reverse primer GC%,Reverse primer Self complementarity,Reverse primer Self 3' complementarity,Product length
0,1,AAGAAGCACCGGCTAACTCC,Plus,20.0,496.0,515.0,60.04,55.0,4.0,0.0,TTCACAACACGAGCTGACGA,Minus,20.0,1082.0,1063.0,59.9,50.0,4.0,2.0,587.0
1,2,TATTGCACAATGGGCGCAAG,Plus,20.0,366.0,385.0,59.83,50.0,6.0,2.0,GGAGTTAGCCGGTGCTTCTT,Minus,20.0,515.0,496.0,60.04,55.0,4.0,0.0,150.0
2,3,CGGTAATACGGAGGGTGCAA,Plus,20.0,529.0,548.0,59.82,55.0,5.0,2.0,GTTCTTCGCGTTGCATCGAA,Minus,20.0,980.0,961.0,59.84,50.0,6.0,2.0,452.0
3,4,TTCGATGCAACGCGAAGAAC,Plus,20.0,961.0,980.0,59.84,50.0,6.0,0.0,TTGCGGGACTTAACCCAACA,Minus,20.0,1103.0,1084.0,59.82,50.0,4.0,2.0,143.0
4,5,TCGTCAGCTCGTGTTGTGAA,Plus,20.0,1063.0,1082.0,59.9,50.0,4.0,1.0,TTGTAGCACGTGTGTAGCCC,Minus,20.0,1240.0,1221.0,60.32,55.0,6.0,0.0,178.0


In [33]:
groupData['PrimerQuest']

Unnamed: 0,AssaySet,Type,Sequence,Start,Length,Tm,GC Percent,Amplicon
0,Assay Set 1 (Consensus),Forward Primer,GATGTGCCCAGATGGGATTAG,220.0,21.0,62.233,52.381,
1,Assay Set 1 (Consensus),Reverse Primer,GTGGATGTCAAGACCAGGTAAG,994.0,22.0,62.205,50.000,
2,Assay Set 1 (Consensus),Product,,,,,,775.0
3,Assay Set 2 (Consensus),Forward Primer,ACGGTAGCTAATACCGCATAAC,155.0,22.0,61.972,45.455,
4,Assay Set 2 (Consensus),Reverse Primer,CCTCCAGATCTCTACGCATTTC,707.0,22.0,62.034,50.000,
...,...,...,...,...,...,...,...,...
994,,,,,,,,
995,,,,,,,,
996,,,,,,,,
997,,,,,,,,


##### Investigate dataframes

In [34]:
groupData['Geneious'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Name                       10 non-null     object 
 1   Minimum                    10 non-null     int64  
 2   Maximum                    10 non-null     int64  
 3   Direction                  10 non-null     object 
 4   Sequence (with extension)  10 non-null     object 
 5   Length (with extension)    10 non-null     int64  
 6   Paired Primer              10 non-null     object 
 7   # Local Off-target Sites   10 non-null     int64  
 8   %GC                        10 non-null     float64
 9   Hairpin Tm                 3 non-null      float64
 10  Self Dimer Tm              3 non-null      float64
 11  Tm                         10 non-null     float64
dtypes: float64(4), int64(4), object(4)
memory usage: 1.1+ KB


In [35]:
groupData['Primer3'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PrimerPair  20 non-null     int64 
 1   Direction   20 non-null     object
 2   Sequence    20 non-null     object
dtypes: int64(1), object(2)
memory usage: 612.0+ bytes


In [36]:
groupData['PrimerBlast'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 20 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Primer pair #                           11 non-null     object 
 1   Forward primer Sequence (5'->3')        10 non-null     object 
 2   Forward primer Template strand          10 non-null     object 
 3   Forward primer Length                   10 non-null     float64
 4   Forward primer Start                    10 non-null     float64
 5   Forward primer Stop                     10 non-null     float64
 6   Forward primer Tm                       10 non-null     float64
 7   Forward primer GC%                      10 non-null     float64
 8   Forward primer Self complementarity     10 non-null     float64
 9   Forward primer Self 3' complementarity  10 non-null     float64
 10  Reverse primer Sequence (5'->3')        10 non-null     object 


In [37]:
groupData['PrimerQuest'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   AssaySet    15 non-null     object 
 1   Type        15 non-null     object 
 2   Sequence    10 non-null     object 
 3   Start       10 non-null     float64
 4   Length      10 non-null     float64
 5   Tm          10 non-null     float64
 6   GC Percent  10 non-null     float64
 7   Amplicon    5 non-null      float64
dtypes: float64(5), object(3)
memory usage: 62.6+ KB


##### Clean up dataframes

In [38]:
groupData['PrimerBlast'].dropna(inplace=True, subset=['Forward primer Template strand'])
groupData['PrimerBlast'].reset_index().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 21 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   index                                   10 non-null     int64  
 1   Primer pair #                           10 non-null     object 
 2   Forward primer Sequence (5'->3')        10 non-null     object 
 3   Forward primer Template strand          10 non-null     object 
 4   Forward primer Length                   10 non-null     float64
 5   Forward primer Start                    10 non-null     float64
 6   Forward primer Stop                     10 non-null     float64
 7   Forward primer Tm                       10 non-null     float64
 8   Forward primer GC%                      10 non-null     float64
 9   Forward primer Self complementarity     10 non-null     float64
 10  Forward primer Self 3' complementarity  10 non-null     float64
 

In [39]:
groupData['PrimerQuest'].dropna(inplace=True, subset=['Sequence'])
groupData['PrimerQuest'].reset_index().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   index       10 non-null     int64  
 1   AssaySet    10 non-null     object 
 2   Type        10 non-null     object 
 3   Sequence    10 non-null     object 
 4   Start       10 non-null     float64
 5   Length      10 non-null     float64
 6   Tm          10 non-null     float64
 7   GC Percent  10 non-null     float64
 8   Amplicon    0 non-null      float64
dtypes: float64(5), int64(1), object(3)
memory usage: 852.0+ bytes


##### Data Manipulation to have the same shape/column names 

In [40]:
groupData['PrimerBlast'] = reshapeGroupPrimerBlast(groupData['PrimerBlast'])
groupData['PrimerBlast'].head()


Attribute,PrimerPair,Direction,Sequence,Template strand,Length,Start,Stop,Tm,GC%,Self complementarity,Self 3' complementarity,Product length
0,1,forward,AAGAAGCACCGGCTAACTCC,Plus,20.0,496.0,515.0,60.04,55.0,4.0,0.0,587.0
1,1,reverse,TTCACAACACGAGCTGACGA,Minus,20.0,1082.0,1063.0,59.9,50.0,4.0,2.0,587.0
2,10,forward,GCAGAAGAAGCACCGGCTAA,Plus,20.0,492.0,511.0,60.67,55.0,4.0,1.0,89.0
3,10,reverse,TGCGCTTTACGCCCAGTAAT,Minus,20.0,580.0,561.0,60.39,50.0,5.0,2.0,89.0
4,2,forward,TATTGCACAATGGGCGCAAG,Plus,20.0,366.0,385.0,59.83,50.0,6.0,2.0,150.0


In [41]:
groupData['Geneious'] = reshapeGroupGeneious(groupData['Geneious'])
groupData['Geneious'].head()

Unnamed: 0,PrimerPair,Direction,Sequence,Length (with extension),Paired Primer,Name,Minimum,Maximum,# Local Off-target Sites,%GC,Hairpin Tm,Self Dimer Tm,Tm
0,1,forward,GGGCTTGACATACACCGGAA,20,"1,130 R",934 F,934,953,0,55.0,,9.5,60.0
1,2,forward,ACGCGAAGAACCTTACCTGG,20,"1,131 R",916 F,916,935,0,55.0,,15.9,60.0
2,3,forward,ACGGTACCTGCAGAAGAAGC,20,953 R,427 F,427,446,0,55.0,,17.4,60.0
3,4,forward,CCACACTGGGACTGAGACAC,20,"1,182 R",275 F,275,294,0,60.0,32.7,,60.0
4,4,reverse,TAGCATGTGTGAAGCCCTGG,20,275 F,"1,182 R",1163,1182,0,55.0,,,60.0


In [42]:
groupData['PrimerQuest'] = reshapeGroupPrimerQuest(groupData['PrimerQuest'])
groupData['PrimerQuest'].head()

Unnamed: 0,PrimerPair,Direction,Sequence,Start,Length,Tm,GC Percent,Amplicon
0,1,forward,GATGTGCCCAGATGGGATTAG,220.0,21.0,62.233,52.381,
1,1,reverse,GTGGATGTCAAGACCAGGTAAG,994.0,22.0,62.205,50.0,
3,2,forward,ACGGTAGCTAATACCGCATAAC,155.0,22.0,61.972,45.455,
4,2,reverse,CCTCCAGATCTCTACGCATTTC,707.0,22.0,62.034,50.0,
6,3,forward,TAGCGGTGAAATGCGTAGAG,679.0,20.0,61.797,50.0,


##### Initialize Species DataFrames