# Nanopore S3 Directory

In [None]:
s3cmd get --recursive s3://aretian-genomics/nanopore/

# Reference Genome

Info: https://lh3.github.io/2017/11/13/which-human-reference-genome-to-use

In [None]:
# Pull reference genome from S3
# (gzipped GRCh38 "no_alt_analysis_set" FASTA; the following cells decompress and index it)
!s3cmd get s3://aretian-genomics/nanopore/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz

In [None]:
# Decompress the reference FASTA; the cells below work on the resulting .fna file
!gunzip GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz

In [None]:
# Index reference genome
# (presumably creates the .fai index that the region extraction in the next cell relies on — confirm)
!samtools faidx GCA_000001405.15_GRCh38_no_alt_analysis_set.fna

In [None]:
# Select chromosomes or regions
# Here: extract chr17 from the reference and write it to chr17_selected.fa in the working directory
!samtools faidx GCA_000001405.15_GRCh38_no_alt_analysis_set.fna chr17 > chr17_selected.fa

In [None]:
# Import chromosome 

In [1]:
# Read in fasta files
def read_fasta_genome(fasta_file,chromosome_header):
    """Return the sequence of a single-record FASTA file as one contiguous string.

    Parameters
    ----------
    fasta_file : text file object
        Open, line-iterable handle on the FASTA file.
    chromosome_header : str
        Prefix of the record's header line, e.g. ``'>chr17'``. Every line that
        starts with this prefix is dropped as a whole, so a description after
        the name (``'>chr17 some description'``) no longer ends up inside the
        returned sequence. An empty string disables header removal.

    Returns
    -------
    str
        All remaining lines concatenated without line breaks.
    """
    sequence_lines = []
    for line in fasta_file:
        line = line.rstrip("\r\n")  # tolerate both Unix and Windows line endings
        if chromosome_header and line.startswith(chromosome_header):
            continue  # header line: skip it entirely, including any description
        sequence_lines.append(line)

    clean_data = "".join(sequence_lines)

    return clean_data

In [2]:
# Load the chr17 reference sequence into memory as a single string.
# NOTE(review): the extraction cell above writes chr17_selected.fa to the working directory,
# while this cell reads it from ./assembly-algorithm/reference-genome/ — confirm the file is moved there.
with open('./assembly-algorithm/reference-genome/chr17_selected.fa') as f:
    chr17_genome = read_fasta_genome(f,'>chr17')

See https://www.bioinformatics.org/sms/iupac.html for IUPAC nucleotide codes

In [3]:
# List the distinct symbols in the reference. Sorted, because the iteration order of a
# set of strings changes from one kernel session to the next.
print(f"Unique characters: {sorted(set(chr17_genome))}")

Unique characters: ['W', 'N', 'R', 'K', 'A', 'Y', 'T', 'G', 'S', 'C']


In [4]:
# Report the length of the loaded chr17 sequence string (one character per base pair)
print(f"Selected chromosome from reference genome is {len(chr17_genome)} BP long")

Selected chromosome from reference genome is 83257441 BP long


# Chromosome Selection (Nanopore)

In [5]:
# List the working directory to see which input files are available
!ls

assembly-algorithm		 step1.py
bioliquid_chr17_pompe.bam	 step2.py
bioliquid_chr17_pompe.bam.bai	 step3.py
bioliquid_chr17_pompe_reads.txt  str_pipeline_download_data.ipynb
bioliquid_chr17_pompe.txt	 str_pipeline_step2.sh
chunk2.csv			 str_pipeline_step3.sh
extracted_reads.txt		 temp
families.csv			 Untitled.ipynb
output.bam


In [None]:
# NOTE(review): `game_reader` is only assigned in a later cell (the csv.reader loop), so on a
# fresh kernel this cell raises NameError — looks like leftover scratch; confirm and remove.
game_reader

In [None]:
import pandas as pd  # needed here: pandas is otherwise first imported much further down

# Load the tab-separated 500-read dump into a DataFrame.
# NOTE(review): if this file was produced with `samtools view` like the 5000-read dump later in
# the notebook, it presumably has no header row and `header=None` would be needed — confirm.
df = pd.read_csv('./assembly-algorithm/data/bioliquid_chr17_pompe_500reads.txt', delimiter = "\t")

In [None]:
# Display the reads table loaded in the previous cell
df

In [None]:
import pandas as pd  # the bare name `DataFrame` was never imported; go through `pd`

# Build a reads table with explicit column names.
# NOTE(review): `data` is not assigned in any visible cell — presumably the parsed rows of the
# read dump; confirm where it should come from.
pd.DataFrame.from_records(data, columns=['readID', 'unknown1', 'chromosome', 'start_index','sequence_description','unknown2','unknown3','unknown4','sequence','quality'])

In [None]:
import csv
import time  # no longer used in this cell; kept in case other cells rely on it

# Parse the tab-separated read dump into `reads`, a list of rows (each row a list of strings).
#
# * The per-row `time.sleep(1)` was removed. Nothing is printed inside the loop, so the pause
#   only added one second per read.
# * `quoting=csv.QUOTE_NONE` splits purely on tabs. The dump is presumably SAM text (see the
#   `samtools view` cell), whose quality strings may contain '"' characters that the default
#   csv dialect would interpret as quotes — confirm the file format.
reads = []
with open('./assembly-algorithm/data/bioliquid_chr17_pompe_500reads.txt', newline = '') as games:
    game_reader = csv.reader(games, delimiter='\t', quoting=csv.QUOTE_NONE)
    reads.extend(game_reader)

In [None]:
import pandas as pd  # not yet imported at this point on a fresh kernel

# Turn the parsed rows into a DataFrame. The rows are held in `reads` (filled by the csv cell
# above); the name `data` used here before is not defined anywhere visible in the notebook.
nanopore_reads = pd.DataFrame.from_records(reads)

In [None]:
# Read in a FASTA file as one newline-free string
def read_fasta_nanopore(fasta_file,chromosome_header):
    """Return the full text of ``fasta_file`` with every newline removed.

    ``chromosome_header`` is accepted but not used: header lines stay in the
    returned string, and no DataFrame is built.
    """
    raw_text = fasta_file.read()
    return "".join(raw_text.split("\n"))

In [None]:
# Load the nanopore chr17 FASTA as one string. read_fasta_nanopore ignores its second
# argument, so header lines remain inside `chr17_nanopore_reads`.
with open('./assembly-algorithm/data/nanopore_chr17.fa') as f:
    chr17_nanopore_reads = read_fasta_nanopore(f,'>chr17')

In [None]:
# Preview the loaded nanopore text. The previous cell stores it as `chr17_nanopore_reads`
# (`nanopore_chr17` is not defined); only the first 500 characters are shown because the
# full string would flood the output.
chr17_nanopore_reads[:500]

In [None]:
# Pull reads from chr17
# Keeps only the first 5000 lines that `samtools view` prints for the BAM and stores them as text
!samtools view bioliquid_chr17_pompe.bam | head -n 5000 > bioliquid_chr17_pompe_5000reads.txt

# Sliding Window Assembly Algorithm (V1)

- Pad left-right on reference genome
- Create sliding window ~5000 BP
    - Avg. read length ~23,452
- Cluster on each window
- Slide window + jump_length
    - Jump length ~1000 BP
 

#### Padding left-right on reference genome

#### Create sliding window of length 5000 bp

In [None]:

Perform clustering by positioning sliding window in position 0
Move window to 0 + jump_length
Jump_length: 1000
Perform clustering again and repeat
Chr17 total length: ~83M BP (83,257,441 in the reference loaded above). If jump_length=1000, total number of iterations ≈ 83K

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
#reads = pd.read_csv('bioliquid_chr17_pompe_100reads.csv')
# Load the 500-read sample. Note that this rebinds `reads`, which an earlier cell used for a
# plain list of csv rows, to a DataFrame.
reads = pd.read_csv('bioliquid_chr17_pompe_500reads.csv')

In [None]:
#reads['start_index'] = reads['unknown2']

In [None]:
# Work on the first 105 reads. `.copy()` makes `test` an independent frame, so the columns
# added to it in later cells neither raise SettingWithCopyWarning nor depend on `reads`.
test = reads.iloc[0:105].copy()
test.head()

In [None]:
# Print "<start position>, <sequence length>" for every read in the sample
for ID, read_start, read_sequence in zip(test.index, test['start_index'], test['sequence']):
    print(f"{read_start}, {len(read_sequence)}")

In [None]:
# Span covered by the sample: from the smallest start position to the furthest read end.
# The end is taken over ALL reads (start + sequence length), because a long read that starts
# early can reach further than the read that starts last; using only the last-starting read
# would make the coverage matrix too narrow and silently truncate such reads.
# NOTE(review): sequence length is used as the covered span, as in the rest of this notebook.
read_ends = test['start_index'] + test['sequence'].str.len()
end_index = int(read_ends.max())
start_index = int(test['start_index'].min())
index_range = end_index-start_index

In [None]:
# One row per read in `test`, one column per position in the covered span; starts all zero
read_overlap = np.zeros((len(test),index_range)) # empty matrix for reads

In [None]:
# Mark, for every read, the positions it covers with 1.
# `row` is the positional row in `read_overlap`; the index label `ID` is only used for lookups
# and for the message, so the loop no longer requires `test` to have a 0..n-1 index.
for row, ID in enumerate(test.index):
    read_length = len(test.loc[ID,'sequence'])
    print(f"Adding read cover for read {ID+1}: Index-->{test.loc[ID,'start_index']}, length-->{read_length}")
    start = test.loc[ID,'start_index']-start_index  # offset of the read inside the span
    end = start+read_length
    read_overlap[row,start:end] = 1

In [None]:
# Sum over reads: number of reads covering each position of the span
summary = read_overlap.sum(axis=0)

In [None]:
# Coverage table indexed by position (start_index .. end_index-1); rebinds the name `df`
df = pd.DataFrame(summary,index=range(start_index,end_index), columns=['overlap_count'])
df['position'] = df.index  # position also as a regular column, used as the x axis when plotting

In [None]:
# Plot the number of overlapping reads against position
plt.plot(df['position'],df['overlap_count'])

In [None]:
# Width of the covered span (end_index - start_index)
index_range

# Clustering

In [None]:
# Left edge of the region of interest; read start positions are taken relative to it below
left_bound = 23000000

In [None]:
# Peek at the first read of the sample
test.head(1)

In [None]:
#sequences = {}
# Collects one aligned/trimmed string per read; filled by the loop in the next cell
sequences = []

In [None]:
# Align every read to `left_bound`:
#   * a read starting to the right of the bound is left-padded with 'X' (one per missing position),
#   * a read starting to the left of the bound loses the part that lies before the bound,
#   * a read starting exactly at the bound is kept as is (this case previously assigned nothing,
#     so it either raised NameError or re-appended the previous read's string).
sequences = []  # rebuilt here so that re-running the cell does not append duplicates
for ID in test.index:
    sequence = test.loc[ID,'sequence']

    # plain int so that the string repetition / slicing below behave like ordinary Python
    start = int(test.loc[ID,'start_index']-left_bound)

    if start > 0: # sequence starts to the right of the bound
        sequence_trimmed = ('X'*start)+sequence # keep entire sequence
    elif start < 0: # sequence starts to the left of the bound
        sequence_trimmed = sequence[-start:]
    else: # sequence starts exactly at the bound
        sequence_trimmed = sequence

    #sequences[test.loc[ID,'ID']] = sequence_trimmed
    sequences.append(sequence_trimmed)
    

In [None]:
# Bring all aligned reads to one common length: right-pad with 'X' up to the longest read,
# then keep at most the first WINDOW_LENGTH characters (range from the start index).
# Slicing before padding gives the same strings as "pad to the longest, then cut", without
# building a character list as long as the longest read for every sequence.
WINDOW_LENGTH = 3000
longest_sequence_length = max(map(len, sequences), default=0) # length of the longest sequence
target_length = min(longest_sequence_length, WINDOW_LENGTH)
# slice assignment keeps updating the existing list object in place, as the original loop did
sequences[:] = [
    sequence[:target_length].ljust(target_length, 'X')
    for sequence in sequences
]

In [None]:
# Attach the window strings to the sample; the list is assigned positionally, so `sequences`
# must hold exactly one entry per row of `test`, in row order
test['sequence_area_of_interest'] = sequences

In [None]:
# Display the sample with the new column
test

In [None]:
import numpy as np

# Symbols that can be encoded; 'X' is the padding symbol used when aligning reads.
NUCLEOTIDE_VOCABULARY = [
    'A','C','G','T','X'
]

def nucleotide_to_one_hot(nucleotide_sequence):
    """One-hot encode a sequence over ``NUCLEOTIDE_VOCABULARY``.

    Parameters
    ----------
    nucleotide_sequence : str or iterable of str
        Sequence of single-character symbols.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(nucleotide_sequence), len(NUCLEOTIDE_VOCABULARY))``
        with a single 1.0 per row, in the column of that symbol. An empty
        sequence yields shape ``(0, len(NUCLEOTIDE_VOCABULARY))``.

    Raises
    ------
    ValueError
        If a symbol is not part of ``NUCLEOTIDE_VOCABULARY``.
    """
    # dictionary lookup instead of list.index() for every character
    column_of = {symbol: column for column, symbol in enumerate(NUCLEOTIDE_VOCABULARY)}
    try:
        columns = np.array([column_of[char] for char in nucleotide_sequence], dtype=np.intp)
    except KeyError as unknown:
        raise ValueError('Could not one-hot code character {}'.format(unknown.args[0])) from None

    # fill the whole matrix in one step instead of allocating one small array per character
    one_hot = np.zeros((len(columns), len(NUCLEOTIDE_VOCABULARY)))
    one_hot[np.arange(len(columns)), columns] = 1.
    return one_hot

nucleotide_to_one_hot('GTCATACX') # example

In [None]:
# Will hold each window string split into a list of single characters
sequences_test = []

In [None]:
# Split every window string into a list of characters (one column per position).
# The list is rebuilt from scratch so that re-running this cell does not append every
# sequence a second time.
sequences_test = [list(seq) for seq in sequences]

In [None]:
# One-hot encode every window string and flatten the result to a 1-D vector per read.
# NOTE(review): an earlier comment here said indel variants are mapped to all zeros, but
# nucleotide_to_one_hot raises ValueError for characters outside its vocabulary — confirm intent.
test['sequence_one_hot'] = test.sequence_area_of_interest.apply(lambda x: nucleotide_to_one_hot(x).flatten())

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Encode the character matrix (rows = reads, columns = positions) and convert the encoder
# output to a dense array with .toarray()
enc = OneHotEncoder()
X = enc.fit_transform(sequences_test).toarray()

In [None]:
from sklearn.decomposition import PCA
# Reduce the encoded reads to two components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

print(X_pca)

In [None]:
# Store the two components, rounded to 5 decimals, next to each read.
# Whole-column assignment from the array is positional, so it does not rely on the labels of
# `test.index` being valid row numbers of `X_pca` (the per-row loop did).
test['PCA1'] = X_pca[:, 0].round(5)
test['PCA2'] = X_pca[:, 1].round(5)

In [None]:
import seaborn as sns
# Scatter plot of the reads in the plane of the two components
sns.scatterplot(data=test,x='PCA1',y='PCA2')

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


# Elbow plot: K-means inertia for k = 1..4 on the standardized PCA coordinates.
# Note that `X` is rebound here from the one-hot matrix to the scaled 2-D coordinates.
X = StandardScaler().fit_transform(test[['PCA1','PCA2']])

distortions = []
for k in range(1, 5):
    KMeans_model = KMeans(n_clusters=k, random_state=42).fit(X)  # fit() returns the model itself
    distortions.append(KMeans_model.inertia_)

plt.plot(range(1, 5), distortions, marker='o')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Distortion')

In [None]:
# Silhouette score of the K-means clustering for k = 2..4
silhouette_scores = []
for k in range(2, 5):
    model = KMeans(n_clusters=k, random_state=42).fit(X)  # fit() returns the model itself
    score = silhouette_score(X, model.labels_)
    print("Silhouette Score for k = ", k, "is", score)
    silhouette_scores += [score]

In [None]:
# Silhouette score per number of clusters (k = 2..4, matching the loop that filled the list)
plt.plot(range(2, 5), silhouette_scores, marker='o')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Silhouette score')

In [None]:
from yellowbrick.cluster import KElbowVisualizer

# Elbow analysis with yellowbrick. The estimator is seeded with the same random_state as the
# other K-means cells in this notebook so the figure is reproducible between runs.
visualizer3 = KElbowVisualizer(KMeans(random_state=42), k=(2,5))

visualizer3.fit(X) # Fit the data to the visualizer
visualizer3.show() # Finalize and render the figure

In [None]:
# Final clustering with two clusters; the label of each read is stored in `test`
model = KMeans(n_clusters=2, random_state=42)
cls2 = model.fit(X)  # assigned from fit(X)
test['kmeans_cls2'] = cls2.labels_

In [None]:
# Display the sample including the cluster labels
test

In [None]:
import seaborn as sns
# Same PCA scatter plot as before, coloured by the two-cluster K-means label
sns.scatterplot(data=test,x='PCA1',y='PCA2',hue='kmeans_cls2')