In [None]:
import os
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
from transformers import BertModel, BertTokenizer#, DistilProtBert
from transformers import T5Model, T5Tokenizer
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from transformers import TrainingArguments, Trainer
import pickle
import mgzip
import bz2
import gc
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers import default_data_collator
from functools import reduce
import bisect


# Input and output locations used by the rest of the script.
# CSV of unannotated protein records that is loaded further down.
proteins_filepath = (
    "/content/drive/MyDrive/Colab Notebooks/Dissertation Code/AWS/Data/"
    "c_auris_unannotated.csv"
)
# Directory that receives every file this script writes.
savepath = "/content"



########## PREPARE DATASET ##########
# Loads the protein table, removes unusable rows, and derives the two values
# the tokenisation step consumes:
#   x       - list of sequences with a space between every residue
#   max_len - padded/truncated token length for every encoded sequence
print("\nLoading data")
proteins = pd.read_csv(proteins_filepath)

# Count rows with a missing sequence before dropping them. The count is kept
# in a variable and reported in the summary below; as a bare expression it
# was computed and thrown away.
missing_sequences = proteins['Predicted Protein Sequence'].isna().sum()
proteins = proteins.dropna(subset=['Predicted Protein Sequence'], axis=0)
# Duplicates are judged on whole rows (all columns are still present here).
proteins = proteins.drop_duplicates()

# Save the filtered table while it still carries every annotation column.
proteins.to_csv(f'{savepath}/cauris_sequences_filtered.csv', index=False)

# Drop unneeded columns
proteins = proteins.drop(columns=['Gene ID','source_id','Product Description', 'Gene Name or Symbol', 'gene_source_id','Organism', 'Computed GO Component IDs', 'Computed GO Function IDs', 'Computed GO Process IDs', 'Curated GO Component IDs','Curated GO Function IDs', 'Curated GO Process IDs'])


# Length (in residues) of the longest sequence in the dataset.
longest_seq = proteins['Predicted Protein Sequence'].str.len().max()

# The sequences are later encoded with add_special_tokens=True and
# truncation=True, and a BERT tokeniser wraps each input in [CLS] ... [SEP].
# Those two tokens count towards max_length, so using longest_seq directly
# would silently cut residues off the longest sequence(s). Reserve room for
# them so no sequence is truncated.
num_special_tokens = 2
max_len = longest_seq + num_special_tokens


# Convert the sequence column to a list and record the dataset size.
x = proteins['Predicted Protein Sequence'].tolist()
datapoints = len(proteins)


# Add spaces between characters so each residue becomes its own token.
# NOTE(review): the ProtBert model card maps the rare residues U/Z/O/B to X
# before tokenising - confirm whether the training pipeline did so and mirror
# that choice here.
x = [" ".join(sequence) for sequence in x]

print("\nInitial dataset preparation complete")
print(f"\nLongest sequence: {longest_seq}")
print(f"Number of datapoints: {datapoints}")
print(f"Rows dropped for missing sequence: {missing_sequences}")
print("First sequence:")
print(x[0])



########## TOKENISE AND SPLIT DATA ##########
# Fetch the ProtBert tokeniser; do_lower_case=False keeps the residue
# letters in their original case.
print("\nLoading tokeniser")
tokenizer = BertTokenizer.from_pretrained(
    "Rostlab/prot_bert",
    do_lower_case=False,
)
print("\nTokenising unannotated dataset")

# Encode the whole dataset in a single call. Every sequence is padded or
# truncated to max_len tokens and an attention mask is returned alongside
# the input ids.
dataset = x
dataset_size = len(dataset)
tokenised_data = tokenizer(
    dataset,
    add_special_tokens=True,
    return_attention_mask=True,
    truncation=True,
    padding='max_length',
    max_length=max_len,
)

# Pickle the encodings into an mgzip-compressed file under savepath.
encoded_path = f"{savepath}/BEAR_encoded_cauris.gz"
with mgzip.open(encoded_path, "wb") as archive:
    pickle.dump(tokenised_data, archive)


print("Tokenisation complete")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Loading data

Initial dataset preparation complete

Longest sequence: 3118
Number of datapoints: 10149
First sequence:
M S K S Y T L T S G R Y N Y P F E F V F P G K E H V I Q C V K D K R I L H K R E Y L W N E R R Y H A T L P G S F F D G V N L G D Y C F V Q Y S V K A R V K T A S S F R F N I K Q S V P I Y F A P R N S D S F F S L L S L C D A S S K D L L P D E S H A C K K V K Y A I D S D L K K N K S F L R S L F S S N A V E V P F D L N V R F K E V I P I E T E K G T T N R V L Q A G S R L S R F V D L D L S T S F S Y S N L M D A L G M N K L D K R G S V P P P A I K F T H I E I K L L S T V R Y Q G T R E S V L E S S F V L R K H P L E L Q V D L S D F E K V E N Y S P L L S K K S P S K Y S E K L D E N V C Y R L S L D R S W W D C Y V S D I G Q T F M L C N I H K T A R L Y I C L R I A S A G N P A K E R K I E N T S P I V F Y R Q E G P D A P L C N Q V E H L P R Y M P A P A D Y

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]


Tokenising unannotated dataset
Tokenisation complete
