In [7]:
# Mount Google Drive at /content/drive so the dataset path used by the
# following cells resolves inside the Colab filesystem.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import numpy as np

# Read the saved object from Drive and unwrap it with .item(); the result is
# indexed by string keys below.
# NOTE(review): allow_pickle=True unpickles the file contents, which can run
# arbitrary code — only load files from a trusted source.
data = np.load(
    "/content/drive/MyDrive/Colab Notebooks/dataset.npy",
    allow_pickle=True,
).item()

# Targets are cast to int; sequences are taken as stored.
labels = data['resistant'].astype(int)
genes = data['genes']


In [9]:
# Quick sanity check of the loaded data: sizes, one example, and the label set.
summary = [
    ("Total samples:", len(genes)),
    ("Total labels:", len(labels)),
    ("Example gene:", genes[0][:60]),  # first 60 characters only
    ("Example label:", labels[0]),
    ("Unique labels:", set(labels)),
]
for caption, value in summary:
    print(caption, value)


Total samples: 100000
Total labels: 100000
Example gene: ATGCACTACCGTATGATCCCCCTTCACTGGATGATGGAAATTGACTGCAATGGCTGCGCT
Example label: 0
Unique labels: {np.int64(0), np.int64(1)}


In [10]:
from collections import Counter

def get_kmers(seq, k=3):
    """Return every overlapping length-``k`` window of ``seq``, left to right.

    Args:
        seq: A sliceable sequence (e.g. a string of nucleotides).
        k: Window length. Defaults to 3.

    Returns:
        A list with ``len(seq) - k + 1`` slices of ``seq``; the list is empty
        when ``seq`` is shorter than ``k``.
    """
    window_count = len(seq) - k + 1
    windows = []
    for start in range(window_count):
        windows.append(seq[start:start + k])
    return windows


In [11]:
# Tally every k-mer (get_kmers' default k) across all genes in one pass.
# NOTE(review): the vocabulary is built from every gene, before any
# train/test split is made.
vocab = Counter(kmer for gene in genes for kmer in get_kmers(gene))

# Keep at most TOP_K k-mers, most frequent first; a k-mer's position in
# kmers_list is its feature column. In the recorded run only 64 distinct
# k-mers were found, so the cap was not binding.
TOP_K = 2000
kmers_list = [kmer for kmer, _count in vocab.most_common(TOP_K)]
kmer_index = dict(zip(kmers_list, range(len(kmers_list))))

print("Total features:", len(kmers_list))


Total features: 64


In [12]:
import numpy as np

# Build the count matrix: X[i, j] is the number of times k-mer kmers_list[j]
# occurs in genes[i]. One row per gene, one column per vocabulary k-mer.
X = np.zeros((len(genes), len(kmers_list)))

for i, g in enumerate(genes):
    # Tally this gene's k-mers once with Counter, then write a single value
    # per distinct k-mer. This avoids a Python-level NumPy scalar increment
    # (and two dict lookups) for every k-mer occurrence.
    for kmer, count in Counter(get_kmers(g)).items():
        col = kmer_index.get(kmer)  # None for k-mers outside the vocabulary
        if col is not None:
            X[i, col] = count

print("Feature matrix shape:", X.shape)


Feature matrix shape: (100000, 64)


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# repeatable.
# NOTE(review): the split is not stratified and accuracy is the only metric
# reported — confirm the class balance of `labels` before relying on it.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    random_state=42,
    test_size=0.2,
)

# 100-tree random forest, seeded for repeatability; fit() returns the
# estimator itself, so the fitted model is bound directly.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the held-out predictions.
pred = model.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print("Accuracy:", accuracy)


Accuracy: 0.9408
