In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the dataset and take the raw sequence strings from its 'sequence' column.
df_rnalocate = pd.read_csv("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_dataset.csv")
sequences = df_rnalocate['sequence'].values

# Extract k-mer counts with k=3: character-level 3-grams, case left untouched.
vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 3), lowercase=False)
kmer_features = vectorizer.fit_transform(sequences)

# Convert to a dense NumPy array.
X_kmer = kmer_features.toarray()

# NOTE(review): the "64" in the message below is a hard-coded literal; the real
# column count is whatever vocabulary the vectorizer learned (see X_kmer.shape).
print(f"✅ k-mer features extracted: {X_kmer.shape} (samples x 64 features)")
# Temporary save so the next steps can reload the features from disk.
import numpy as np
np.save("F:/payan-nameh/faz2 . 1404.04.02/X_kmer_temp.npy", X_kmer)
print("📝 Saved: X_kmer_temp.npy")

✅ k-mer features extracted: (12410, 64) (samples x 64 features)
📝 Saved: X_kmer_temp.npy


In [2]:
import numpy as np
from itertools import product

# Reload the saved k-mer features (kept as a quick check that the file loads).
X_kmer = np.load("F:/payan-nameh/faz2 . 1404.04.02/X_kmer_temp.npy")
# NOTE(review): `pd` is not imported in this cell; it must already be in the
# kernel namespace from an earlier cell.
df_rnalocate = pd.read_csv("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_dataset.csv")
sequences = df_rnalocate['sequence'].values

# Gapped base-pair motifs (A..A, A..C, ..., U..U) with a gap of `distance`:
# for every ordered pair (first, second) count the positions i where
# seq[i] == first and seq[i + distance + 1] == second, i.e. exactly
# `distance` arbitrary characters sit between the two bases.
bases = ['A', 'C', 'G', 'U']
distance = 2


def _gapped_pair_counts(seq, alphabet, gap):
    """Count gapped base pairs in one sequence with a single pass.

    Args:
        seq: sequence string.
        alphabet: ordered list of bases; output columns follow
            itertools.product(alphabet, alphabet) order.
        gap: number of characters between the two bases of a pair.

    Returns:
        List of len(alphabet) ** 2 integer counts. Pairs containing a character
        outside `alphabet` are ignored.
    """
    column_of = {pair: col for col, pair in enumerate(product(alphabet, alphabet))}
    counts = [0] * len(column_of)
    # zip pairs seq[i] with seq[i + gap + 1] and stops at the shorter operand,
    # which covers exactly i in range(len(seq) - gap - 1).
    for pair in zip(seq, seq[gap + 1:]):
        col = column_of.get(pair)
        if col is not None:
            counts[col] += 1
    return counts


# One row of 16 counts per sequence. The previous loop rescanned each sequence
# once per pair and built a regex string that was never used.
motif_features = [_gapped_pair_counts(seq, bases, distance) for seq in sequences]

# Stack the per-sequence count lists into one array and save it for later steps.
X_distance = np.array(motif_features)
print(f"✅ Distance-based motifs extracted: {X_distance.shape} (samples x {len(bases) * len(bases)} features)")
np.save("F:/payan-nameh/faz2 . 1404.04.02/X_distance_temp.npy", X_distance)
print("📝 Saved: X_distance_temp.npy")

✅ Distance-based motifs extracted: (12410, 16) (samples x 16 features)
📝 Saved: X_distance_temp.npy


In [3]:
import numpy as np

# Load the dataset and take the raw sequence strings.
# NOTE(review): `pd` is not imported in this cell; it must already be in the
# kernel namespace from an earlier cell.
df_rnalocate = pd.read_csv("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_dataset.csv")
sequences = df_rnalocate['sequence'].values

def cgr_feature(sequence):
    """Summarise the Chaos Game Representation (CGR) walk of a sequence.

    The walk starts at (0.5, 0.5) and, for each base, moves halfway toward the
    corner assigned to that base. Characters other than A, C, G, U are skipped.

    Args:
        sequence: iterable of single-character bases (normally a str).

    Returns:
        [x_final, y_final, var_x, var_y] where (x_final, y_final) is the last
        point of the walk and var_x / var_y are the variances of the x and y
        coordinates over every visited point. Both variances are 0.0 when no
        valid base was seen.
    """
    # Corner of the unit square assigned to each base.
    corners = {'A': (0, 0), 'C': (1, 0), 'G': (0, 1), 'U': (1, 1)}
    x, y = 0.5, 0.5  # starting point
    xs, ys = [], []
    for base in sequence:
        corner = corners.get(base)
        if corner is None:
            continue
        x = (x + corner[0]) / 2
        y = (y + corner[1]) / 2
        xs.append(x)
        ys.append(y)
    # Variance is taken over the visited points. Taking it over a list that
    # repeats the final coordinate (as before) is always 0, and NaN for an
    # empty sequence.
    var_x = float(np.var(xs)) if xs else 0.0
    var_y = float(np.var(ys)) if ys else 0.0
    return [x, y, var_x, var_y]

# Extract the features: one 4-value row per sequence, then save for later steps.
cgr_features = np.array([cgr_feature(seq) for seq in sequences])
print(f"✅ CGR features extracted: {cgr_features.shape} (samples x 4 features)")
np.save("F:/payan-nameh/faz2 . 1404.04.02/X_cgr_temp.npy", cgr_features)
print("📝 Saved: X_cgr_temp.npy")

✅ CGR features extracted: (12410, 4) (samples x 4 features)
📝 Saved: X_cgr_temp.npy


In [4]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the dataset and take the raw sequence strings.
# NOTE(review): `pd` is not imported in this cell; it must already be in the
# kernel namespace from an earlier cell.
df_rnalocate = pd.read_csv("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_dataset.csv")
sequences = df_rnalocate['sequence'].values

# Fixed sequence length used for padding/truncation (e.g. 100; adjustable).
max_length = 100

def pad_sequence(seq, max_len):
    """Return `seq` cut or extended to `max_len` characters.

    Sequences longer than `max_len` are truncated on the right; shorter ones
    are right-padded with 'N'.
    """
    shortfall = max_len - len(seq)
    if shortfall < 0:
        return seq[:max_len]
    return seq + 'N' * shortfall

# Pad or truncate every sequence to the fixed length.
padded_sequences = [pad_sequence(seq, max_length) for seq in sequences]

# Build the position-by-base matrix for each sequence: row = position,
# column = base (A, C, G, U). Any other character (including the 'N' padding)
# leaves its row all zeros.
base_column = {'A': 0, 'C': 1, 'G': 2, 'U': 3}
pssm_features = []
for padded in padded_sequences:
    position_matrix = np.zeros((max_length, 4))
    for position, symbol in enumerate(padded):
        column = base_column.get(symbol)
        if column is not None:
            position_matrix[position, column] += 1
    pssm_features.append(position_matrix.flatten())

X_pssm = np.array(pssm_features)
print(f"✅ PSSM features extracted: {X_pssm.shape} (samples x {max_length * 4} features)")
np.save("F:/payan-nameh/faz2 . 1404.04.02/X_pssm_temp.npy", X_pssm)
print("📝 Saved: X_pssm_temp.npy")

✅ PSSM features extracted: (12410, 400) (samples x 400 features)
📝 Saved: X_pssm_temp.npy


In [5]:
import numpy as np

# Load the dataset and take the raw sequence strings.
# NOTE(review): `pd` is not imported in this cell; it must already be in the
# kernel namespace from an earlier cell.
df_rnalocate = pd.read_csv("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_dataset.csv")
sequences = df_rnalocate['sequence'].values

def statistical_features(sequence):
    """Compute simple composition statistics for one sequence.

    Args:
        sequence: sequence string.

    Returns:
        [freq_A, freq_C, freq_G, freq_U, gc_content, entropy] where each
        frequency is count / len(sequence), gc_content = freq_G + freq_C, and
        entropy is the Shannon entropy (base 2) of the four base frequencies.
        An empty sequence yields six zeros instead of dividing by zero.
    """
    total = len(sequence)
    if total == 0:
        return [0.0] * 6
    # Count each base once and reuse the frequencies for the entropy below.
    frequencies = [sequence.count(base) / total for base in 'ACGU']
    a_count, c_count, g_count, u_count = frequencies
    gc_content = g_count + c_count
    # Shannon entropy; zero-frequency bases contribute nothing.
    entropy = 0.0
    for p in frequencies:
        if p > 0:
            entropy -= p * np.log2(p)
    return [a_count, c_count, g_count, u_count, gc_content, entropy]

# Extract the features: one 6-value row per sequence, then save for later steps.
ssf_features = np.array([statistical_features(seq) for seq in sequences])
print(f"✅ SSF features extracted: {ssf_features.shape} (samples x 6 features)")
np.save("F:/payan-nameh/faz2 . 1404.04.02/X_ssf_temp.npy", ssf_features)
print("📝 Saved: X_ssf_temp.npy")

✅ SSF features extracted: (12410, 6) (samples x 6 features)
📝 Saved: X_ssf_temp.npy


In [4]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load all of the temporary feature files written by the earlier cells.
X_kmer = np.load("F:/payan-nameh/faz2 . 1404.04.02/X_kmer_temp.npy")
X_distance = np.load("F:/payan-nameh/faz2 . 1404.04.02/X_distance_temp.npy")
X_cgr = np.load("F:/payan-nameh/faz2 . 1404.04.02/X_cgr_temp.npy")
X_pssm = np.load("F:/payan-nameh/faz2 . 1404.04.02/X_pssm_temp.npy")
X_ssf = np.load("F:/payan-nameh/faz2 . 1404.04.02/X_ssf_temp.npy")

# Combine the feature groups column-wise (same row order in every file is assumed).
X_handcrafted = np.concatenate((X_kmer, X_distance, X_cgr, X_pssm, X_ssf), axis=1)
print(f"✅ Combined features shape: {X_handcrafted.shape}")

# Standardise every column.
# NOTE(review): the scaler is fitted on all rows; if a train/test split happens
# later, test-set statistics leak into the scaling — confirm this is intended.
scaler = StandardScaler()
X_handcrafted_scaled = scaler.fit_transform(X_handcrafted)

# Final save: the file holds the *scaled* matrix.
np.save("F:/payan-nameh/faz2 . 1404.04.02/X_handcrafted.npy", X_handcrafted_scaled)
print("📝 Saved: X_handcrafted.npy")

✅ Combined features shape: (12410, 490)
📝 Saved: X_handcrafted.npy


In [11]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load the model and tokenizer (using the corrected model name).
model_name = "zhihan1996/DNA_bert_6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Load the dataset and take the raw sequence strings.
# NOTE(review): `pd` is not imported in this cell; it must already be in the
# kernel namespace from an earlier cell.
df_rnalocate = pd.read_csv("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_dataset.csv")
sequences = df_rnalocate['sequence'].values

# Extract embeddings
def _to_dnabert_kmers(sequence, k=6, max_kmers=510):
    """Convert an RNA/DNA string into DNABERT's space-separated k-mer text.

    DNABERT's vocabulary is made of DNA k-mers, so U is mapped to T and the
    sequence is split into overlapping k-mers joined by spaces. Only the prefix
    that can fit into `max_kmers` tokens is expanded, to avoid building a huge
    string for long sequences. Inputs shorter than `k` are returned unchanged.
    """
    dna = sequence.upper().replace('U', 'T')
    if len(dna) < k:
        return dna
    dna = dna[:max_kmers + k - 1]
    return ' '.join(dna[i:i + k] for i in range(len(dna) - k + 1))


def get_bert_features(sequence):
    """Return the mean-pooled last-hidden-state embedding of one sequence.

    Args:
        sequence: RNA (or DNA) sequence string.

    Returns:
        1-D NumPy array: the model's last hidden state averaged over the token
        dimension.
    """
    # Feed k-mer tokens rather than the raw string: an unspaced sequence does
    # not match the k-mer vocabulary. max_length is 512 because BERT has 512
    # position embeddings; the earlier 4000 could not be honoured by the model.
    inputs = tokenizer(_to_dnabert_kmers(sequence), return_tensors="pt", truncation=True, max_length=512, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Embed every sequence one at a time, stack the vectors, and save them.
bert_features = np.array([get_bert_features(seq) for seq in sequences])
print(f"✅ BERT features extracted: {bert_features.shape} (samples x 768 features)")
np.save("F:/payan-nameh/faz2 . 1404.04.02/X_bert.npy", bert_features)
print("📝 Saved: X_bert.npy")

✅ BERT features extracted: (12410, 768) (samples x 768 features)
📝 Saved: X_bert.npy


In [12]:
import numpy as np

# Load the two saved feature files.
X_handcrafted = np.load("F:/payan-nameh/faz2 . 1404.04.02/X_handcrafted.npy")
X_bert = np.load("F:/payan-nameh/faz2 . 1404.04.02/X_bert.npy")

# Print the dimensions.
print(f"Shape of X_handcrafted: {X_handcrafted.shape}")  # [N, F₁]
print(f"Shape of X_bert: {X_bert.shape}")  # [N, F₂]

# Sanity check: both matrices must have the same number of samples (rows).
if X_handcrafted.shape[0] == X_bert.shape[0]:
    print(f"Number of samples match: {X_handcrafted.shape[0]}")
else:
    print("Warning: Number of samples do not match!")

Shape of X_handcrafted: (12410, 490)
Shape of X_bert: (12410, 768)
Number of samples match: 12410
