In [3]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import os

# Paths to the RNALocate FASTA files, one per location
rnalocate_files = {
    "Cytoplasm": "F:/New Version/Data/rnalocate/Cytoplasm_train.fasta",
    "Nucleus": "F:/New Version/Data/rnalocate/Nucleus_train.fasta",
    "Mitochondria": "F:/New Version/Data/rnalocate/Mitochondria_train.fasta",
    "Endoplasmic_reticulum": "F:/New Version/Data/rnalocate/Endoplasmic_reticulum_train.fasta",
    "Extracellular_region": "F:/New Version/Data/rnalocate/Extracellular_region_train.fasta"
}

# Fail fast if any input file is missing
for loc, file_path in rnalocate_files.items():
    if not os.path.exists(file_path):
        print(f"⚠️ File not found: {file_path}")
        raise FileNotFoundError(f"File {file_path} does not exist.")


def _iter_clean_sequences(fasta_path, min_len):
    """Yield ``(record_id, sequence)`` for every FASTA record that passes cleaning.

    Each sequence is upper-cased. Records shorter than ``min_len`` or containing
    any character outside ``ACGTU`` are skipped with a printed warning. Accepted
    sequences have every ``T`` replaced by ``U``.

    Parameters
    ----------
    fasta_path : str
        Path of the FASTA file handed to ``SeqIO.parse``.
    min_len : int
        Minimum sequence length (inclusive) for a record to be kept.

    Yields
    ------
    tuple of (str, str)
        The record ID and the cleaned sequence.
    """
    valid_bases = frozenset("ACGTU")
    for record in SeqIO.parse(fasta_path, "fasta"):
        sequence = str(record.seq).upper()
        if len(sequence) < min_len:
            print(f"⚠️ Skipped sequence {record.id} due to length < {min_len}.")
            continue
        # Report each offending character once (sorted) instead of listing every
        # occurrence, which floods the output for long sequences.
        invalid_bases = sorted(set(sequence) - valid_bases)
        if invalid_bases:
            print(f"⚠️ Skipped sequence {record.id} due to invalid bases: {invalid_bases}")
            continue
        yield record.id, sequence.replace("T", "U")


# Read and clean the RNALocate data
rnalocate_data = []
label_map = {loc: i for i, loc in enumerate(rnalocate_files.keys())}
min_length = 20  # minimum sequence length that is kept

for loc, file_path in rnalocate_files.items():
    print(f"Reading and cleaning {loc} from {file_path}")
    try:
        for record_id, sequence in _iter_clean_sequences(file_path, min_length):
            rnalocate_data.append({
                "id": record_id,
                "sequence": sequence,
                "label": label_map[loc],
                "location": loc,
                "length": len(sequence)
            })
    except Exception as e:
        # NOTE(review): a read error is only printed; records appended before the
        # failure are kept, so a class can end up partially loaded. Confirm this
        # best-effort behaviour is intended.
        print(f"❌ Error reading {file_path}: {e}")

if not rnalocate_data:
    raise ValueError("❌ No valid sequences found in RNALocate files. Check file paths and content.")

df_rnalocate = pd.DataFrame(rnalocate_data)
print(f"\n✅ RNALocate data loaded and cleaned: {df_rnalocate.shape[0]} sequences.")
print("Distribution of classes:")
print(df_rnalocate['location'].value_counts())
print("\nSequence length stats:")
print(df_rnalocate['length'].describe())

# Save the dataset (create the output directory first so to_csv cannot fail on it)
rnalocate_csv_path = "F:/payan-nameh/faz2 . 1404.04.02/rnalocate_dataset.csv"
os.makedirs(os.path.dirname(rnalocate_csv_path), exist_ok=True)
df_rnalocate.to_csv(rnalocate_csv_path, index=False)
print("\n📝 Saved: rnalocate_dataset.csv")

# Paths to the CeFra-Seq files
cefra_seq_path = "F:/New Version/Data/cefra-seq/cefra_seq_cDNA_screened.fa"
cefra_ann_path = "F:/New Version/Data/cefra-seq/cefra_seq_cDNA_ann_screened.fa"  # not read in this cell

# Fail fast if the CeFra-Seq sequence file is missing
if not os.path.exists(cefra_seq_path):
    print(f"⚠️ File not found: {cefra_seq_path}")
    raise FileNotFoundError(f"File {cefra_seq_path} does not exist.")

# Read and clean the CeFra-Seq sequences with the same filters as RNALocate
cefra_seqs = []
print(f"Reading and cleaning CeFra-Seq from {cefra_seq_path}")
try:
    for record_id, sequence in _iter_clean_sequences(cefra_seq_path, min_length):
        cefra_seqs.append({
            "id": record_id,
            "sequence": sequence,
            "length": len(sequence)
        })
except Exception as e:
    # NOTE(review): same best-effort handling as for RNALocate above.
    print(f"❌ Error reading {cefra_seq_path}: {e}")

if not cefra_seqs:
    raise ValueError("❌ No valid sequences found in CeFra-Seq file. Check file path and content.")

df_cefra = pd.DataFrame(cefra_seqs)
print(f"\n✅ CeFra-Seq sequences loaded and cleaned: {df_cefra.shape[0]} sequences.")
print("\nSequence length stats:")
print(df_cefra['length'].describe())

# Save the CSV file
cefra_csv_path = "F:/payan-nameh/faz2 . 1404.04.02/cefra_dataset.csv"
os.makedirs(os.path.dirname(cefra_csv_path), exist_ok=True)
df_cefra.to_csv(cefra_csv_path, index=False)
print("\n📝 Saved: cefra_dataset.csv")

print("\n🚀 Done! Data ready for further processing.")

Reading and cleaning Cytoplasm from F:/New Version/Data/rnalocate/Cytoplasm_train.fasta
Reading and cleaning Nucleus from F:/New Version/Data/rnalocate/Nucleus_train.fasta
Reading and cleaning Mitochondria from F:/New Version/Data/rnalocate/Mitochondria_train.fasta
Reading and cleaning Endoplasmic_reticulum from F:/New Version/Data/rnalocate/Endoplasmic_reticulum_train.fasta
Reading and cleaning Extracellular_region from F:/New Version/Data/rnalocate/Extracellular_region_train.fasta

✅ RNALocate data loaded and cleaned: 12410 sequences.
Distribution of classes:
Cytoplasm                5310
Nucleus                  4855
Endoplasmic_reticulum    1185
Extracellular_region      710
Mitochondria              350
Name: location, dtype: int64

Sequence length stats:
count     12410.000000
mean       3929.565431
std        3492.177394
min         387.000000
25%        2286.000000
50%        3236.000000
75%        4716.000000
max      222377.000000
Name: length, dtype: float64

📝 Saved: rnaloc

In [4]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load the RNALocate dataset and keep only the columns needed downstream
df = pd.read_csv("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_dataset.csv")
df = df[["sequence", "label"]]

# First split: 70% train, 30% held out (stratified on the label)
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['label'],
    random_state=42,
)

# Second split: halve the held-out part into validation and test (15% each)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=42,
)

# Write every split as CSV
for csv_path, split_frame in (
    ("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_train.csv", train_df),
    ("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_val.csv", val_df),
    ("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_test.csv", test_df),
):
    split_frame.to_csv(csv_path, index=False)

# Write sequences (X) and labels (y) of every split as NumPy arrays.
# NOTE(review): the sequence arrays are presumably object-dtype, in which case
# np.load will need allow_pickle=True to read them back - confirm.
for npy_path, column_values in (
    ("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_X_train.npy", train_df["sequence"].values),
    ("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_y_train.npy", train_df["label"].values),
    ("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_X_val.npy", val_df["sequence"].values),
    ("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_y_val.npy", val_df["label"].values),
    ("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_X_test.npy", test_df["sequence"].values),
    ("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_y_test.npy", test_df["label"].values),
):
    np.save(npy_path, column_values)

# Show the class distribution of every split
print("\n✅ Data split completed:")
print(f"Train samples: {len(train_df)}, Class distribution: \n{train_df['label'].value_counts()}")
print(f"Validation samples: {len(val_df)}, Class distribution: \n{val_df['label'].value_counts()}")
print(f"Test samples: {len(test_df)}, Class distribution: \n{test_df['label'].value_counts()}")

print("\n📝 Saved CSV and NumPy files for RNALocate.")



✅ Data split completed:
Train samples: 8687, Class distribution: 
0    3717
1    3399
3     829
4     497
2     245
Name: label, dtype: int64
Validation samples: 1861, Class distribution: 
0    796
1    728
3    178
4    106
2     53
Name: label, dtype: int64
Test samples: 1862, Class distribution: 
0    797
1    728
3    178
4    107
2     52
Name: label, dtype: int64

📝 Saved CSV and NumPy files for RNALocate.


In [5]:
import pandas as pd

# Input files: the tab-separated annotation table and the cleaned sequence table
annotation_path = "F:/New Version/Data/Supplemental_File_3.tsv"
sequence_path = "F:/payan-nameh/faz2 . 1404.04.02/cefra_dataset.csv"

df_ann = pd.read_csv(annotation_path, sep='\t')
df_seq = pd.read_csv(sequence_path)

# Sequence IDs that have no row in the annotation table
missing_ids = set(df_seq["id"]).difference(df_ann["ensembl_gene_id"])
if not missing_ids:
    print("✅ No missing IDs found between sequence and annotation files.")
else:
    print(f"⚠️ {len(missing_ids)} sequence IDs missing in annotation file.")

# Cut-off used when deriving labels: a column value must be strictly above it
threshold = 10

def extract_labels(row):
    """Return the comma-separated labels assigned to one annotation row.

    A label is assigned when either of its two columns (suffix ``_A`` or
    ``_B``) is strictly greater than the module-level ``threshold``. Labels
    appear in the fixed order cytoplasm, insoluble, membrane, nucleus.

    Parameters
    ----------
    row : mapping
        Must support ``row[column_name]`` for the ``*_A`` / ``*_B`` columns
        listed below.

    Returns
    -------
    str
        The assigned labels joined by ``","``, or ``"unknown"`` when no
        column exceeds the threshold.
    """
    label_columns = (
        ("cytoplasm", ("cyto_A", "cyto_B")),
        ("insoluble", ("insol_A", "insol_B")),
        ("membrane", ("membr_A", "membr_B")),
        ("nucleus", ("nucl_A", "nucl_B")),
    )
    assigned = [
        label
        for label, columns in label_columns
        if any(row[column] > threshold for column in columns)
    ]
    if not assigned:
        return "unknown"
    return ",".join(assigned)

# Derive the label string for every annotation row
df_ann["labels"] = df_ann.apply(extract_labels, axis=1)

# Join sequences with their labels on the gene ID (pd.merge defaults to an inner join)
merged_df = pd.merge(
    df_seq.rename(columns={"id": "ensembl_gene_id"}),
    df_ann[["ensembl_gene_id", "labels"]],
    on="ensembl_gene_id"
)

# Drop samples without any label. The explicit .copy() makes the result an
# independent frame, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning for writing to a filtered slice.
merged_df = merged_df[merged_df["labels"] != "unknown"].copy()

# Add the sequence length
merged_df["length"] = merged_df["sequence"].str.len()

# Save the labeled table as TSV
merged_df.to_csv("F:/payan-nameh/faz2 . 1404.04.02/cefra_labeled_data.tsv", sep='\t', index=False)

print(f"\n✅ CeFra-Seq labeled data saved: {len(merged_df)} sequences.")
print("\nSequence length stats:")
print(merged_df['length'].describe())

print("\n📝 Saved: cefra_labeled_data.tsv")


✅ No missing IDs found between sequence and annotation files.

✅ CeFra-Seq labeled data saved: 5668 sequences.

Sequence length stats:
count     5668.000000
mean      3455.664079
std       2406.254239
min        207.000000
25%       1875.750000
50%       2827.000000
75%       4373.000000
max      34526.000000
Name: length, dtype: float64

📝 Saved: cefra_labeled_data.tsv


In [6]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the labeled data
df = pd.read_csv("F:/payan-nameh/faz2 . 1404.04.02/cefra_labeled_data.tsv", sep="\t")

# Number of labels per sample: one more than the number of commas in the label string
df["label_count"] = df["labels"].map(lambda label_str: label_str.count(",") + 1)

# Samples carrying more than one label
multi_label_samples = df[df["label_count"].gt(1)]
print(f"تعداد نمونه‌های چندبرچسبی: {len(multi_label_samples)}")
print(f"از کل {len(df)} نمونه، حدوداً {len(multi_label_samples)/len(df)*100:.2f}% چندبرچسبی هستند.")

# Distribution of the label count, as text
label_counts = df["label_count"].value_counts().sort_index()
print("\nتوزیع تعداد برچسب‌ها:")
for count, freq in label_counts.items():
    print(f"تعداد برچسب {count}: {freq} نمونه")

# The same distribution as a bar chart, written to disk
plt.figure(figsize=(8, 5))
ax = label_counts.plot.bar()
ax.set_title("توزیع تعداد برچسب‌ها در هر نمونه")
ax.set_xlabel("تعداد برچسب")
ax.set_ylabel("تعداد نمونه")
plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig("F:/payan-nameh/faz2 . 1404.04.02/label_distribution.png")
plt.close()

print("\n📝 Saved: label_distribution.png")


تعداد نمونه‌های چندبرچسبی: 4200
از کل 5668 نمونه، حدوداً 74.10% چندبرچسبی هستند.

توزیع تعداد برچسب‌ها:
تعداد برچسب 1: 1468 نمونه
تعداد برچسب 2: 1269 نمونه
تعداد برچسب 3: 1024 نمونه
تعداد برچسب 4: 1907 نمونه

📝 Saved: label_distribution.png


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Load the labeled data
df = pd.read_csv("F:/payan-nameh/faz2 . 1404.04.02/cefra_labeled_data.tsv", sep="\t")

# Inputs are the sequences; targets are the label strings split into lists
X = df["sequence"].values
y = df["labels"].map(lambda label_str: label_str.split(","))

# Convert the label lists into a binary indicator matrix
mlb = MultiLabelBinarizer()
y_bin = mlb.fit_transform(y)

# First split: 70% train, 30% held out.
# NOTE(review): stratify receives the 2-D indicator matrix; presumably
# scikit-learn then stratifies on whole label combinations - confirm.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y_bin,
    test_size=0.30,
    random_state=42,
    stratify=y_bin,
)

# Second split: halve the held-out part into validation and test (15% each)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    random_state=42,
    stratify=y_temp,
)

# Save the splits in CSV and NumPy format
def save_csv_and_npy(X_split, y_split, prefix, split="train"):
    """Write one data split to disk as a CSV file plus two NumPy files.

    The binary label matrix is converted back to label names with the
    module-level ``mlb`` and stored comma-joined in the CSV. Only the data
    passed in is written, and only to the files of the named split.

    Parameters
    ----------
    X_split : array-like
        Sequences of the split.
    y_split : array-like
        Binary label indicator matrix of the split, as produced by ``mlb``.
    prefix : str
        File-name prefix, e.g. ``"cefra"``.
    split : str, optional
        Split name used in the file names (``"train"``, ``"val"`` or
        ``"test"``). Defaults to ``"train"``.
    """
    out_dir = "F:/payan-nameh/faz2 . 1404.04.02"
    label_str = mlb.inverse_transform(y_split)
    df_out = pd.DataFrame({
        "sequence": X_split,
        "labels": [",".join(lbls) for lbls in label_str]
    })
    df_out.to_csv(f"{out_dir}/{prefix}_{split}.csv", index=False)
    np.save(f"{out_dir}/{prefix}_X_{split}.npy", X_split)
    np.save(f"{out_dir}/{prefix}_y_{split}.npy", y_split)

# One call per split, so every file holds its own split's data
save_csv_and_npy(X_train, y_train, "cefra")
save_csv_and_npy(X_val, y_val, "cefra", split="val")
save_csv_and_npy(X_test, y_test, "cefra", split="test")

# Show the size of every split
print("\n✅ Data split completed:")
print(f"Train samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Test samples: {len(X_test)}")

# Wrap each indicator matrix in a frame with one named column per class
y_train_df, y_val_df, y_test_df = (
    pd.DataFrame(indicator_matrix, columns=mlb.classes_)
    for indicator_matrix in (y_train, y_val, y_test)
)

# Per-class positive counts of every split
for header, indicator_df in (
    ("\nTrain label distribution (sum per class):", y_train_df),
    ("\nValidation label distribution (sum per class):", y_val_df),
    ("\nTest label distribution (sum per class):", y_test_df),
):
    print(header)
    print(indicator_df.sum())

print("\n📝 Saved CSV and NumPy files for CeFra-Seq.")



✅ Data split completed:
Train samples: 3967
Validation samples: 850
Test samples: 851

Train label distribution (sum per class):
cytoplasm    2610
insoluble    2834
membrane     2565
nucleus      2284
dtype: int64

Validation label distribution (sum per class):
cytoplasm    560
insoluble    607
membrane     549
nucleus      490
dtype: int64

Test label distribution (sum per class):
cytoplasm    559
insoluble    608
membrane     550
nucleus      490
dtype: int64

📝 Saved CSV and NumPy files for CeFra-Seq.


In [8]:
import pandas as pd
# Reload the saved RNALocate dataset and report its size and class distribution
df_rnalocate = pd.read_csv("F:/payan-nameh/faz2 . 1404.04.02/rnalocate_dataset.csv")
print(f"Total RNALocate samples: {len(df_rnalocate)}")
# Header and counts are printed separately: passing both to a single print()
# inserts a separator space after the newline and misaligns the first row.
print("Class distribution:")
print(df_rnalocate['location'].value_counts())

Total RNALocate samples: 12410
Class distribution:
 Cytoplasm                5310
Nucleus                  4855
Endoplasmic_reticulum    1185
Extracellular_region      710
Mitochondria              350
Name: location, dtype: int64
