# In this notebook we divide our data and assign each part to a model

In [1]:
from pathlib import Path

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from imblearn.over_sampling import SMOTE
from ipywidgets import interact
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [2]:
# Display settings: show every column and four decimal places in rendered frames.
display_options = {
    'display.max_columns': None,
    'display.precision': 4,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Load the full gastric-cancer dataset from its relative path.
gc_data = pd.read_csv('../datasets/MainDataset/gastric_cancer_detection_dataset.csv')

# Preview the first five rows.
gc_data.head(5)

Unnamed: 0,age,gender,ethnicity,geographical_location,family_history,smoking_habits,alcohol_consumption,helicobacter_pylori_infection,dietary_habits,existing_conditions,endoscopic_images,biopsy_results,ct_scan,mature_mirna_acc,mature_mirna_id,target_symbol,target_entrez,target_ensembl,diana_microt,elmmo,microcosm,miranda,mirdb,pictar,pita,targetscan,predicted.sum,all.sum,label
0,43,Male,Ethnicity_A,Other,1,0,0,0,Low_Salt,Chronic Gastritis,Normal,Negative,Negative,MIR123,MIR123_1,TP53,5034,1946626,0.3813,0.187,0.7864,0.2048,0.5619,0.4382,0.2836,0.9282,4.3243,7.6668,0
1,86,Female,Ethnicity_B,California,1,0,0,1,High_Salt,Diabetes,Normal,Negative,Negative,MIR123,MIR234_2,TP53,6901,1503178,0.9586,0.4933,0.964,0.498,0.9856,0.1446,0.3754,0.1036,7.9677,1.4833,0
2,68,Male,Ethnicity_A,California,0,1,1,0,High_Salt,,Normal,Negative,Negative,MIR345,MIR345_3,KRAS,4014,1787909,0.2695,0.5736,0.6669,0.5404,0.9059,0.8273,0.3509,0.1669,3.7487,3.0468,0
3,57,Female,Ethnicity_A,Other,0,0,0,1,High_Salt,Chronic Gastritis,Normal,Negative,Negative,MIR123,MIR123_1,KRAS,7351,1310766,0.3723,0.2614,0.9495,0.1342,0.4299,0.9352,0.7947,0.867,5.4783,8.8113,0
4,33,Male,Ethnicity_A,California,0,1,1,0,High_Salt,Diabetes,Abnormal,Negative,Negative,MIR345,MIR123_1,CDH1,7982,1277058,0.9747,0.7545,0.2632,0.8768,0.6508,0.3377,0.4275,0.9158,1.8092,0.3946,0


## Understanding the Dataset (Full Explanation)

In this step, we explain each feature in the dataset in a way that is accessible even to those without a background in medicine, biology, or genetics.

### 🔹 Demographic & Lifestyle Information

These features capture general information about the patient and their habits, which may influence the risk of developing gastric cancer.

- **age** *(Numerical)*: The age of the patient at the time of diagnosis or evaluation.
- **gender** *(Categorical: Male/Female)*: The biological sex of the patient.
- **ethnicity** *(Categorical)*: The ethnic background of the patient, stored as coded groups (e.g., `Ethnicity_A`, `Ethnicity_B`).
- **geographical_location** *(Categorical)*: The country or region where the data was collected. Environmental or regional factors can influence cancer rates.
- **family_history** *(Binary: Yes/No, stored as 1/0)*: Whether the patient has a family history of cancer, which may increase their risk.
- **smoking_habits** *(Binary: Yes/No, stored as 1/0)*: Indicates if the patient is a smoker, a known risk factor for gastric and many other cancers.
- **alcohol_consumption** *(Binary: Yes/No, stored as 1/0)*: Indicates whether the patient consumes alcohol, which can also be a risk factor.
- **dietary_habits** *(Categorical)*: Describes the patient’s eating patterns—such as diets high in salt—which can affect cancer risk.

### 🔹 Medical History & Diagnostic Results

These features involve the patient's medical history and results from medical tests that are commonly used to detect cancer.

- **existing_conditions** *(Categorical)*: Any pre-existing medical conditions like diabetes, gastritis, or hypertension, which may be related to overall health and cancer risk.
- **helicobacter_pylori_infection** *(Binary: Yes/No, stored as 1/0)*: Indicates whether the patient is infected with *Helicobacter pylori*, a bacterium strongly linked to stomach cancer.
- **endoscopic_images** *(Categorical: Normal/Abnormal)*: The result of an endoscopy, which uses a camera to look inside the stomach. An abnormal result may show signs of tumors or ulcers.
- **biopsy_results** *(Categorical: Positive/Negative)*: Whether a tissue sample taken from the stomach tested positive (cancerous) or negative (non-cancerous).
- **ct_scan** *(Categorical: Positive/Negative)*: Results from a CT (computed tomography) scan. A positive result may indicate the presence of a tumor.

### 🔹 Genetic & Molecular Data (miRNA and Target Genes)

These columns relate to *microRNAs (miRNAs)*, which are small non-coding RNA molecules that regulate gene expression. Changes in miRNA behavior can be linked to cancer. Each miRNA can bind to target genes and affect how much of the protein is produced.

- **mature_mirna_acc** *(Categorical)*: A unique identifier (accession number) for the mature miRNA.
- **mature_mirna_id** *(Categorical)*: The name of the mature miRNA. Real-world IDs look like *hsa-miR-21*, where "hsa" refers to humans; in this dataset they appear in a simplified form such as `MIR123_1`.
- **target_symbol** *(Categorical)*: The shorthand symbol for a gene that the miRNA might regulate.
- **target_entrez** *(Categorical)*: A unique numerical ID for the gene in the NCBI Entrez database.
- **target_ensembl** *(Categorical)*: A unique identifier for the gene in the Ensembl database, another widely used genomic database.

### 🔹 miRNA–Gene Interaction Prediction Scores

These columns come from bioinformatics tools that predict whether a miRNA can interact with a specific gene. Each tool uses different algorithms or biological data to make these predictions. Higher scores generally indicate stronger or more confident interactions.

- **diana_microt** *(Numerical)*: Score from the DIANA-microT algorithm.
- **elmmo** *(Numerical)*: Confidence score from the ElMMo tool.
- **microcosm** *(Numerical)*: Score from the Microcosm prediction database.
- **miranda** *(Numerical)*: Score from the MiRanda alignment-based algorithm.
- **mirdb** *(Numerical)*: Confidence score from the miRDB database.
- **pictar** *(Numerical)*: Predicted interaction score from the PicTar algorithm.
- **pita** *(Numerical)*: Score based on how accessible a gene is to a miRNA (from the PITA algorithm).
- **targetscan** *(Numerical)*: Score from TargetScan, one of the most widely used miRNA target prediction tools.

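- **predicted.sum** *(Numerical)*: Aggregate score across prediction algorithms. Indicates overall prediction confidence from computational tools. Note: in the sample rows shown above, this value does not equal the plain sum of the eight score columns (row 0: the scores add up to 3.7714, while `predicted.sum` is 4.3243), so treat it as a separate feature rather than a derived one.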
- **all.sum** *(Numerical)*: May include additional scoring beyond prediction tools, such as experimentally validated data.

### 🎯 Target Variable

- **label** *(Binary: 1 = Cancer, 0 = No Cancer)*: This is the **output** or **label** column we aim to predict using machine learning. It tells whether the patient has been diagnosed with gastric cancer.

---

### 🧠 Summary
This dataset combines:
- Basic demographic and lifestyle risk factors
- Clinical and diagnostic test results
- Advanced genomic data related to miRNAs and their gene targets

Together, this rich set of features allows for a powerful machine learning model that can potentially detect gastric cancer based on diverse types of input data.



In [3]:
# List the column labels of the loaded dataset.
gc_data.columns

Index(['age', 'gender', 'ethnicity', 'geographical_location', 'family_history',
       'smoking_habits', 'alcohol_consumption',
       'helicobacter_pylori_infection', 'dietary_habits',
       'existing_conditions', 'endoscopic_images', 'biopsy_results', 'ct_scan',
       'mature_mirna_acc', 'mature_mirna_id', 'target_symbol', 'target_entrez',
       'target_ensembl', 'diana_microt', 'elmmo', 'microcosm', 'miranda',
       'mirdb', 'pictar', 'pita', 'targetscan', 'predicted.sum', 'all.sum',
       'label'],
      dtype='object')

# 📊 Feature Division by Model

We divide the dataset's features among three models, based on the stage of detection and the nature of the data. Note that `helicobacter_pylori_infection` is used by both Model 1 and Model 3.

---

## 🔹 Model 1: Early Risk Screening
**Goal:** Predict early risk based on lifestyle, demographic, and infection data.

**Used Features:**
- `Age`
- `Gender`
- `Ethnicity`
- `Geographical Location`
- `Family History`
- `Smoking Habits`
- `Alcohol Consumption`
- `Helicobacter Pylori Infection`
- `Dietary Habits`
- `Existing Conditions`

---

## 🔹 Model 2: Genetic Risk Model
**Goal:** Use genetic/molecular features to assess genetic predisposition among "safe" users from Model 1.

**Used Features:**
- `Mature miRNA Accession`
- `Mature miRNA ID`
- `Target Symbol`
- `Target Entrez`
- `Target Ensembl`
- `DIANA microT Score`
- `ElMMo Score`
- `Microcosm Score`
- `MiRanda Score`
- `miRDB Score`
- `PicTar Score`
- `PITA Score`
- `TargetScan Score`
- `Predicted Sum`
- `All Sum`

---

## 🔹 Model 3: Post-Clinical Decision Support
**Goal:** Support diagnosis after clinical tests, helping in confirmation or doctor-assisted decision making.

**Used Features:**
- `Endoscopic Images` (as Normal/Abnormal tag or processed features)
- `Biopsy Results` (Positive/Negative)
- `CT Scan` (Positive/Negative)
- `Helicobacter Pylori Infection` (Yes/No)

> 💡 Optionally: Combine outputs from Model 1 & 2 for more context-aware analysis.

---

## ✅ Target Variable (for all models)
- `Label`: Binary (1: Cancer, 0: No Cancer)



In [5]:
# Model 1 (early risk screening) columns: demographic, lifestyle, infection and
# medical-history features, followed by the target column.
early_risk_columns = [
    'age', 'gender', 'ethnicity', 'geographical_location',
    'family_history', 'smoking_habits', 'alcohol_consumption',
    'helicobacter_pylori_infection', 'dietary_habits', 'existing_conditions',
    'label',
]

# Select those columns from the full dataset and preview the first five rows.
early_risk_data = gc_data[early_risk_columns]
early_risk_data.head(5)

Unnamed: 0,age,gender,ethnicity,geographical_location,family_history,smoking_habits,alcohol_consumption,helicobacter_pylori_infection,dietary_habits,existing_conditions,label
0,43,Male,Ethnicity_A,Other,1,0,0,0,Low_Salt,Chronic Gastritis,0
1,86,Female,Ethnicity_B,California,1,0,0,1,High_Salt,Diabetes,0
2,68,Male,Ethnicity_A,California,0,1,1,0,High_Salt,,0
3,57,Female,Ethnicity_A,Other,0,0,0,1,High_Salt,Chronic Gastritis,0
4,33,Male,Ethnicity_A,California,0,1,1,0,High_Salt,Diabetes,0


In [6]:
# (rows, columns) of the early-risk subset.
early_risk_data.shape

(212354, 11)

In [7]:
# Model 2 (genetic risk) columns: miRNA / target-gene identifiers, the eight
# per-tool prediction scores, the two aggregate score columns, then the target.
Genetic_Risk_columns = [
    'mature_mirna_acc', 'mature_mirna_id',
    'target_symbol', 'target_entrez', 'target_ensembl',
    'diana_microt', 'elmmo', 'microcosm', 'miranda',
    'mirdb', 'pictar', 'pita', 'targetscan',
    'predicted.sum', 'all.sum',
    'label',
]

# Select those columns from the full dataset and preview the first five rows.
genetic_risk_data = gc_data[Genetic_Risk_columns]
genetic_risk_data.head(5)

Unnamed: 0,mature_mirna_acc,mature_mirna_id,target_symbol,target_entrez,target_ensembl,diana_microt,elmmo,microcosm,miranda,mirdb,pictar,pita,targetscan,predicted.sum,all.sum,label
0,MIR123,MIR123_1,TP53,5034,1946626,0.3813,0.187,0.7864,0.2048,0.5619,0.4382,0.2836,0.9282,4.3243,7.6668,0
1,MIR123,MIR234_2,TP53,6901,1503178,0.9586,0.4933,0.964,0.498,0.9856,0.1446,0.3754,0.1036,7.9677,1.4833,0
2,MIR345,MIR345_3,KRAS,4014,1787909,0.2695,0.5736,0.6669,0.5404,0.9059,0.8273,0.3509,0.1669,3.7487,3.0468,0
3,MIR123,MIR123_1,KRAS,7351,1310766,0.3723,0.2614,0.9495,0.1342,0.4299,0.9352,0.7947,0.867,5.4783,8.8113,0
4,MIR345,MIR123_1,CDH1,7982,1277058,0.9747,0.7545,0.2632,0.8768,0.6508,0.3377,0.4275,0.9158,1.8092,0.3946,0


In [8]:
# (rows, columns) of the genetic-risk subset.
genetic_risk_data.shape

(212354, 16)

In [10]:
# Model 3 (post-clinical decision support) columns: H. pylori status plus the
# three clinical test results, then the target column.
Post_Clinical_Decision_columns = [
    'helicobacter_pylori_infection',
    'endoscopic_images', 'biopsy_results', 'ct_scan',
    'label',
]

# Select those columns from the full dataset and preview the first five rows.
post_clinical_decision_data = gc_data[Post_Clinical_Decision_columns]
post_clinical_decision_data.head(5)

Unnamed: 0,helicobacter_pylori_infection,endoscopic_images,biopsy_results,ct_scan,label
0,0,Normal,Negative,Negative,0
1,1,Normal,Negative,Negative,0
2,0,Normal,Negative,Negative,0
3,1,Normal,Negative,Negative,0
4,0,Abnormal,Negative,Negative,0


In [11]:
# (rows, columns) of the post-clinical subset.
post_clinical_decision_data.shape

(212354, 5)

## Saving each dataset

In [12]:
# Write each model-specific subset to its own CSV file.
# DataFrame.to_csv does not create missing parent folders, so every target
# folder is created first; without this, a fresh checkout that lacks the
# output folders fails on the first write.
output_files = {
    '../datasets/Early Risk Screening Data/early_screening_data.csv': early_risk_data,
    '../datasets/Genetic Risk Model Data/genetic_data.csv': genetic_risk_data,
    '../datasets/Post-Clinical Decision Support/clinical_test_data.csv': post_clinical_decision_data,
}

for csv_path, subset in output_files.items():
    csv_path = Path(csv_path)
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    # index=False keeps the row index out of the saved file.
    subset.to_csv(csv_path, index=False)

# Only reached when every write above succeeded (a failure raises first).
print('All Files saved Successfully ✅')

All Files saved Successfully ✅
