In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Load the dataset
data = pd.read_csv('/Users/anushkakondur/hacklytics25/BRCA.csv')

# Drop rows with missing values
data.dropna(inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
data['Histology'] = label_encoder.fit_transform(data['Histology'])
data['HER2 status'] = label_encoder.fit_transform(data['HER2 status'])

# Define features and target
features = ['Protein1', 'Protein2', 'Protein3', 'Protein4', 'HER2 status']
X = data[features]
y = data['Histology']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model (Random Forest as an example)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

# If accuracy is below 92%, try XGBoost
if accuracy < 0.92:
    model_xgb = xgb.XGBClassifier(n_estimators=100, random_state=42)
    model_xgb.fit(X_train, y_train)
    y_pred_xgb = model_xgb.predict(X_test)
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
    print(f'XGBoost Model Accuracy: {accuracy_xgb * 100:.2f}%')

Model Accuracy: 70.31%
XGBoost Model Accuracy: 64.06%


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Load data and drop missing values
data = pd.read_csv('/Users/anushkakondur/hacklytics25/BRCA.csv')
data.dropna(inplace=True)

# Use SEPARATE encoders for Histology and HER2 status
histology_encoder = LabelEncoder()
her2_encoder = LabelEncoder()

# Encode categorical features
data['Histology_Encoded'] = histology_encoder.fit_transform(data['Histology'])
data['HER2_Encoded'] = her2_encoder.fit_transform(data['HER2 status'])

# Define features and target
features = ['Protein1', 'Protein2', 'Protein3', 'Protein4', 'HER2_Encoded']
X = data[features]
y = data['Histology_Encoded']

# Split data (stratify to preserve class distribution)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and decode labels
y_pred = model.predict(X_test)
predicted_histology = histology_encoder.inverse_transform(y_pred)
actual_histology = histology_encoder.inverse_transform(y_test)

# Add results to test set
results = X_test.copy()
results['Predicted Histology'] = predicted_histology
results['Actual Histology'] = actual_histology

# Display relevant columns
print(results[['Protein1', 'Protein2', 'Protein3', 'Protein4', 'HER2_Encoded', 'Predicted Histology', 'Actual Histology']])

     Protein1  Protein2  Protein3  Protein4  HER2_Encoded  \
293   0.78321  -0.84140  1.507000  0.816290             0   
262   0.75382   1.64250 -0.332850  0.857860             0   
272  -0.48903   2.37840 -0.467090  0.097310             0   
80    0.40384   0.48825  0.841110  0.616450             0   
142   0.22611   1.74910 -0.543970 -0.390210             1   
..        ...       ...       ...       ...           ...   
159  -0.20347   1.09770 -0.046476 -0.045144             0   
188   0.52751  -0.62123 -0.546990  0.391060             0   
246   0.45517   1.17740 -0.574560  0.366350             0   
161   0.67269  -0.63329  0.008736  0.555640             0   
324   0.15050   1.26740  0.642650  0.261420             0   

               Predicted Histology                Actual Histology  
293  Infiltrating Ductal Carcinoma  Infiltrating Lobular Carcinoma  
262  Infiltrating Ductal Carcinoma   Infiltrating Ductal Carcinoma  
272  Infiltrating Ductal Carcinoma   Infiltrating Ductal Car

In [13]:
print(data['Histology'].value_counts())

Histology
Infiltrating Ductal Carcinoma     224
Infiltrating Lobular Carcinoma     81
Mucinous Carcinoma                 12
Name: count, dtype: int64


In [14]:
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # Assigns weights inversely proportional to class frequencies
    random_state=42
)

In [15]:
import xgboost as xgb

model = xgb.XGBClassifier(
    n_estimators=100,
    scale_pos_weight=1,  # Adjust if classes are imbalanced
    eval_metric='mlogloss',
    random_state=42
)
model.fit(X_train, y_train)

Parameters: { "scale_pos_weight" } are not used.



In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from xgboost import XGBClassifier

# Load data
data = pd.read_csv('/Users/anushkakondur/hacklytics25/BRCA.csv')
data.dropna(inplace=True)

# Separate encoders
histology_encoder = LabelEncoder()
her2_encoder = LabelEncoder()

# Encode labels
data['Histology_Encoded'] = histology_encoder.fit_transform(data['Histology'])
data['HER2_Encoded'] = her2_encoder.fit_transform(data['HER2 status'])

# Features (add more if allowed)
features = [
    'Protein1', 'Protein2', 'Protein3', 'Protein4',
    'HER2_Encoded', 
    # 'Tumour_Stage', 'ER status', 'PR status'  # Uncomment if allowed
]
X = data[features]
y = data['Histology_Encoded']

# Split data (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    stratify=y,  # Critical for imbalance
    random_state=42
)

# Pipeline: SMOTE + XGBoost
model = make_pipeline(
    SMOTE(random_state=42),  # Oversamples minority classes
    XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        scale_pos_weight=3,  # Penalize misclassifying minority classes
        eval_metric='mlogloss',
        random_state=42
    )
)

# Train
model.fit(X_train, y_train)

# Predict
y_pred = model.named_steps['xgbclassifier'].predict(X_test)
y_pred_decoded = histology_encoder.inverse_transform(y_pred)
y_test_decoded = histology_encoder.inverse_transform(y_test)

# Evaluation
print("\nClassification Report:")
print(classification_report(y_test_decoded, y_pred_decoded))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test_decoded, y_pred_decoded))

# Predictions table
results = X_test.copy()
results['Predicted Histology'] = y_pred_decoded
results['Actual Histology'] = y_test_decoded
print("\nSample Predictions:")
print(results[['Protein1', 'Protein2', 'Protein3', 'Protein4', 
               'HER2_Encoded', 'Predicted Histology', 'Actual Histology']].sample(10))

Parameters: { "scale_pos_weight" } are not used.




Classification Report:
                                precision    recall  f1-score   support

 Infiltrating Ductal Carcinoma       0.64      0.51      0.57        45
Infiltrating Lobular Carcinoma       0.13      0.12      0.13        16
            Mucinous Carcinoma       0.08      0.33      0.12         3

                      accuracy                           0.41        64
                     macro avg       0.28      0.32      0.27        64
                  weighted avg       0.49      0.41      0.44        64


Confusion Matrix:
[[23 13  9]
 [11  2  3]
 [ 2  0  1]]

Sample Predictions:
     Protein1  Protein2  Protein3  Protein4  HER2_Encoded  \
262  0.753820  1.642500 -0.332850  0.857860             0   
148 -0.449040 -0.720160 -0.111630 -0.034973             1   
313 -0.091334  0.054757  0.034648 -0.588830             0   
31   0.649030  1.424000 -0.395360  1.184800             0   
15   0.142520  1.082700  0.210920  0.974280             0   
218  0.405530  0.331440 -0

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Load data
data = pd.read_csv('BRCA.csv')
data.dropna(inplace=True)

# Encode labels
histology_encoder = LabelEncoder()
her2_encoder = LabelEncoder()
data['Histology_Encoded'] = histology_encoder.fit_transform(data['Histology'])
data['HER2_Encoded'] = her2_encoder.fit_transform(data['HER2 status'])

# Encode Tumour_Stage (ordinal: I=1, II=2, III=3)
tumour_stage_encoder = OrdinalEncoder(categories=[['I', 'II', 'III']])
data['Tumour_Stage_Encoded'] = tumour_stage_encoder.fit_transform(
    data[['Tumour_Stage']]
).astype(int)

# Features (all numerical)
features = [
    'Protein1', 'Protein2', 'Protein3', 'Protein4',
    'HER2_Encoded', 'Age', 'Tumour_Stage_Encoded'  # Use encoded tumour stage
]
X = data[features]
y = data['Histology_Encoded']

# Split data (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Calculate class weights
class_counts = y_train.value_counts().to_dict()
total_samples = sum(class_counts.values())
class_weights = {
    cls: total_samples / (len(class_counts) * count)
    for cls, count in class_counts.items()
}

# Pipeline: SMOTE + XGBoost
model = make_pipeline(
    SMOTE(random_state=42),
    XGBClassifier(
        objective='multi:softmax',
        num_class=3,
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3,
        random_state=42
    )
)

# Train
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(
    y_test, y_pred, 
    target_names=histology_encoder.classes_
))

                                precision    recall  f1-score   support

 Infiltrating Ductal Carcinoma       0.68      0.67      0.67        45
Infiltrating Lobular Carcinoma       0.00      0.00      0.00        16
            Mucinous Carcinoma       0.00      0.00      0.00         3

                      accuracy                           0.47        64
                     macro avg       0.23      0.22      0.22        64
                  weighted avg       0.48      0.47      0.47        64



In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Load data
data = pd.read_csv('BRCA.csv')
data.dropna(inplace=True)

# Encode target
histology_encoder = LabelEncoder()
data['Histology_Encoded'] = histology_encoder.fit_transform(data['Histology'])

# Preprocess features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', ['Protein1', 'Protein2', 'Protein3', 'Protein4', 'Age']),
        ('cat', OneHotEncoder(), ['Tumour_Stage', 'HER2 status', 'ER status', 'PR status'])
    ]
)

X = preprocessor.fit_transform(data)
y = data['Histology_Encoded']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Check actual class distribution in training data
print("Original class distribution in training data:")
print(pd.Series(y_train).value_counts())

# Handle class imbalance (only oversample classes that need it)
smote = SMOTE(
    sampling_strategy={
        1: 100,  # Double "Infiltrating Lobular Carcinoma" (from 65 → 100)
        2: 30    # Increase "Mucinous Carcinoma" aggressively (e.g., from 3 → 30)
    },
    random_state=42
)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Verify new class distribution
print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_res).value_counts())

# Simplified model
model = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_estimators=300,
    learning_rate=0.01,
    max_depth=3,
    random_state=42
)

# Train
model.fit(X_train_res, y_train_res)

# Evaluate
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(
    y_test, y_pred,
    target_names=histology_encoder.classes_
))

Original class distribution in training data:
Histology_Encoded
0    179
1     65
2      9
Name: count, dtype: int64

Class distribution after SMOTE:
Histology_Encoded
0    179
1    100
2     30
Name: count, dtype: int64

Classification Report:
                                precision    recall  f1-score   support

 Infiltrating Ductal Carcinoma       0.70      0.84      0.77        45
Infiltrating Lobular Carcinoma       0.33      0.19      0.24        16
            Mucinous Carcinoma       0.00      0.00      0.00         3

                      accuracy                           0.64        64
                     macro avg       0.35      0.34      0.34        64
                  weighted avg       0.58      0.64      0.60        64



In [23]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(
    sampling_strategy={
        1: 100,  # Infiltrating Lobular Carcinoma (from 65)
        2: 50    # Mucinous Carcinoma (from 9)
    },
    k_neighbors=3,  # Allow smaller neighborhoods for tiny classes
    random_state=42
)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [24]:
model = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_estimators=500,
    learning_rate=0.1,
    max_depth=4,
    scale_pos_weight=[1, 5, 10],  # Weight Mucinous 10x higher
    reg_alpha=0.5,
    reg_lambda=0.5,
    random_state=42
)

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Load data
data = pd.read_csv('BRCA.csv')
data.dropna(inplace=True)

# Feature Engineering
data['Protein1_Protein2'] = data['Protein1'] * data['Protein2']
data['Protein3_HER2'] = data['Protein3'] * data['HER2 status'].map({'Negative': 0, 'Positive': 1})

# Encode target
histology_encoder = LabelEncoder()
data['Histology_Encoded'] = histology_encoder.fit_transform(data['Histology'])

# Preprocess features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', [
            'Protein1', 'Protein2', 'Protein3', 'Protein4', 
            'Protein1_Protein2', 'Protein3_HER2', 'Age'
        ]),
        ('cat', OneHotEncoder(), ['Tumour_Stage', 'ER status', 'PR status'])
    ]
)

X = preprocessor.fit_transform(data)
y = data['Histology_Encoded']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Calculate class weights (aggressive for Mucinous)
class_weights = {
    0: 1,   # Ductal
    1: 3,   # Lobular
    2: 10   # Mucinous (most penalized)
}

# SMOTE oversampling
smote = SMOTE(
    sampling_strategy={
        1: 100,  # Lobular: 65 → 100
        2: 30    # Mucinous: 9 → 30
    },
    k_neighbors=3,  # Allow smaller neighborhoods
    random_state=42
)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# XGBoost with explicit class weights
model = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_estimators=500,
    learning_rate=0.1,
    max_depth=4,
    reg_alpha=0.5,
    reg_lambda=0.5,
    scale_pos_weight=[class_weights[0], class_weights[1], class_weights[2]],
    random_state=42
)

# Train
model.fit(X_train_res, y_train_res)

# Evaluate
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(
    y_test, y_pred,
    target_names=histology_encoder.classes_
))

# Feature Importance Analysis (Critical!)
print("\nTop 10 Features:")
feature_importances = pd.Series(
    model.feature_importances_,
    index=preprocessor.get_feature_names_out()
)
print(feature_importances.sort_values(ascending=False).head(10))

Parameters: { "scale_pos_weight" } are not used.




Classification Report:
                                precision    recall  f1-score   support

 Infiltrating Ductal Carcinoma       0.71      0.76      0.73        45
Infiltrating Lobular Carcinoma       0.27      0.25      0.26        16
            Mucinous Carcinoma       0.00      0.00      0.00         3

                      accuracy                           0.59        64
                     macro avg       0.33      0.34      0.33        64
                  weighted avg       0.56      0.59      0.58        64


Top 10 Features:
cat__Tumour_Stage_II      0.194062
cat__Tumour_Stage_III     0.125607
num__Age                  0.094776
num__Protein3_HER2        0.094291
cat__Tumour_Stage_I       0.093797
num__Protein2             0.093226
num__Protein4             0.081594
num__Protein1_Protein2    0.074313
num__Protein3             0.074219
num__Protein1             0.074114
dtype: float32


In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Load data
data = pd.read_csv('BRCA.csv')
data.dropna(inplace=True)

# Feature Engineering
data['Protein1_Protein2'] = data['Protein1'] * data['Protein2']
data['Protein3_HER2'] = data['Protein3'] * data['HER2 status'].map({'Negative': 0, 'Positive': 1})

# Encode target
histology_encoder = LabelEncoder()
data['Histology_Encoded'] = histology_encoder.fit_transform(data['Histology'])

# Preprocess features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', [
            'Protein1', 'Protein2', 'Protein3', 'Protein4', 
            'Protein1_Protein2', 'Protein3_HER2', 'Age'
        ]),
        ('cat', OneHotEncoder(), ['Tumour_Stage', 'ER status', 'PR status'])
    ]
)

X = preprocessor.fit_transform(data)
y = data['Histology_Encoded']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Calculate class weights
class_counts = y_train.value_counts()
total_samples = sum(class_counts)
class_weights = {
    cls: total_samples / (len(class_counts) * count) 
    for cls, count in class_counts.items()
}

# SMOTE oversampling
smote = SMOTE(
    sampling_strategy={
        1: 100,  # Lobular: 65 → 100
        2: 30    # Mucinous: 9 → 30
    },
    k_neighbors=3,
    random_state=42
)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Map class weights to resampled data
sample_weights = y_train_res.map(class_weights)

# XGBoost with sample weights
model = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_estimators=500,
    learning_rate=0.1,
    max_depth=4,
    reg_alpha=0.5,
    reg_lambda=0.5,
    random_state=42
)

# Train with sample weights
model.fit(X_train_res, y_train_res, sample_weight=sample_weights)

# Evaluate
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(
    y_test, y_pred,
    target_names=histology_encoder.classes_
))


Classification Report:
                                precision    recall  f1-score   support

 Infiltrating Ductal Carcinoma       0.74      0.62      0.67        45
Infiltrating Lobular Carcinoma       0.24      0.31      0.27        16
            Mucinous Carcinoma       0.20      0.33      0.25         3

                      accuracy                           0.53        64
                     macro avg       0.39      0.42      0.40        64
                  weighted avg       0.59      0.53      0.55        64



In [28]:
data['Protein1/Protein2'] = data['Protein1'] / (data['Protein2'] + 1e-6)  # Avoid division by zero
data['Protein3_Protein4_Interaction'] = data['Protein3'] * data['Protein4']

In [29]:
data['ER_PR_Interaction'] = data['ER status'].map({'Positive': 1, 'Negative': 0}) * \
                            data['PR status'].map({'Positive': 1, 'Negative': 0})

In [30]:
data['Age_Group'] = pd.cut(data['Age'], bins=[30, 50, 70, 90], labels=['Young', 'Middle', 'Senior'])
data['Protein1_Quartile'] = pd.qcut(data['Protein1'], q=4, labels=False)