In [27]:
!pip install pandas numpy scikit-learn matplotlib seaborn streamlit xgboost joblib



In [28]:
import os
os.makedirs("models", exist_ok=True)

In [29]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()

X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

print("Dataset shape:", X.shape)

X.head()

Dataset shape: (569, 30)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [30]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()

X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

print("Dataset shape:", X.shape)

X.head()

Dataset shape: (569, 30)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

print("Train:", X_train.shape)
print("Test:", X_test.shape)

Train: (455, 30)
Test: (114, 30)


In [32]:
import pickle

feature_names = X.columns.tolist()

with open("models/feature_names.pkl", "wb") as f:
    pickle.dump(feature_names, f)

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

In [34]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef

In [35]:
import joblib

models = {

    "logistic_regression": LogisticRegression(max_iter=5000),

    "decision_tree": DecisionTreeClassifier(),

    "knn": KNeighborsClassifier(),

    "naive_bayes": GaussianNB(),

    "random_forest": RandomForestClassifier(),

    "xgboost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')

}

results = []

for name, model in models.items():

    print("Training:", name)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    y_prob = model.predict_proba(X_test)[:,1]

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    results.append([
        name,
        accuracy,
        auc,
        precision,
        recall,
        f1,
        mcc
    ])

    joblib.dump(model, f"models/{name}.pkl")

Training: logistic_regression
Training: decision_tree
Training: knn
Training: naive_bayes
Training: random_forest
Training: xgboost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [36]:
results_df = pd.DataFrame(
    results,
    columns=[
        "Model",
        "Accuracy",
        "AUC",
        "Precision",
        "Recall",
        "F1 Score",
        "MCC"
    ]
)

results_df

Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1 Score,MCC
0,logistic_regression,0.95614,0.997707,0.945946,0.985915,0.965517,0.906811
1,decision_tree,0.938596,0.932362,0.944444,0.957746,0.951049,0.86886
2,knn,0.95614,0.995906,0.934211,1.0,0.965986,0.908615
3,naive_bayes,0.973684,0.998362,0.959459,1.0,0.97931,0.944733
4,random_forest,0.95614,0.995578,0.958333,0.971831,0.965035,0.906379
5,xgboost,0.95614,0.990829,0.958333,0.971831,0.965035,0.906379


In [37]:
results_df.to_csv("models/results.csv", index=False)

In [38]:
os.listdir("models")

['logistic_regression.pkl',
 'decision_tree.pkl',
 'knn.pkl',
 'naive_bayes.pkl',
 'random_forest.pkl',
 'xgboost.pkl',
 'results.csv',
 'feature_names.pkl']

In [39]:
import os
os.path.abspath("models/results.csv")

'/home/cloud/models/results.csv'

In [40]:
import streamlit as st
import pandas as pd
import joblib

st.title("BITS ML Assignment 2 - Classification Models")

# load results
results = pd.read_csv("models/results.csv")

st.subheader("Model Performance")

st.dataframe(results)

# dropdown
model_name = st.selectbox(
    "Select Model",
    results["Model"]
)

# load selected model
model = joblib.load(f"models/{model_name}.pkl")

# upload csv
uploaded_file = st.file_uploader("Upload CSV file")

if uploaded_file is not None:

    data = pd.read_csv(uploaded_file)

    predictions = model.predict(data)

    st.write("Predictions:")

    st.write(predictions)



In [55]:
# Save correct test CSV for prediction
X_test.to_csv("test_prediction.csv", index=False)

print("test_prediction.csv created successfully")

test_prediction.csv created successfully
