# Model Training and Evaluation
Training and evaluating classification models using metrics such as accuracy, precision, recall, and F1-score with hyperparameter tuning.

In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
df = pd.read_csv("classification_dataset.csv")
df.head()


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [3]:
X = df.drop("target", axis=1)
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
print(X_train)
print(X_test)
print(y_train)
print(y_test)


     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
68         9.029         17.33           58.79      250.5          0.10660   
181       21.090         26.57          142.70     1311.0          0.11410   
63         9.173         13.86           59.20      260.9          0.07721   
248       10.650         25.22           68.01      347.0          0.09657   
60        10.170         14.88           64.55      311.9          0.11340   
..           ...           ...             ...        ...              ...   
71         8.888         14.64           58.79      244.0          0.09783   
106       11.640         18.33           75.17      412.5          0.11420   
270       14.290         16.82           90.30      632.6          0.06429   
435       13.980         19.62           91.12      599.5          0.10600   
102       12.180         20.52           77.22      458.7          0.08013   

     mean compactness  mean concavity  mean concave points  mea

In [6]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    })
    results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.973684,0.972222,0.985915,0.979021
1,Decision Tree,0.938596,0.957143,0.943662,0.950355
2,Random Forest,0.964912,0.958904,0.985915,0.972222


In [7]:
# Grid Search for Logistic Regression
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}
lr_grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_lr, cv=5)
lr_grid.fit(X_train_scaled, y_train)

# Randomized Search for Decision Tree
param_dist_dt = {
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}
dt_random = RandomizedSearchCV(DecisionTreeClassifier(), param_distributions=param_dist_dt, n_iter=5, cv=5, random_state=42)
dt_random.fit(X_train_scaled, y_train)

# Grid Search for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [10, None],
    'min_samples_split': [2, 5]
}
rf_grid = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5)
rf_grid.fit(X_train_scaled, y_train)


In [8]:

# Evaluate tuned models
tuned_models = {
    "Logistic Regression (Tuned)": lr_grid.best_estimator_,
    "Decision Tree (Tuned)": dt_random.best_estimator_,
    "Random Forest (Tuned)": rf_grid.best_estimator_
}

tuned_results = []
for name, model in tuned_models.items():
    y_pred = model.predict(X_test_scaled)
    tuned_results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    })
tuned_results_df = pd.DataFrame(tuned_results)
tuned_results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression (Tuned),0.991228,0.986111,1.0,0.993007
1,Decision Tree (Tuned),0.95614,0.945946,0.985915,0.965517
2,Random Forest (Tuned),0.964912,0.958904,0.985915,0.972222


In [9]:
final_df = pd.concat([results_df, tuned_results_df])
final_df.sort_values(by="F1 Score", ascending=False)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression (Tuned),0.991228,0.986111,1.0,0.993007
0,Logistic Regression,0.973684,0.972222,0.985915,0.979021
2,Random Forest,0.964912,0.958904,0.985915,0.972222
2,Random Forest (Tuned),0.964912,0.958904,0.985915,0.972222
1,Decision Tree (Tuned),0.95614,0.945946,0.985915,0.965517
1,Decision Tree,0.938596,0.957143,0.943662,0.950355
