In [10]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import joblib
from preprocess import DataPreprocessor

# Load the dataset (path is resolved relative to the current working directory)
data_path = 'data.csv'  # Ensure this path is correct
data = pd.read_csv(data_path)

# Drop unnecessary columns. Reassigning (rather than inplace=True) makes it
# explicit which frame this step produces; a missing column still raises KeyError.
data = data.drop(columns=['Unnamed: 32', 'id'])

# Encode diagnosis column: 1 for Malignant ("M"), 0 for Benign ("B").
# An explicit mapping is used so that missing or unexpected labels are reported
# instead of being silently counted as benign, which would corrupt the targets.
diagnosis_codes = data['diagnosis'].map({'M': 1, 'B': 0})
unmapped_rows = diagnosis_codes.isna()
if unmapped_rows.any():
    unexpected_labels = data.loc[unmapped_rows, 'diagnosis'].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'diagnosis' column (expected 'M' or 'B'): {unexpected_labels}"
    )
data['diagnosis'] = diagnosis_codes.astype('int64')

# Extract features (every remaining column) and labels (NumPy array of 0/1)
X = data.drop(columns='diagnosis')
y = data['diagnosis'].values

# Check for NaN values in the feature matrix before it reaches the pipeline
print("NaN values in the dataset before preprocessing:", X.isna().sum().sum())

# Split the data into training and test sets: 15% held out, fixed seed so the
# split (and therefore the reported score) is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Create the pipeline: project-local preprocessing step followed by the classifier.
# NOTE(review): DataPreprocessor is defined in preprocess.py and is not visible
# here; its exact transformations should be confirmed against that module.
pipeline = Pipeline([
    ('preprocessor', DataPreprocessor()),
    # class_weight='balanced' reweights classes inversely to their frequency
    ('log_reg', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

# Train the model (the preprocessor is fitted on the training split only)
pipeline.fit(X_train, y_train)

# Save the fitted pipeline to model_lg.sav. The file is pickle-based, so the
# `preprocess` module must be importable wherever the model is loaded.
joblib.dump(pipeline, 'model_lg.sav')

# Evaluate the model: mean accuracy on the held-out test split
test_score = pipeline.score(X_test, y_test)
print(f'Test accuracy: {test_score}')


NaN values in the dataset before preprocessing: 0
Test accuracy: 0.9883720930232558
