# In [2]:
# task3_predictive_analytics.ipynb
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Load dataset
# The file is read without a header row, so column names are supplied here:
# an ID column, the Diagnosis label, then Feature_1 .. Feature_30.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
columns = ['ID', 'Diagnosis', *(f'Feature_{i}' for i in range(1, 31))]
data = pd.read_csv(url, names=columns, header=None)

# Preprocess data
data = data.drop('ID', axis=1)  # Remove irrelevant column

# Map Diagnosis: Malignant=1 (High Priority), Benign=0 (Low Priority)
# LabelEncoder assigns integer codes in sorted label order, so the mapping
# above only holds when the labels are exactly 'B' and 'M'. Verify that
# explicitly rather than silently training on a shifted encoding.
le = LabelEncoder()
le.fit(data['Diagnosis'])
if list(le.classes_) != ['B', 'M']:
    raise ValueError(
        f"Unexpected Diagnosis labels {list(le.classes_)}; expected ['B', 'M']"
    )
data['Diagnosis'] = le.transform(data['Diagnosis'])

# Handle missing values
# Every remaining column is numeric at this point, so each NaN is replaced by
# its column mean.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split below, so test rows contribute to the imputed values.
data = data.fillna(data.mean())

# Split data
# The target is the Diagnosis column; every other column is a feature.
# 20% of the rows are held out for testing, with a fixed seed so the split
# is reproducible.
y = data['Diagnosis']
X = data.drop(columns='Diagnosis')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train model
# fit() returns the estimator itself, so construction and training are chained.
# The fixed random_state keeps the forest reproducible between runs.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Evaluate model
# Both metrics are computed on the held-out test set predictions.
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

# Print results
# One metric per line, rounded to two decimals.
print(f"Accuracy: {accuracy:.2f}", f"F1-Score: {f1:.2f}", sep="\n")

# Save model
# Persist the fitted classifier to the working directory. dump() is the last
# expression in the cell, so its return value is what the notebook displays.
import joblib
joblib.dump(value=model, filename='priority_classifier.pkl')

# Output:
# Accuracy: 0.96
# F1-Score: 0.96
#
# ['priority_classifier.pkl']