In [None]:
!pip install -r requirements.txt

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import kagglehub
warnings.filterwarnings('ignore')

In [None]:
# Fetch the Kaggle dataset; the returned location is treated as a directory
# that contains data.csv, which is then read into the working frame.
path = kagglehub.dataset_download("uciml/breast-cancer-wisconsin-data")
df = pd.read_csv('/'.join((path, 'data.csv')))

In [None]:
# Quick orientation: report the (rows, columns) tuple, then let the notebook
# render the leading rows as the cell's output.
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
df.head(n=5)

In [None]:
# Data preprocessing: drop the 'id' and 'Unnamed: 32' columns so they are not
# treated as features downstream.
# errors='ignore' keeps this cell safe to re-run: once the columns are gone,
# a second plain drop would raise KeyError.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')
print("\nColumns after dropping ID and Unnamed:")
print(df.columns.tolist())

In [None]:
# Total number of NaN cells across the whole frame (per-column counts summed)
print("\nMissing values:", df.isna().sum().sum())

In [None]:
# Class balance: number of rows per diagnosis label, printed under a heading
print("\nTarget distribution:", df['diagnosis'].value_counts(), sep="\n")

In [None]:
# Encode target variable: 'M' -> 1, 'B' -> 0.
# Guarded so the cell can be re-run: pushing an already-numeric column through
# the letter lookup would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

In [None]:
# Descriptive statistics for every numeric column; the quartiles are spelled
# out explicitly (they match the pandas defaults).
print("\nFeature statistics:")
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Correlation analysis: keep the columns whose absolute correlation with the
# target exceeds 0.5 (the target itself is included, since it correlates
# perfectly with itself).
corr_matrix = df.corr()
strong_link_mask = corr_matrix["diagnosis"].abs() > 0.5
top_corr_features = corr_matrix.index[strong_link_mask]
print("\nFeatures with high correlation to diagnosis:", top_corr_features.tolist(), sep="\n")

In [None]:
# Create visualizations: a 2x2 overview grid, one named axis per panel
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_counts, ax_heatmap), (ax_box, ax_scatter) = axes

# Top-left: number of samples in each diagnosis class
sns.countplot(data=df, x='diagnosis', ax=ax_counts)
ax_counts.set(title='Diagnosis Distribution',
              xlabel='Diagnosis (0: Benign, 1: Malignant)')

# Top-right: pairwise correlations among the highly correlated columns
high_corr_matrix = df[top_corr_features].corr()
sns.heatmap(high_corr_matrix, annot=True, cmap='coolwarm', ax=ax_heatmap)
ax_heatmap.set_title('Correlation Heatmap (High Correlation Features)')

# Bottom-left: box plot of a single feature, split by diagnosis.
# The frame is melted to long form so the feature name lands on the x axis.
feature_to_plot = 'radius_mean'
df_melted = df.melt(id_vars=['diagnosis'], value_vars=[feature_to_plot])
sns.boxplot(data=df_melted, x='variable', y='value', hue='diagnosis', ax=ax_box)
ax_box.set(title=f'{feature_to_plot} by Diagnosis', xlabel='')

# Bottom-right: scatter plot of two size/texture features, colored by diagnosis
top_features = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'diagnosis']
sns.scatterplot(
    data=df[top_features],
    x='radius_mean',
    y='texture_mean',
    hue='diagnosis',
    ax=ax_scatter,
)
ax_scatter.set_title('Radius vs Texture by Diagnosis')

plt.tight_layout()
plt.show()

In [None]:
# Interactive 3D visualization: three mean-size/texture features on the axes,
# with the encoded diagnosis driving the point color.
axis_columns = dict(x='radius_mean', y='texture_mean', z='perimeter_mean')
fig = px.scatter_3d(
    df,
    color='diagnosis',
    title='3D Scatter Plot: Radius vs Texture vs Perimeter',
    labels={'diagnosis': 'Diagnosis (0: Benign, 1: Malignant)'},
    **axis_columns,
)
fig.show()

In [None]:
# Feature importance visualization
# Split the frame into the feature matrix and the encoded target.
X = df.drop(columns='diagnosis')
y = df['diagnosis']

# Fit a forest on all rows; in this cell it is used only to rank the features
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Pair each column with its importance score and order from most to least important
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
})
importance_df = importance_df.sort_values('importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features
plt.figure(figsize=(10, 8))
importance_ax = sns.barplot(data=importance_df.iloc[:15], x='importance', y='feature')
importance_ax.set_title('Top 15 Feature Importances')
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')
plt.show()

In [None]:
# Model comparison
# Hold out 20% of the rows for testing. stratify=y keeps the benign/malignant
# ratio the same in both splits, so the small test set is not skewed by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: statistics are learned on the training split only and then
# applied to the test split, so no test information leaks into the scaler.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize models (fixed seeds for repeatable runs)
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'SVM': SVC(random_state=42)
}

# Train and evaluate models; `results` maps model name -> test accuracy
results = {}
for name, model in models.items():
    # Random Forest is fitted on the raw features; the other models are fitted
    # on the standardized ones. Pick the matching train/test pair once so the
    # fit/predict calls are not duplicated per branch.
    if name == 'Random Forest':
        train_features, test_features = X_train, X_test
    else:
        train_features, test_features = X_train_scaled, X_test_scaled

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)

    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

In [None]:
# Model comparison visualization: one bar per model, annotated with its accuracy
plt.figure(figsize=(8, 5))
models_names = list(results.keys())
accuracies = list(results.values())
sns.barplot(x=models_names, y=accuracies, palette='viridis')
plt.ylabel('Accuracy')
plt.title('Model Comparison')
# Zoom in on 0.9-1.0 to separate close scores, but widen the window whenever a
# model scores 0.9 or lower so its bar is not clipped out of view.
lowest_accuracy = min(accuracies, default=1.0)
y_lower = 0.9 if lowest_accuracy > 0.9 else max(0.0, lowest_accuracy - 0.05)
plt.ylim(y_lower, 1.0)
for i, v in enumerate(accuracies):
    # Place the label just above the top of each bar
    plt.text(i, v + 0.001, f"{v:.4f}", ha='center')
plt.show()

In [None]:
# Confusion matrix for best model (highest test accuracy; first one wins ties)
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]

# Feed the winner the same feature representation it was trained on:
# raw features for Random Forest, standardized features for the others.
best_test_features = X_test if best_model_name == 'Random Forest' else X_test_scaled
y_pred_best = best_model.predict(best_test_features)

# Plot confusion matrix as an annotated heatmap of integer counts
best_confusion = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(6, 4))
confusion_ax = sns.heatmap(best_confusion, annot=True, fmt='d', cmap='Blues')
confusion_ax.set_title(f'Confusion Matrix - {best_model_name}')
confusion_ax.set_xlabel('Predicted')
confusion_ax.set_ylabel('Actual')
plt.show()

In [None]:
# Advanced visualization: parallel coordinates over the columns listed in
# `top_features`, with each line colored by the encoded diagnosis.
parallel_frame = df[top_features]
fig = px.parallel_coordinates(
    parallel_frame,
    title='Parallel Coordinates Plot for Top Features',
    color='diagnosis',
    color_continuous_scale=px.colors.diverging.Tealrose,
)
fig.show()

In [None]:
# Distribution of top features by diagnosis: one histogram (with KDE overlay)
# per feature, laid out on a 3x2 grid in row-major order.
fig, axes = plt.subplots(3, 2, figsize=(15, 12))
top_features_list = importance_df['feature'].head(6).tolist()

panel_axes = axes.flatten()
for panel_ax, feature in zip(panel_axes, top_features_list):
    sns.histplot(data=df, x=feature, hue='diagnosis', kde=True, ax=panel_ax)
    panel_ax.set_title(f'Distribution of {feature}')

# Remove only the panels that received no plot. With six features the grid is
# full and nothing is removed; unconditionally deleting the last panel would
# throw away the sixth feature's histogram.
for unused_ax in panel_axes[len(top_features_list):]:
    unused_ax.remove()
plt.tight_layout()
plt.show()

In [None]:
# Summary statistics by diagnosis: mean and standard deviation of every column,
# computed separately for each diagnosis class.
rows_by_diagnosis = df.groupby('diagnosis')
summary_stats = rows_by_diagnosis.agg(['mean', 'std'])
print("\nSummary Statistics by Diagnosis:")
summary_stats.head(n=5)

In [None]:
# Radar chart for feature comparison: per-class averages of the five features
# ranked highest by the importance table.
top_5_features = importance_df['feature'].head(5).tolist()
benign_mean = df.loc[df['diagnosis'] == 0, top_5_features].mean()
malignant_mean = df.loc[df['diagnosis'] == 1, top_5_features].mean()

# One filled polygon per class, both drawn over the same feature axes
fig = go.Figure()
for class_label, class_mean in (('Benign', benign_mean), ('Malignant', malignant_mean)):
    fig.add_trace(go.Scatterpolar(
        r=class_mean.values,
        theta=top_5_features,
        fill='toself',
        name=class_label
    ))

# The radial axis runs from zero to the largest average in either class
radial_upper = max(benign_mean.max(), malignant_mean.max())
fig.update_layout(
    title="Average Feature Values: Benign vs Malignant",
    showlegend=True,
    polar=dict(radialaxis=dict(visible=True, range=[0, radial_upper])),
)
fig.show()

In [None]:
# Project Summary: dataset size (the target column is excluded from the feature
# count), the winning model with its accuracy, and the three top-ranked features.
summary_lines = [
    "\nProject Summary:",
    f"- Dataset contains {df.shape[0]} samples and {df.shape[1]-1} features",
    f"- Best performing model: {best_model_name} with accuracy: {results[best_model_name]:.4f}",
    f"- Most important features: {', '.join(importance_df.head(3)['feature'].tolist())}",
]
for summary_line in summary_lines:
    print(summary_line)