# Wine Classification Challenge

## Challenge Overview
Your challenge is to train a classification model to analyze wine samples and classify them by cultivar.

The dataset contains 13 numeric features and 3 wine varieties (0, 1, 2).
Your goal is to achieve a recall above 95% (macro-averaged over the three classes) on the held-out test set.

## Dataset Citation
Originally collected by Forina, M. et al. and made available through the UCI Machine Learning Repository.

## Step 1: Load Required Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_wine

# Reaching this line means none of the imports in this cell raised an error
print("Libraries imported successfully!")

## Step 2: Load and Explore the Dataset

In [None]:
# Fetch the wine dataset bundled with scikit-learn and combine the feature
# matrix and the class labels into a single DataFrame ('target' holds the labels)
wine = load_wine()
data = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)

# Quick overview: dimensions, class names, and number of samples per class
print(f"Dataset shape: {data.shape}")
print(f"\nTarget classes: {wine.target_names}")
print("\nClass distribution:")
print(data['target'].value_counts())

In [None]:
# Display 10 randomly sampled rows (not the first 10); no random_state is passed,
# so a different set of rows is shown each time the cell runs
data.sample(10)

In [None]:
# Basic summary statistics for the DataFrame's numeric columns
# (the 'target' label column is part of `data`, so it is summarized as well)
data.describe()

## Step 3: Data Visualization

In [None]:
# Bar chart: number of samples in each wine class
plt.figure(figsize=(8, 6))
data['target'].value_counts().plot.bar()
plt.xticks(rotation=0)
plt.xlabel('Wine Class')
plt.ylabel('Count')
plt.title('Wine Class Distribution')
plt.show()

# Heatmap: pairwise correlations between all numeric columns
# (the 'target' column is numeric too, so it appears in the matrix)
correlation_matrix = data.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## Step 4: Data Preparation

In [None]:
# Features are every column except the label; the label is the 'target' column
X = data.drop(columns='target')
y = data['target']

# Hold out 20% of the rows for testing, stratifying on the class label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train_scaled.shape}")
print(f"Test set size: {X_test_scaled.shape}")

## Step 5: Model Training and Evaluation

In [None]:
# Candidate classifiers, keyed by the name used in printed output and plots
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42),
}

# Per-model record: fitted estimator, test-set predictions, macro-averaged recall
results = {}

for name, model in models.items():
    print(f"\n=== Training {name} ===")

    # fit() returns the estimator itself, so training and prediction are chained
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Macro averaging: unweighted mean of the per-class recalls
    recall = recall_score(y_test, y_pred, average='macro')

    results[name] = dict(model=model, predictions=y_pred, recall=recall)

    print(f"Recall Score: {recall:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

## Step 6: Model Comparison

In [None]:
# Compare model performance: macro recall on the test set, per model
recall_scores = {name: entry['recall'] for name, entry in results.items()}

plt.figure(figsize=(10, 6))
# Pass plain lists (model names, recall values) rather than dict views
plt.bar(list(recall_scores), list(recall_scores.values()))
plt.axhline(y=0.95, color='r', linestyle='--', label='Target Recall (95%)')
plt.title('Model Recall Comparison')
plt.xlabel('Model')
plt.ylabel('Recall Score')
# Leave headroom above 1.0: each value label is placed just above its bar, so with
# an upper limit of exactly 1 the label of a (near-)perfect score would be drawn
# above the axes and run into the title
plt.ylim(0, 1.1)
plt.legend()
for i, v in enumerate(recall_scores.values()):
    plt.text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')
plt.show()

# Find best model: highest test recall; on a tie, max() keeps the first model
# in insertion order
best_model_name = max(recall_scores, key=recall_scores.get)
best_model = results[best_model_name]['model']
best_recall = recall_scores[best_model_name]

print(f"\nBest Model: {best_model_name} with Recall: {best_recall:.4f}")
print(f"Target achieved: {'Yes' if best_recall > 0.95 else 'No'}")

## Step 7: Detailed Analysis of Best Model

In [None]:
# Needed only for the cross-validation step at the end of this cell
from sklearn.pipeline import make_pipeline

# Confusion matrix for best model (rows: actual class, columns: predicted class)
best_predictions = results[best_model_name]['predictions']

plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, best_predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=wine.target_names, yticklabels=wine.target_names)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Cross-validation scores
# The scaler and the model are wrapped in one pipeline and given the UNSCALED
# training data, so every fold fits its own scaler on that fold's training part.
# Cross-validating on X_train_scaled instead would reuse scaling statistics that
# were computed from rows later used as validation data (preprocessing leakage).
# cross_val_score fits clones of the estimator, so best_model itself is not refit.
cv_pipeline = make_pipeline(StandardScaler(), best_model)
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='recall_macro')
print(f"\nCross-Validation Recall Scores: {cv_scores}")
print(f"Mean CV Recall: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

## Step 8: Final Challenge - Predict New Wine Samples

In [None]:
# Two unlabeled wine samples; their values are mapped onto the training
# feature columns in order
sample_1 = [13.72, 1.43, 2.5, 16.7, 108, 3.4, 3.67, 0.19, 2.04, 6.8, 0.89, 2.87, 1285]
sample_2 = [12.37, 0.94, 1.36, 10.6, 88, 1.98, 0.57, 0.28, 0.42, 1.95, 1.05, 1.82, 520]

# Build a DataFrame that reuses the training column names
new_samples = pd.DataFrame([sample_1, sample_2], columns=X.columns)

print("New wine samples to classify:")
print(new_samples)

# Apply the already-fitted scaler (transform only, no refit)
new_samples_scaled = scaler.transform(new_samples)

# Predict the class; fetch class probabilities only when the model offers them
predictions = best_model.predict(new_samples_scaled)
if hasattr(best_model, 'predict_proba'):
    probabilities = best_model.predict_proba(new_samples_scaled)
else:
    probabilities = None

print(f"\n=== Predictions using {best_model_name} ===")
for i, pred in enumerate(predictions):
    print(f"\nSample {i+1}: {wine.target_names[pred]}")
    if probabilities is None:
        continue
    sample_probs = probabilities[i]
    print(f"Confidence: {sample_probs[pred]:.3f}")
    print(f"All probabilities: {dict(zip(wine.target_names, sample_probs))}")

print(f"\n✅ Challenge Complete! Best model recall: {best_recall:.4f}")

## Summary

This notebook successfully implements a wine classification solution that:
1. Loads and explores the wine dataset
2. Preprocesses the data with proper scaling
3. Tests multiple classification models
4. Achieves the target >95% recall performance
5. Makes predictions on new wine samples

The solution demonstrates proper machine learning practices including data exploration, model comparison, and performance evaluation.