In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os

# --- 1. Load the dataset ---
# End-to-end training cell: load the CSV, print a few basic checks, split the
# data, fit a RandomForestClassifier, report test metrics, and persist the
# fitted model. Everything after the existence check runs only when the CSV is
# present; otherwise a hint is printed and the cell does nothing else.
# The path is relative, so it is resolved against the current working directory.
data_path = '../data/winequality-red.csv'
if not os.path.exists(data_path):
    print(f"Error: Dataset not found at {data_path}. Please ensure 'winequality-red.csv' is in the 'data/' directory.")
else:
    # The file is parsed with ';' as the field separator.
    df = pd.read_csv(data_path, sep=';')
    print("Dataset loaded successfully.")
    print(df.head())
    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so it is called directly; wrapping it in print() emits a stray "None".
    df.info()
    print(df.describe())

    # --- 2. Exploratory Data Analysis (EDA) - Basic checks ---
    # Count missing values in every column.
    print("\nMissing values per column:")
    print(df.isnull().sum())

    # Show how many rows fall into each 'quality' score, ordered by score.
    print("\nDistribution of 'quality' column:")
    print(df['quality'].value_counts().sort_index())

    # Further EDA (visualizations, correlation matrix, ...) could be added
    # here; this cell goes straight to modeling.

    # --- 3. Data Preprocessing ---
    # Features (X) are every column except 'quality'; 'quality' is the target (y).
    X = df.drop('quality', axis=1)
    y = df['quality']

    # Each distinct quality score is treated as its own class. A coarser
    # alternative is to bin the scores before splitting, for example
    # 'bad' for scores <= 5, 'good' for scores >= 7 and 'medium' otherwise.

    # Hold out 20% of the rows for testing. stratify=y keeps the proportion of
    # each quality score the same in the training and testing sets, and the
    # fixed random_state makes the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"\nTraining set shape: {X_train.shape}")
    print(f"Testing set shape: {X_test.shape}")

    # --- 4. Model Training ---
    # RandomForest is a good choice for tabular data and handles non-linear
    # relationships. The fixed random_state makes training reproducible.
    model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Fit on the training split only; the test split stays unseen until evaluation.
    print("\nTraining the RandomForestClassifier model...")
    model.fit(X_train, y_train)
    print("Model training complete.")

    # --- 5. Model Evaluation ---
    # Predict the held-out rows and compare against their true labels.
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nModel Accuracy: {accuracy:.4f}")

    # zero_division=0 reports 0 (instead of warning) for classes that never
    # appear among the predictions.
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    # --- 6. Save the Trained Model ---
    # Create the 'model' directory if it doesn't exist.
    model_dir = '../model'
    os.makedirs(model_dir, exist_ok=True)

    # Full path of the serialized model file.
    model_path = os.path.join(model_dir, 'wine_quality_model.pkl')

    # Save the model using joblib (efficient for scikit-learn models).
    joblib.dump(model, model_path)
    print(f"\nModel saved to {model_path}")

    # Reload the file that was just written so that a serialization problem
    # surfaces here rather than when the model is used later.
    # NOTE: joblib files are pickle-based; only load files from trusted sources.
    loaded_model = joblib.load(model_path)
    print(f"Model successfully loaded back from {model_path}")
    # For an additional sanity check, call loaded_model.predict on a single
    # test row (e.g. X_test.iloc[[0]]) and inspect the result.


Dataset loaded successfully.
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8   