<a href="https://colab.research.google.com/github/aradeyal/machine_learning/blob/main/%D7%A8%D7%A9%D7%AA_%D7%A2%D7%9E%D7%95%D7%A7%D7%94_%D7%AA%D7%A8%D7%92%D7%99%D7%9C_%D7%9E%D7%A2%D7%A9%D7%99%E2%80%8E.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import os, sys, re
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, accuracy_score

# ------------------------------- 1) CSV loader -------------------------------
# Local locations probed for the dataset, in priority order. load_wine_df()
# walks this list and uses the first path that exists and parses.
CANDIDATE_PATHS = [
    r"C:\Users\arade\Downloads\winequality-red.csv",  # author's local Windows path
    "/content/winequality-red.csv",                   # Colab (manual upload)
    "winequality-red.csv",                            # current directory
]

def try_read_all(path: str):
    """Parse the CSV at ``path``, probing several delimiter settings in turn.

    The attempts are, in order: semicolon, comma, then pandas' own delimiter
    sniffing (``sep=None`` with the python engine). An attempt counts as a
    success only when the parsed table has more than one column; a failed or
    single-column parse moves on to the next attempt.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The first multi-column DataFrame obtained, or ``None`` when every
        attempt either raised or produced a single column.
    """
    reader_options = (
        {"sep": ";"},
        {"sep": ","},
        {"sep": None, "engine": "python"},
    )
    for options in reader_options:
        try:
            frame = pd.read_csv(path, **options)
        except Exception:
            # Deliberate best-effort: any read failure just means "try the next one".
            continue
        if frame.shape[1] > 1:
            return frame
    return None

def load_wine_df():
    """Load the red-wine quality table from the first source that parses.

    Sources are tried in this order:
      A) every path in ``CANDIDATE_PATHS`` that exists on disk;
      B) when ``google.colab`` is already imported, a manual file upload;
      C) a download of the UCI copy via ``tf.keras.utils.get_file``.

    Returns:
        The DataFrame produced by ``try_read_all`` for the first usable source.

    Raises:
        FileNotFoundError: if even the downloaded file cannot be parsed.
    """
    # A) Use an existing candidate path if found
    for p in CANDIDATE_PATHS:
        if os.path.exists(p):
            print(f"📄 Reading from: {p}")
            df = try_read_all(p)
            if df is not None:
                return df
    # B) In Colab: ask for manual upload (no Drive needed)
    if 'google.colab' in sys.modules:
        from google.colab import files
        print("⚠️ File not found. Please upload 'winequality-red.csv' (no Drive).")
        uploaded = files.upload()
        # A cancelled dialog yields an empty mapping; fall through to the
        # download instead of letting next() raise StopIteration.
        fname = next(iter(uploaded), None)
        if fname is not None:
            print(f"📄 Reading from uploaded file: {fname}")
            df = try_read_all(fname)
            if df is not None:
                return df
    # C) Last resort: download from UCI
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    print(f"⬇️ Downloading from UCI: {url}")
    local_path = tf.keras.utils.get_file("winequality-red.csv", origin=url)
    df = try_read_all(local_path)
    if df is None:
        raise FileNotFoundError("Could not load CSV with any method.")
    return df

df = load_wine_df()

# ------------------------------ 2) Preprocessing -----------------------------
# Clean column names: trim, collapse whitespace runs to "_", lowercase.
df.columns = (df.columns
                .str.strip()
                .str.replace(r"\s+", "_", regex=True)
                .str.lower())

# Convert every non-numeric column to numeric where possible (also handle comma
# decimals). Testing "is not numeric" rather than "is object" also covers text
# columns that pandas stores under a dedicated string dtype.
for c in df.columns:
    if not pd.api.types.is_numeric_dtype(df[c]):
        df[c] = pd.to_numeric(df[c].astype(str).str.replace(",", "."), errors="coerce")

# Drop rows with NaN (rare in this dataset); values that failed the numeric
# conversion above were coerced to NaN and are removed here as well.
if df.isna().any().any():
    df = df.dropna().reset_index(drop=True)

# Detect target column. Names were stripped and lowercased above, so an exact
# match on "quality" is the only lookup needed before falling back to the
# last column.
if "quality" in df.columns:
    target_col = "quality"
else:
    target_col = df.columns[-1]
    print(f"ℹ️ 'quality' not found, using last column as target: '{target_col}'")

# Build X/y with numeric target; the mask keeps features and labels aligned
# should any target value still fail to parse.
y_series = pd.to_numeric(df[target_col], errors="coerce")
mask = y_series.notna()
X_all = df.drop(columns=[target_col]).loc[mask].astype("float32").values
y_all = y_series.loc[mask].astype("int64").values

# ------------------------- 3) Split first, then scale ------------------------
# Hold out 20% of the rows, keeping the class proportions of y_all in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    random_state=42,
    stratify=y_all,
)

# Scaling statistics come from the training rows only and are then applied
# unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Mini-batch size and number of batches per epoch (never fewer than one).
batch = 32
steps = max(len(X_train) // batch, 1)

# ------------------------------ 4) Regression --------------------------------
# Treat the quality score as a continuous target.
y_train_reg = y_train.astype("float32")
y_test_reg  = y_test.astype("float32")

# The shuffle buffer spans the whole training set; a smaller buffer only mixes
# samples within a sliding window. repeat() makes the stream endless, so each
# epoch's length is set by steps_per_epoch in fit().
train_reg = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train_reg))
    .shuffle(len(X_train))
    .repeat()
    .batch(batch)
)
test_reg  = tf.data.Dataset.from_tensor_slices((X_test,  y_test_reg)).batch(batch)

# Two hidden ReLU layers followed by a single linear output unit.
reg_model = tf.keras.Sequential([
    tf.keras.Input((X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1, activation="linear"),
])
reg_model.compile(
    optimizer="adam", loss="mse",
    metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse"), "mae"]
)
reg_model.fit(train_reg, validation_data=test_reg, steps_per_epoch=steps, epochs=40, verbose=0)

# predict() returns one column per output unit; flatten to a 1-D vector.
y_pred_reg = reg_model.predict(X_test, verbose=0).ravel()

# Take the square root of the MSE explicitly: this does not depend on the
# `squared=` keyword of mean_squared_error, so it works on any sklearn version.
rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
mae  = mean_absolute_error(y_test_reg, y_pred_reg)

print("\n=== Regression ===")
print(f"RMSE: {rmse:.3f} | MAE: {mae:.3f}")

# --------------------------- 5) Classification --------------------------------
# Map actual quality labels to contiguous indices: e.g., {3,4,5,6,7,8} -> {0..5}
# np.unique already returns its values in ascending order.
classes_sorted = np.unique(y_all)
label_to_idx = {label: i for i, label in enumerate(classes_sorted)}
idx_to_label = np.array(classes_sorted)

y_train_idx = np.array([label_to_idx[v] for v in y_train], dtype="int64")
y_test_idx  = np.array([label_to_idx[v] for v in y_test],  dtype="int64")
num_classes = len(classes_sorted)

# The shuffle buffer spans the whole training set; a smaller buffer only mixes
# samples within a sliding window. repeat() makes the stream endless, so each
# epoch's length is set by steps_per_epoch in fit().
train_cls = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train_idx))
    .shuffle(len(X_train))
    .repeat()
    .batch(batch)
)
test_cls  = tf.data.Dataset.from_tensor_slices((X_test,  y_test_idx)).batch(batch)

# Same hidden layers as the regressor, with one softmax output per class index.
cls_model = tf.keras.Sequential([
    tf.keras.Input((X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(num_classes, activation="softmax"),
])
cls_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
cls_model.fit(train_cls, validation_data=test_cls, steps_per_epoch=steps, epochs=40, verbose=0)

# Most probable class index per test row, then back to the original labels.
proba = cls_model.predict(X_test, verbose=0)
y_pred_idx = proba.argmax(axis=1)
y_pred_labels = idx_to_label[y_pred_idx]  # back to original quality values

print("\n=== Classification ===")
print("Confusion matrix (rows=true, cols=pred):")
# Passing labels= fixes the row/column order and keeps classes with no
# test-set predictions in the matrix.
print(confusion_matrix(y_test, y_pred_labels, labels=classes_sorted))
print("Accuracy:", round(accuracy_score(y_test_idx, y_pred_idx), 3))


📄 Reading from: /content/winequality-red.csv

=== Regression ===
RMSE: 0.667 | MAE: 0.517

=== Classification ===
Confusion matrix (rows=true, cols=pred):
[[ 0  0  1  1  0  0]
 [ 0  0  9  2  0  0]
 [ 0  1 93 41  1  0]
 [ 0  2 22 94 10  0]
 [ 0  0  0 22 18  0]
 [ 0  0  0  2  1  0]]
Accuracy: 0.641
