In [1]:
# ================================
# IMPORT LIBRARIES
# ================================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.decomposition import PCA

# ================================
# LOAD DATASET
# ================================
# Read the raw sales table from the CSV that sits next to the notebook.
df = pd.read_csv("ecommerce_sales_data.csv")

# Report the column index exactly as loaded (i.e. before any column is removed).
print("Columns:", df.columns)

# ================================
# DROP DATE COLUMN (NOT NUMERIC)
# ================================
# "Order Date" is removed rather than converted; nothing below derives
# features from it. `df` is rebound to the reduced frame.
df = df.drop(columns=["Order Date"])

# ================================
# ENCODE CATEGORICAL COLUMNS
# ================================
# Each object-dtype column of `df` is overwritten with the codes produced by a
# LabelEncoder fitted on that column.
#
# One encoder is fitted PER COLUMN and stored in `label_encoders` (keyed by
# column name). Re-fitting a single shared encoder would retain only the last
# column's classes, making it impossible to decode the other columns or to
# encode new rows consistently.
#
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# labels; OrdinalEncoder / OneHotEncoder are the documented tools for feature
# columns. The integer codes impose an arbitrary order on the categories --
# consider switching if that matters for the distance/linear models below.
label_encoders = {}
le = LabelEncoder()  # keeps `le` defined even if no column is object-dtype
for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()  # fresh encoder so earlier mappings are not overwritten
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# As before, `le` ends up being the encoder fitted on the last encoded column.

# ================================
# ================================
# REGRESSION TASK (Target = Sales)
# ================================
# ================================
# Predictors: every column currently in `df` except "Sales". Target: "Sales".
X = df.drop(columns="Sales")
y = df["Sales"]

# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# Standardise the features. The scaler learns its statistics from the training
# rows only and is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# ---- Multiple Linear Regression (scaled features) ----
lr = LinearRegression().fit(X_train_s, y_train)
y_pred_lr = lr.predict(X_test_s)
mse_lr = mean_squared_error(y_test, y_pred_lr)
print("\nLinear Regression MSE:", mse_lr)

# ---- KNN Regressor (scaled features, k = 5) ----
knn_r = KNeighborsRegressor(n_neighbors=5).fit(X_train_s, y_train)
y_pred_knn_r = knn_r.predict(X_test_s)
mse_knn_r = mean_squared_error(y_test, y_pred_knn_r)
print("KNN Regressor MSE:", mse_knn_r)

# ---- Decision Tree Regressor ----
# Trained and evaluated on the UNSCALED features, unlike the other models.
dt_r = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_dt_r = dt_r.predict(X_test)
mse_dt_r = mean_squared_error(y_test, y_pred_dt_r)
print("Decision Tree Regressor MSE:", mse_dt_r)

# ---- SVM Regressor (scaled features, default hyper-parameters) ----
svr = SVR().fit(X_train_s, y_train)
y_pred_svr = svr.predict(X_test_s)
mse_svr = mean_squared_error(y_test, y_pred_svr)
print("SVM Regressor MSE:", mse_svr)

# ================================
# ================================
# CLASSIFICATION TASK
# ================================
# ================================

# Create Binary Target
# A row is labelled 1 when its "Sales" is at or above the mean of "Sales",
# otherwise 0. The mean is taken over every row of `df`, before splitting.
sales_mean = df["Sales"].mean()
df["High_Sales"] = np.where(df["Sales"] >= sales_mean, 1, 0)

# Features exclude both the raw "Sales" value and the label derived from it.
Xc = df.drop(columns=["Sales", "High_Sales"])
yc = df["High_Sales"]

# Same 75/25 split settings and seed as the regression task.
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc, yc, random_state=42, test_size=0.25
)

# The existing `scaler` object is re-fitted here on the classification
# training rows (replacing whatever statistics it held before), then applied
# to both splits.
scaler.fit(Xc_train)
Xc_train_s = scaler.transform(Xc_train)
Xc_test_s = scaler.transform(Xc_test)

# ---- Logistic Regression (scaled features) ----
log = LogisticRegression().fit(Xc_train_s, yc_train)
y_pred_log = log.predict(Xc_test_s)
acc_log = accuracy_score(yc_test, y_pred_log)
print("\nLogistic Regression Accuracy:", acc_log)

# ---- KNN Classifier (scaled features, k = 5) ----
knn_c = KNeighborsClassifier(n_neighbors=5).fit(Xc_train_s, yc_train)
y_pred_knn_c = knn_c.predict(Xc_test_s)
acc_knn_c = accuracy_score(yc_test, y_pred_knn_c)
print("KNN Classifier Accuracy:", acc_knn_c)

# ---- Naïve Bayes (Gaussian, scaled features) ----
nb = GaussianNB().fit(Xc_train_s, yc_train)
y_pred_nb = nb.predict(Xc_test_s)
acc_nb = accuracy_score(yc_test, y_pred_nb)
print("Naïve Bayes Accuracy:", acc_nb)

# ---- Decision Tree Classifier ----
# Trained and evaluated on the UNSCALED features, unlike the other models.
dt_c = DecisionTreeClassifier(random_state=42).fit(Xc_train, yc_train)
y_pred_dt_c = dt_c.predict(Xc_test)
acc_dt_c = accuracy_score(yc_test, y_pred_dt_c)
print("Decision Tree Classifier Accuracy:", acc_dt_c)

# ---- SVM Classifier (scaled features, default hyper-parameters) ----
svm = SVC().fit(Xc_train_s, yc_train)
y_pred_svm = svm.predict(Xc_test_s)
acc_svm = accuracy_score(yc_test, y_pred_svm)
print("SVM Classifier Accuracy:", acc_svm)

# ================================
# PCA + LOGISTIC REGRESSION
# ================================
# Project the scaled classification features onto 2 principal components.
# The projection is learned from the training split and reused for the test
# split.
pca = PCA(n_components=2)
Xc_train_pca = pca.fit_transform(Xc_train_s)
Xc_test_pca = pca.transform(Xc_test_s)

# Logistic regression on the 2-component representation, scored on the
# held-out rows.
log_pca = LogisticRegression().fit(Xc_train_pca, yc_train)
y_pred_pca = log_pca.predict(Xc_test_pca)
acc_pca = accuracy_score(yc_test, y_pred_pca)
print("PCA + Logistic Accuracy:", acc_pca)

# Final marker: reached only if every model above ran without raising.
print("\n--- ALL ALGORITHMS COMPARED SUCCESSFULLY ---")

Columns: Index(['Order Date', 'Product Name', 'Category', 'Region', 'Quantity', 'Sales',
       'Profit'],
      dtype='object')

Linear Regression MSE: 1612995.9441534535
KNN Regressor MSE: 1731555.1462400001
Decision Tree Regressor MSE: 2855166.9142857143
SVM Regressor MSE: 6255598.941549931

Logistic Regression Accuracy: 0.8525714285714285
KNN Classifier Accuracy: 0.8582857142857143
Naïve Bayes Accuracy: 0.8525714285714285
Decision Tree Classifier Accuracy: 0.8377142857142857
SVM Classifier Accuracy: 0.8582857142857143
PCA + Logistic Accuracy: 0.8342857142857143

--- ALL ALGORITHMS COMPARED SUCCESSFULLY ---
