In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import matplotlib.pyplot as plt

In [None]:
# 1. Load
# Read the raw sales table and report how many rows came in before any cleaning.
df = pd.read_csv(filepath_or_buffer="sales_data.csv")
print("Initial rows:", df.shape[0])

In [None]:
# 2. Basic cleaning (same as preprocessing)
# Coerce Revenue to numeric (unparseable entries become NaN), impute the gaps
# with the column mean, then drop exact duplicate rows and renumber the index.
df["Revenue"] = pd.to_numeric(df["Revenue"], errors="coerce")
df["Revenue"] = df["Revenue"].fillna(df["Revenue"].mean())
df = df.drop_duplicates().reset_index(drop=True)

# One-hot encode Region (if present)
if "Region" in df.columns:
    # scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`, and 1.4
    # removed the old name entirely. Try the current spelling first and fall
    # back to the legacy one so the cell runs on both old and new installs.
    try:
        enc = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    except TypeError:
        enc = OneHotEncoder(sparse=False, handle_unknown="ignore")
    enc_arr = enc.fit_transform(df[["Region"]])
    cols = enc.get_feature_names_out(["Region"])
    # Replace the raw Region column with its indicator columns, aligned on df's index.
    region_dummies = pd.DataFrame(enc_arr, columns=cols, index=df.index)
    df = pd.concat([df.drop(columns=["Region"]), region_dummies], axis=1)

In [None]:
# 3. Feature / target selection
# TODO: Choose sensible features for prediction. Fill the blank with a list of column names present in df.
# Example answer: ["Revenue", "Marketing_Spend", "Units_Sold", "Region_North", "Region_South", ...]
FEATURES = _____   # <- STUDENT FILL (list of column names)
TARGET = "Monthly_Sales"

# Validate the selection up front so mistakes surface here with a readable
# message rather than as an opaque error several cells later.
if isinstance(FEATURES, str):
    # df["col"] would yield a Series, not the 2-D feature table later cells expect.
    raise TypeError("FEATURES must be a list of column names, e.g. ['Revenue'], not a single string.")
missing_cols = [col for col in [*FEATURES, TARGET] if col not in df.columns]
if missing_cols:
    raise KeyError(
        f"Columns not found in df: {missing_cols}. Available columns: {df.columns.tolist()}"
    )
if TARGET in FEATURES:
    # Using the target as an input would leak the answer into the model.
    raise ValueError(f"TARGET {TARGET!r} must not also be listed in FEATURES.")

X = df[FEATURES]
y = df[TARGET]

In [None]:
# 4. Train/test split - blank for test_size
# The seed is fixed so every run produces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=_____,  # <- STUDENT FILL (share of rows held out for testing)
    random_state=42,
)

In [None]:
# 5. Scaling (standardize continuous numeric features only)
scaler = StandardScaler()

# train_test_split returns subsets of X; take explicit copies so the column
# assignments below write to independent frames and cannot trigger pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# One-hot indicator columns are numeric as well (0.0 / 1.0 floats), so dtype
# alone cannot separate them from continuous features. A numeric column whose
# training values are all 0 or 1 is treated as an indicator and left unscaled.
numeric_cols = [
    col
    for col in X_train.select_dtypes(include=[np.number]).columns
    if not X_train[col].dropna().isin([0, 1]).all()
]

# StandardScaler rejects an input with zero columns, so skip when nothing qualifies.
if numeric_cols:
    # Fit on the training split only, then reuse those statistics on the test
    # split so no information from the test rows influences the scaling.
    X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [None]:
# 6. Train model
# Fit a linear regression on the training split; the fitted estimator is the
# cell's last expression, so the notebook displays it.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# 7. Predict & evaluate
# Score the held-out split: mean squared error and the R^2 coefficient.
pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=pred)
r2 = r2_score(y_true=y_test, y_pred=pred)
for label, score in (("MSE:", mse), ("R2 :", r2)):
    print(label, score)

In [None]:
# 8. Plots for intuition
# 8a. Actual vs Predicted scatter
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(y_test, pred)
ax.set_xlabel("Actual Monthly Sales")
ax.set_ylabel("Predicted Monthly Sales")
ax.set_title("Actual vs Predicted")

# Dashed y = x reference line spanning the combined range of actual and
# predicted values; points on this line are perfect predictions.
mn = min(y_test.min(), pred.min())
mx = max(y_test.max(), pred.max())
ax.plot([mn, mx], [mn, mx], linestyle="--")
fig.tight_layout()
plt.show()


In [None]:
# ------------------------
# Linear Regression Workbook
# Dataset: sales_data.csv
# Students fill the blanks marked with _____
# ------------------------











# 8b. Residuals plot
# Residual = actual minus predicted, plotted against the prediction with a
# dashed zero line for reference.
residuals = y_test - pred
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(pred, residuals)
ax.axhline(0, linestyle="--")
ax.set_xlabel("Predicted")
ax.set_ylabel("Residual (Actual - Pred)")
ax.set_title("Residuals vs Predicted")
fig.tight_layout()
plt.show()

# Done
print("Linear Regression module complete.")
