# Dataset assumptions and instructions

- Place your dataset as `data.csv` in the project root (or change the `DATA_PATH` variable below).
- The notebook assumes a regression task. If your target column is named `target`, it will be used.
  Otherwise the last column will be treated as the target.
- The notebook performs simple cleaning: drops duplicate rows, drops rows with a missing target, fills numeric NAs with the column median and categorical NAs with the column mode, and one-hot-encodes categoricals.
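- It holds out 20% of the rows as a test set, standardizes the features with `StandardScaler`, trains a scikit-learn `LinearRegression` model, and prints MSE and R^2 on the test set. It then saves the model and the scaler together (as a dict with keys `model` and `scaler`) to `linear_model.joblib`.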


In [9]:
# Imports (install scikit-learn, pandas, joblib beforehand if needed)
import os
import pandas as pand
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Report the versions of the core data libraries; useful when debugging
# import or compatibility problems.
for _lib in (pand, np):
    print(_lib.__name__, _lib.__version__)


pandas 2.3.3
numpy 2.3.5


In [None]:
# Read the dataset from disk into a DataFrame
DATA_PATH = 'data.csv'  # point this at your CSV if it lives somewhere else

# Fail early with an actionable message instead of a lower-level read error.
if os.path.exists(DATA_PATH) is False:
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}. Place your CSV there or update DATA_PATH.")

df = pand.read_csv(DATA_PATH)

# Summarize what was loaded and preview the first rows.
print('Loaded', df.shape[0], 'rows and', df.shape[1], 'columns')
display(df.head())

In [None]:
# Identify target column: prefer an explicit `target` column, otherwise
# fall back to the last column of the file.
if 'target' in df.columns:
    target_col = 'target'
else:
    target_col = df.columns[-1]
print('Using target column:', target_col)

# Basic cleaning: drop exact duplicate rows, and drop rows without a target
# value because they cannot be used for supervised training.
df = df.drop_duplicates()
df = df.dropna(subset=[target_col])

# Separate features and target (the regression target must be numeric).
X = df.drop(columns=[target_col])
y = df[target_col].astype(float)

# Handle missing values: numeric -> median, categorical -> mode.
# NOTE: these statistics are computed on all remaining rows, i.e. before the
# train/test split that happens later in the notebook.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

for c in num_cols:
    med = X[c].median()
    if pand.isna(med):
        # The column has no observed values, so its median is undefined.
        # Filling with NaN would be a no-op and leave NaNs in X, which
        # LinearRegression rejects; use a constant 0.0 instead.
        med = 0.0
    X[c] = X[c].fillna(med)
for c in cat_cols:
    mode = X[c].mode(dropna=True)
    # A column with no observed values has no mode; use an empty-string
    # placeholder category in that case.
    X[c] = X[c].fillna(mode.iloc[0] if not mode.empty else '')

# One-hot encode categoricals; drop_first avoids a redundant (perfectly
# collinear) indicator column per categorical feature.
if cat_cols:
    X = pand.get_dummies(X, columns=cat_cols, drop_first=True)

print('Feature matrix shape after encoding:', X.shape)

In [None]:
# Hold out 20% of the rows for evaluation, then standardize the features
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

In [None]:
# Fit a linear regression on the standardized training features
model = LinearRegression().fit(X_train_scaled, y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test_scaled)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'MSE: {mse:.4f}')
print(f'R^2: {r2:.4f}')

# Persist the estimator together with its scaler in a single file, so new
# data can be transformed the same way before predicting.
joblib.dump(
    {'model': model, 'scaler': scaler},
    'linear_model.joblib',
)
print('Saved model and scaler to linear_model.joblib')