In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, make_scorer, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np


In [2]:
# Load the training data from the project-relative data/ folder.
df = pd.read_csv('data/Train.csv')

In [3]:
# Start with every column whose name mentions "angle" or "sensor" (case-insensitive) ...
removed_features = [
    name
    for name in df.columns
    if any(keyword in name.lower() for keyword in ('angle', 'sensor'))
]
# ... then add the row identifier, the date and the remaining target_* columns
# (presumably excluded so they cannot leak target information; confirm with the data description).
removed_features.extend(
    ['Place_ID X Date', 'Date', 'target_min', 'target_max', 'target_variance', 'target_count']
)


In [4]:
# Drop the excluded columns. errors='ignore' keeps this cell idempotent:
# re-running it after the columns are already gone no longer raises KeyError.
df = df.drop(columns=removed_features, errors='ignore')

In [5]:
# Number of columns left after the drop (features + target).
df.shape[1]

41

In [6]:
# Separate the regression target from the predictors.
y = df['target']
X = df.drop(columns=['target'])

In [7]:
# Every predictor except 'Place_ID' is treated as a numeric feature.
num_features = list(X.columns)
num_features.remove('Place_ID')

In [8]:
# Categorical features: only the 'Place_ID' identifier column, which is routed
# to the one-hot branch of the preprocessor.
cat_features = ['Place_ID']

In [10]:
# Hold out 20% of the rows for the final evaluation. The fixed random_state makes
# the split (and therefore every score reported below) reproducible across
# kernel restarts; it matches the seed used for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Sanity check: (rows, columns) of the train and test feature frames.
X_train.shape, X_test.shape

((24445, 40), (6112, 40))

In [28]:
class GroupByMedianImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with the median of the row's group.

    For every imputed column, a missing entry is replaced by the median of
    that column within the row's ``group_col`` group, as learned in ``fit``.
    If the group was not seen during ``fit`` (or its median is itself NaN),
    the column's overall median from the fitted data is used instead.

    Parameters
    ----------
    group_col : label
        Column whose values define the groups. Must be present in the frame
        passed to ``fit`` and in any frame passed to ``transform`` that
        contains missing values in the imputed columns.
    columns : list of labels, optional
        Columns to impute. When ``None``, every numeric column of the frame
        passed to ``fit`` is used, except ``group_col`` itself.

    Attributes
    ----------
    columns_ : list
        Columns actually imputed (resolved during ``fit``).
    group_medians_ : dict
        ``{group_value: {column: median}}`` learned during ``fit``.
    global_medians_ : dict
        ``{column: median}`` over all fitted rows, used as fallback.
    """

    def __init__(self, group_col, columns=None):
        # Only store the hyper-parameters here. Fitted state (trailing
        # underscore) is created in fit(), so an unfitted instance cannot be
        # mistaken for a fitted one and cannot silently transform data with
        # empty statistics.
        self.group_col = group_col
        self.columns = columns

    def fit(self, X, y=None):
        """Learn per-group and global medians from the DataFrame ``X``.

        ``y`` is ignored; it is accepted for pipeline compatibility.

        Raises
        ------
        ValueError
            If ``group_col`` is not a column of ``X``.
        """
        if self.group_col not in X.columns:
            raise ValueError(f"Grouping column '{self.group_col}' not found in input data.")

        # Resolve the column list into a fitted attribute instead of
        # overwriting the `columns` hyper-parameter.
        if self.columns is None:
            columns = [
                col
                for col in X.select_dtypes(include=np.number).columns
                if col != self.group_col  # never impute the grouping key itself
            ]
        else:
            columns = list(self.columns)
        self.columns_ = columns

        # Per-group medians, stored as {group: {column: median}}.
        self.group_medians_ = (
            X.groupby(self.group_col)[columns].median().to_dict(orient='index')
        )
        # Global medians used when a group is unknown or has no usable median.
        self.global_medians_ = X[columns].median().to_dict()

        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values in the fitted columns filled.

        Values that are already present are never modified. A
        ``RuntimeWarning`` is emitted (once per column) if some values cannot
        be filled because neither a group median nor a global median exists.
        """
        import warnings

        X_ = X.copy()

        # Rebuild a (group x column) lookup table from the fitted dict.
        # reindex() guarantees every imputed column exists in the table even
        # if no group was seen during fit.
        group_table = pd.DataFrame.from_dict(
            self.group_medians_, orient='index'
        ).reindex(columns=self.columns_)

        for col in self.columns_:
            missing = X_[col].isna()
            if not missing.any():
                continue

            # Vectorised lookup: group value -> group median (NaN if unknown).
            fill = X_.loc[missing, self.group_col].map(group_table[col])

            fallback = self.global_medians_.get(col, np.nan)
            if pd.isna(fallback):
                unresolved = int(fill.isna().sum())
                if unresolved:
                    warnings.warn(
                        f"No group or global median available for column '{col}'; "
                        f"{unresolved} value(s) left missing.",
                        RuntimeWarning,
                    )
            else:
                fill = fill.fillna(fallback)

            # Positional assignment avoids index alignment, so frames with
            # duplicate index labels are handled correctly.
            X_.loc[missing, col] = fill.to_numpy()

        return X_



In [30]:
# Numerical branch: standardise every numeric column.
# Missing values are not imputed here. The group-median imputer needs the
# 'Place_ID' column, which is not among the numeric columns routed to this
# branch, so it runs as the first step of each model pipeline instead.
num_pipeline = Pipeline(steps=[
    ('std_scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; categories unseen during fit are
# encoded as all zeros instead of raising.
cat_pipeline = Pipeline(steps=[
    ('1hot', OneHotEncoder(handle_unknown='ignore')),
])

In [31]:
# Route each feature group through its own branch. Columns listed in neither
# group are dropped (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])

## Train different models

### LinearRegression

In [32]:
# Baseline model: group-median imputation -> preprocessing -> ordinary least squares.
# Note: the 'logreg' names are kept as-is, although the estimator is
# LinearRegression, not logistic regression.
pipe_logreg = Pipeline(steps=[
    ('group_median_imputer', GroupByMedianImputer(columns=num_features, group_col='Place_ID')),
    ('preprocessor', preprocessor),
    ('logreg', LinearRegression()),
])

In [33]:
# Metrics collected on every CV fold.
# RMSE uses scikit-learn's built-in scorer rather than
# mean_squared_error(..., squared=False): the `squared` argument is deprecated
# and removed in recent scikit-learn releases. Like every scikit-learn scorer
# it follows "greater is better", so it returns the *negated* RMSE.
scoring = {
    'r2': 'r2',                             # Built-in R-squared scorer
    'rmse': 'neg_root_mean_squared_error',  # Built-in (negated) RMSE scorer
}

cv_results = cross_validate(pipe_logreg, X_train, y_train, cv=5, scoring=scoring, n_jobs=-1)

print("R-squared (test):", cv_results['test_r2'])
# Flip the sign back so the printed values are plain RMSE.
print("RMSE (test):", -cv_results['test_rmse'])

R-squared (test): [0.66126379 0.69868868 0.68594742 0.68876803 0.69975298]
RMSE (test): [28.16987558 25.09380656 25.48171024 26.08121004 25.51470969]


In [41]:
# 1. Train the final model on all training data
pipe_logreg.fit(X_train, y_train)

# 2. Predict on test set
y_test_pred = pipe_logreg.predict(X_test)

# 3. Evaluate
r2 = r2_score(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred, squared=False)

print("R² on test set:", r2)
print("RMSE on test set:", rmse)

R² on test set: 0.6548347059919979
RMSE on test set: 28.147975842506774


### XGBRegressor

In [36]:
# Gradient-boosted trees behind the same imputation + preprocessing front end.
# The inline notes record which CatBoost setting each argument is meant to mirror.
pipe_XGB = Pipeline(steps=[
    ('group_median_imputer', GroupByMedianImputer(columns=num_features, group_col='Place_ID')),
    ('preprocessor', preprocessor),
    ('XGB', XGBRegressor(
        objective='reg:squarederror',  # default regression objective
        eval_metric='rmse',            # same metric as the CatBoost run
        n_estimators=10000,            # CatBoost: 'iterations'
        learning_rate=0.045,           # same value as the CatBoost run
        max_depth=8,                   # CatBoost: 'depth'
        subsample=0.8,                 # row subsampling, similar to bagging (adjust as needed)
        colsample_bytree=0.8,          # feature subsampling per tree
        random_state=42,               # CatBoost: 'random_seed'
        n_jobs=-1,                     # use all CPU cores
    )),
])

In [37]:
# 5-fold cross-validation of the XGBoost pipeline with the metrics defined in `scoring`.
cv_results = cross_validate(
    estimator=pipe_XGB,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
    verbose=5,
)

print("R-squared (test):", cv_results['test_r2'])
# The RMSE scorer is stored negated; negate it again for display.
print("RMSE (test):", -cv_results['test_rmse'])

R-squared (test): [0.71754248 0.76290832 0.77445861 0.74511281 0.77860593]
RMSE (test): [25.72353722 22.25958042 21.59436898 23.60258877 21.90959131]


### Hyperparameter tuning with Grid Search

In [None]:
# Tune the XGBoost pipeline defined above. The previous version of this cell
# was a leftover KNN/classification template: it referenced an undefined
# `knn_pipline`, `knn__*` parameters and the classification-only 'roc_auc'
# metric, so it could not run on this regression task.
# Grid keys are "<pipeline step name>__<parameter>"; the step is called 'XGB'.
param_grid = {"XGB__n_estimators": [500, 1000],   # far fewer trees than the 10000 above, to keep the search affordable
              "XGB__max_depth": [4, 6, 8],
              "XGB__learning_rate": [0.03, 0.045, 0.1],
             }

# Instantiate gridsearch and define the metric to optimize.
# The scorer is the negated RMSE, so gs.best_score_ is negative and values
# closer to zero are better.
gs = GridSearchCV(pipe_XGB, param_grid, scoring='neg_root_mean_squared_error', cv=5, verbose=1, n_jobs=-1)
gs.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best parameter combination, rounded to 3 decimals.
print('Best score:', round(gs.best_score_, 3))

# Best parameters
# (the parameter setting that achieved the score above)
print('Best parameters:', gs.best_params_)