In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import warnings
warnings.filterwarnings("ignore")
import xgboost as xgb

In [2]:
# Load the pre-split training and test tables; both file names carry no extension.
train = pd.read_csv(filepath_or_buffer='Train')
test = pd.read_csv(filepath_or_buffer='test')

In [3]:
# Display the (rows, columns) shape of the training table.
train.shape

(28052, 18)

In [4]:
# Display the (rows, columns) shape of the test table.
test.shape

(7013, 18)

In [5]:
# Name of the column the models are trained to predict.
target_col = 'consumption'

In [6]:
# Every column of the training table except the target is used as a model input.
feature_cols = [name for name in train.columns if name != target_col]

# Split each table into its feature matrix (X) and its target vector (y).
# The test table is indexed with the column list derived from the training
# table, so both feature matrices share the same columns in the same order.
X_train, y_train = train[feature_cols], train[target_col]
X_test, y_test = test[feature_cols], test[target_col]

In [7]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then standardise the
# training matrix with the statistics it just learned.
# NOTE: X_train is overwritten with the scaler's output, so re-running this
# cell without re-running the split cell would fit on already-scaled data.
sclar = StandardScaler()
sclar.fit(X_train)
X_train = sclar.transform(X_train)

In [8]:
# Scale the test features with the scaler already fitted on the training set
# (transform only, no re-fit). X_test is overwritten in place.
X_test=sclar.transform(X_test)

In [9]:
# Serialize the fitted scaler to 'Sclar.pkl' in the working directory.
with open('Sclar.pkl', mode='wb') as scaler_file:
    pickle.dump(sclar, scaler_file)

In [10]:
xgb=xgb.XGBRegressor(
    n_estimators=100,
    max_depth=7,
    learning_rate=0.001,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [11]:
xgb.fit(X_train,y_train)

In [12]:
xgb_pred=xgb.predict(X_test)

In [13]:
from sklearn.metrics import make_scorer, mean_absolute_error

# Mean absolute error of the baseline predictions against the test targets.
mae = mean_absolute_error(y_true=y_test, y_pred=xgb_pred)
print(mae)

3.841308359652105


In [14]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit

In [15]:
# Hyper-parameter search space for the grid search.
# Size: 3 * 3 * 6 * 2 * 2 * 3 = 648 combinations.
param_grid = dict(
    # Number of boosting rounds
    n_estimators=[50, 100, 200],
    # Maximum tree depth
    max_depth=[3, 5, 7],
    # Step size shrinkage
    learning_rate=[0.01, 0.1, 0.3, 0.001, 0.005, 0.00001],
    # Fraction of samples for each tree
    subsample=[0.8, 1.0],
    # Fraction of features for each tree
    colsample_bytree=[0.8, 1.0],
    # Minimum sum of weights in a child
    min_child_weight=[1, 3, 5],
)


In [16]:
# Wrap MAE as a scikit-learn scorer for the grid search.
# greater_is_better=False flags MAE as a loss, i.e. lower values are better.
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

In [21]:
import xgboost as xgb

In [27]:
# Exhaustive search over `param_grid`, scored with the MAE-based `scorer`
# using 7 cross-validation folds. A fresh XGBRegressor with a fixed seed is
# the estimator that gets cloned for every candidate.
grid_search = GridSearchCV(
    xgb.XGBRegressor(random_state=42),
    param_grid,
    scoring=scorer,
    cv=7,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)

In [28]:
# Run the grid search on the scaled training data. This is the expensive
# step: 648 candidates x 7 folds = 4536 fits, as reported in the log output.
grid_search.fit(X_train,y_train)

Fitting 7 folds for each of 648 candidates, totalling 4536 fits
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time

In [29]:
# Estimator that the search exposes as its best candidate.
best_model_grid = grid_search.best_estimator_

# Predict the test-set targets with the tuned model (X_test is already scaled).
grid_pred = best_model_grid.predict(X_test)

In [30]:
from sklearn.metrics import make_scorer, mean_absolute_error

# Mean absolute error of the tuned model's predictions against the test targets.
grid_mae = mean_absolute_error(y_true=y_test, y_pred=grid_pred)
print(grid_mae)

1.0359657896275154


In [31]:
# Display the hyper-parameter combination selected by the grid search.
grid_search.best_params_

{'colsample_bytree': 0.8,
 'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_weight': 3,
 'n_estimators': 100,
 'subsample': 0.8}

In [32]:
# Serialize the entire fitted GridSearchCV object (not only its best
# estimator) to 'XGB_mode.pkl' in the working directory.
with open('XGB_mode.pkl', mode='wb') as model_file:
    pickle.dump(grid_search, model_file)