**IMPORTED LIBRARIES**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.impute import SimpleImputer


**LOADING DATA FOR USE**

In [None]:
# Locations of the competition CSV files on the mounted Google Drive
train_file = '/content/drive/MyDrive/YTSP/Train.csv'
test_file = '/content/drive/MyDrive/YTSP/Test.csv'
sample_submission_file = '/content/drive/MyDrive/YTSP/SampleSubmission.csv'

# Read the three files, in the same order as the paths above, into DataFrames
train_data, test_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_file, test_file, sample_submission_file)
)


**EXAMINING AND CLEANING DATA**

In [None]:
# Drop rows where the target variable is missing (they cannot be used for training)
train_data = train_data.dropna(subset=['target'])

# Remove columns that are entirely missing in the training data.
# The test frame is deliberately NOT pruned on its own: a column that is
# all-NaN only in the test data would otherwise disappear from test_data and
# make the column selections below raise a KeyError.
train_data = train_data.dropna(axis=1, how='all')

# Identify numerical and categorical feature columns once, after the drop, so
# neither list can name a column that was just removed
numerical_columns = train_data.select_dtypes(include=['float64', 'int64']).columns.difference(['target'])
categorical_columns = train_data.select_dtypes(include=['object']).columns.difference(['ID'])

# Make sure the test frame has every column the imputers are fitted on;
# a column absent from the test data starts out all-missing and is then
# filled by the imputers below with the statistic learned from training data
for col in numerical_columns.union(categorical_columns):
    if col not in test_data.columns:
        test_data[col] = np.nan

# Handle missing values with SimpleImputer
num_imputer = SimpleImputer(strategy='median')  # Median for numerical columns
cat_imputer = SimpleImputer(strategy='most_frequent')  # Mode for categorical columns

# Numerical columns: fit on train, reuse the training medians for test
train_data[numerical_columns] = pd.DataFrame(
    num_imputer.fit_transform(train_data[numerical_columns]),
    columns=numerical_columns,
    index=train_data.index
)
test_data[numerical_columns] = pd.DataFrame(
    num_imputer.transform(test_data[numerical_columns]),
    columns=numerical_columns,
    index=test_data.index
)

# Categorical columns: the imputer is refitted per column on train and the
# training mode is reused for test
for col in categorical_columns:
    train_data[col] = cat_imputer.fit_transform(train_data[[col]].values).ravel()
    # Cast to object so a test column without any non-missing value (which
    # has float dtype) can still receive the string mode from training
    test_data[col] = cat_imputer.transform(test_data[[col]].astype(object).values).ravel()


# Convert 'day' column to day of the week (pandas numbers Monday as 0);
# values that cannot be parsed as dates become 0
if 'day' in train_data.columns:
    train_data['day'] = pd.to_datetime(train_data['day'], errors='coerce').dt.dayofweek.fillna(0).astype(int)
if 'day' in test_data.columns:
    test_data['day'] = pd.to_datetime(test_data['day'], errors='coerce').dt.dayofweek.fillna(0).astype(int)

# Encode categorical variables as integer codes.
# The label -> code mapping is learned from the training data only and then
# applied to the test data. Encoding each frame on its own would give the
# same label different codes whenever the two frames do not contain exactly
# the same set of labels.
for col in categorical_columns:
    # 'day' is already numeric after the conversion above; re-encoding it
    # would shift the values in any frame that lacks some weekdays
    if col == 'day' or col not in train_data.columns:
        continue
    train_categorical = train_data[col].astype('category')
    train_data[col] = train_categorical.cat.codes
    if col in test_data.columns:
        # Labels never seen in training (and missing values) are coded as -1
        test_data[col] = pd.Categorical(
            test_data[col], categories=train_categorical.cat.categories
        ).codes

# Columns excluded from the feature set
columns_to_drop = ['ID', 'timestamp', '15_min_interval']

# Separate the target from the training features
y_train = train_data['target']
X_train = train_data.drop(columns=[*columns_to_drop, 'target'], errors='ignore')

# Build the test features with exactly X_train's columns, in the same order:
# columns missing from the test data are created and filled with 0, and
# columns not present in X_train are discarded
X_test = (
    test_data
    .drop(columns=columns_to_drop, errors='ignore')
    .reindex(columns=X_train.columns, fill_value=0)
)

**TRAINING AND EVALUATING MODEL**

In [None]:
# Hold out 20% of the training rows for validation (fixed seed for a repeatable split)
(X_train_split, X_val_split,
 y_train_split, y_val_split) = train_test_split(X_train, y_train, random_state=42, test_size=0.2)

# Candidate regressors, registered in the order they will be trained
models = {}
models["CatBoost"] = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    verbose=100,
)
models["RandomForest"] = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
)
models["XGBoost"] = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    max_depth=6,
    learning_rate=0.1,
)
models["LightGBM"] = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=6,
)
models["Ridge"] = Ridge(alpha=1.0)

# Train and evaluate models
results = {}  # model name -> RMSE on the validation split
for model_name, model in models.items():
    print(f"Training {model_name}...")
    if model_name == "CatBoost":
        # NOTE(review): CatBoost receives the validation pool as eval_set while
        # the other models never see it during fitting - confirm the RMSE
        # comparison below is still considered fair.
        train_pool = Pool(X_train_split, y_train_split)
        val_pool = Pool(X_val_split, y_val_split)
        model.fit(train_pool, eval_set=val_pool, verbose=100)
    else:
        model.fit(X_train_split, y_train_split)

    # Predict on validation set
    val_predictions = model.predict(X_val_split)
    # Take the square root of the MSE explicitly: the `squared=False` argument
    # of mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, where passing it raises a TypeError
    rmse = float(np.sqrt(mean_squared_error(y_val_split, val_predictions)))
    results[model_name] = rmse
    print(f"{model_name} RMSE: {rmse}")

# Choose the best model based on validation RMSE (lower is better)
best_model_name = min(results, key=results.get)
print(f"Best model: {best_model_name} with RMSE: {results[best_model_name]}")
best_model = models[best_model_name]

**PREDICTIONS**

In [None]:
# Score the aligned test features with the model that won on validation RMSE
final_predictions = best_model.predict(X_test)

# Write the predictions into the submission template and save it to Drive.
# Assumes the rows of sample_submission are in the same order as test_data - TODO confirm
submission_path = '/content/drive/MyDrive/YTSP/Submissions.csv'
sample_submission['target'] = final_predictions
sample_submission.to_csv(submission_path, index=False)

print("Model training and predictions completed successfully.")