# In [None]:
import pandas as pd
import matplotlib.pyplot as plt  
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from skopt import BayesSearchCV
from skopt.space import Real,Categorical, Integer
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# Step 1: Read the train and test sets from their comma-separated files.
df_train = pd.read_csv('../Data/train_data.csv', sep=",")
df_test = pd.read_csv('../Data/test_data.csv', sep=",")

# Step 2: Split each frame into its feature columns (X) and the "Rain"
# target column (y).
X_train, y_train = df_train.drop("Rain", axis=1), df_train["Rain"]
X_test, y_test = df_test.drop("Rain", axis=1), df_test["Rain"]

# Step 3: Wrap the XGBoost classifier in a one-step pipeline. The step is
# named 'clf', so its hyperparameters are addressed as 'clf__<param>'.
pipeline = Pipeline(
    [('clf', XGBClassifier(random_state=47, eval_metric='mlogloss'))]
)

# Step 4: Hyperparameter ranges explored by the Bayesian search. Each key
# targets a parameter of the pipeline's 'clf' step.
search_space = dict(
    # Tree depth and step size (learning rate sampled on a log scale).
    clf__max_depth=Integer(2, 8),
    clf__learning_rate=Real(0.001, 1.0, prior='log-uniform'),
    # Row and column subsampling fractions.
    clf__subsample=Real(0.5, 1.0),
    clf__colsample_bytree=Real(0.5, 1.0),
    clf__colsample_bylevel=Real(0.5, 1.0),
    clf__colsample_bynode=Real(0.5, 1.0),
    # Regularisation terms.
    clf__reg_alpha=Real(0.0, 10.0),
    clf__reg_lambda=Real(0.0, 10.0),
    clf__gamma=Real(0.0, 10.0),
)

# Step 5: Run the Bayesian hyperparameter search (10 iterations, 3-fold
# cross-validation, scored by accuracy) on the training data.
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_space,
    n_iter=10,
    scoring='accuracy',
    cv=3,
    random_state=47,
)

opt.fit(X=X_train, y=y_train)

# Step 6: Report the search outcome and predict on the test set.
# The original bare expressions (opt.best_estimator_, opt.score(...),
# opt.predict_proba(...), ...) displayed nothing and discarded their results,
# so the search outcome is printed explicitly and the redundant calls dropped.
print("Best CV accuracy:", opt.best_score_)
print("Best parameters:", dict(opt.best_params_))

# Class predictions for the test features; reused by the evaluation and
# CSV-export steps.
predictions = opt.predict(X_test)

# Step 7: Evaluation
accuracy = accuracy_score(y_test, predictions)
print("XGBoost Accuracy:", accuracy)

# Derive the class labels from the data itself: every class that occurs in
# either the true or the predicted labels, in sorted order. Passing the same
# list as `labels` keeps `target_names` aligned with the report rows.
report_labels = sorted(set(y_test) | set(predictions))
label_names = [str(label) for label in report_labels]

print("Classification Report:")
print(
    classification_report(
        y_test,
        predictions,
        labels=report_labels,
        target_names=label_names,
        zero_division=0,
    )
)

# Confusion matrix
# Use one explicit label list for both the matrix and the axis ticks. By
# default confusion_matrix orders rows/columns by the union of true and
# predicted classes, so ticks taken from y_test alone could be misaligned
# when a class appears only in the predictions.
labels = sorted(set(y_test) | set(predictions))
cm = confusion_matrix(y_test, predictions, labels=labels)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')  # columns of cm are predicted classes
plt.ylabel('Actual')     # rows of cm are true classes
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()

# Step 8: Serialise the best estimator found by the search and write it to
# the artifacts directory as a pickle file.
with open("../Artifacts/xgboost_model.pkl", mode="wb") as model_file:
    model_file.write(pickle.dumps(opt.best_estimator_))

# Step 9: Export the test features alongside the actual and predicted "Rain"
# values so the two can be compared row by row.
comp_df = X_test.assign(
    Actual_Rain=y_test.values,
    Predicted_Rain=predictions,
)

comp_df.to_csv("../Artifacts/xgboost_prediction.csv", index=False)