In [None]:
#first importing the libraries that are required
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
# Load the competition's test and train CSV files and report their dimensions.
tst_dta = pd.read_csv("/kaggle/input/week4-practice-hackathon-2024/Test.csv")
print(f"Test data shape: {tst_dta.shape}")

training_dataset = pd.read_csv("/kaggle/input/week4-practice-hackathon-2024/Train.csv")
print(f"Training data shape: {training_dataset.shape}")

# Per-column count of missing values in the training data.
missing_per_column = training_dataset.isna().sum()
print(missing_per_column)

# Separate the target column ('class') from the remaining feature columns.
y = training_dataset['class']
X = training_dataset.drop(columns=['class'])

# Preview the first rows of the full training frame (target column included).
print(training_dataset.head())
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Preprocessing: mean-impute missing values, then standardise the features.
preprocessing_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Fit the pipeline on the training split only, then apply the already-fitted
# pipeline to the validation split so no validation statistics leak in.
X_tr = preprocessing_pipeline.fit_transform(X_tr)
X_val = preprocessing_pipeline.transform(X_val)
# Hyperparameter search space for the XGBoost classifier.
# RandomizedSearchCV samples combinations from these candidate lists, so no
# explicit loop over learning rates is required.
# NOTE(review): the learning-rate candidates only span [0.00025, 0.00026),
# which is very small for at most 250 trees -- confirm this range is intended.
learn_rate = np.linspace(0.00025, 0.00026, num=1000, endpoint=False)
param_grid = {
    'colsample_bytree': [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8,0.84,0.86,0.83,0.87, 0.85, 0.9, 0.95, 1.0],
    'learning_rate': learn_rate,
    'n_estimators': [90, 95, 100,103,107, 105, 110, 115, 150, 200, 250],
    'subsample': [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75,0.74,0.76,0.73,0.78, 0.8, 0.85, 0.9, 0.95, 1.0],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
}

xgb_classifier = XGBClassifier(random_state=42)

# Randomized search: 15 sampled configurations, 3-fold cross-validation,
# ranked by F1 score; n_jobs=-1 uses all available cores.
random_search = RandomizedSearchCV(estimator=xgb_classifier, param_distributions=param_grid,
                                   n_iter=15, cv=3, scoring='f1', random_state=42, n_jobs=-1)
random_search.fit(X_tr, y_tr)
print("Best Parameters:", random_search.best_params_)

# best_estimator_ has already been refit on the whole of X_tr / y_tr
# (refit=True is the RandomizedSearchCV default), so no additional fit call is
# needed. The previous code called best_xgb_classifier.fit(...) before the
# name was assigned, which raised a NameError on a fresh run.
best_xgb_classifier = random_search.best_estimator_

# Predictions on the held-out validation split, used later for the F1 score.
y_prd = best_xgb_classifier.predict(X_val)

# Evaluation metric: F1 score of the tuned model on the validation split.
f1 = f1_score(y_true=y_val, y_pred=y_prd)

# Detach the identifier column from the test features (it is kept for the
# submission file), then run the features through the fitted preprocessing.
Index = tst_dta.pop('Index')
tst_dta = preprocessing_pipeline.transform(tst_dta)

print(f"F1 score after tuning: {f1}")

# Predict the test set and pair each prediction with its original Index.
y_tst = best_xgb_classifier.predict(tst_dta)
submission = pd.DataFrame({'Index': Index, 'class': y_tst})

# Write the submission file without the DataFrame's row index.
submission.to_csv("submission.csv", index=False)
print("Final file has been created successfully!!!")

: 