# **XGBoost Classifier**

## **1. Importing Libraries**

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

## **2. Reading Data and Header View**

In [48]:
# Load the weather dataset; the relative path is resolved against the
# kernel's working directory. Then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='weather_classification_data.csv')
df.head(n=5)

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy


## **3. Shape of Data**

In [49]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(13200, 11)

## **4. Info about Data**

In [50]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  float64
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  float64
 4   Cloud Cover           13200 non-null  object 
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  object 
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  object 
 10  Weather Type          13200 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 1.1+ MB


## **5. Null Values Distribution**

In [51]:
# Count missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

Temperature             0
Humidity                0
Wind Speed              0
Precipitation (%)       0
Cloud Cover             0
Atmospheric Pressure    0
UV Index                0
Season                  0
Visibility (km)         0
Location                0
Weather Type            0
dtype: int64

## **6. Description of Data**

In [52]:
# include='all' summarises the object (categorical) columns as well, so the
# table reports unique/top/freq for them alongside the numeric statistics.
df.describe(include='all')

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
count,13200.0,13200.0,13200.0,13200.0,13200,13200.0,13200.0,13200,13200.0,13200,13200
unique,,,,,4,,,4,,3,4
top,,,,,overcast,,,Winter,,inland,Rainy
freq,,,,,6090,,,5610,,4816,3300
mean,19.127576,68.710833,9.832197,53.644394,,1005.827896,4.005758,,5.462917,,
std,17.386327,20.194248,6.908704,31.946541,,37.199589,3.8566,,3.371499,,
min,-25.0,20.0,0.0,0.0,,800.12,0.0,,0.0,,
25%,4.0,57.0,5.0,19.0,,994.8,1.0,,3.0,,
50%,21.0,70.0,9.0,58.0,,1007.65,3.0,,5.0,,
75%,31.0,84.0,13.5,82.0,,1016.7725,7.0,,7.5,,


## **7. Label-Encoding the Target Column**

In [53]:
# Turn the string class labels into integer codes, which is the form the
# classifier is trained on. The fitted encoder stays available as `le`, so the
# codes can be mapped back with `le.inverse_transform(...)`.
# NOTE: the column is overwritten in place. Re-running this cell without
# reloading `df` refits the encoder on the integer codes, and `le.classes_`
# then no longer holds the original label names.
le = LabelEncoder().fit(df['Weather Type'])
df['Weather Type'] = le.transform(df['Weather Type'])

## **8. Separating Input and Output Columns**

In [54]:
# Target vector: the (already encoded) weather type.
y = df['Weather Type']
# Feature matrix: every remaining column; `df` itself is left untouched.
X = df.drop('Weather Type', axis=1)

## **9. Train Test Split**

In [55]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle,
# and therefore the split, repeatable across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

## **10. Separating Numerical and Categorical Columns**

In [56]:
# Feature columns grouped by how they are preprocessed. They are listed by
# name rather than by position, so the selection stays correct even if the
# column order of the CSV changes. ColumnTransformer accepts column names
# when it is given a DataFrame, which is what the train/test splits are.
# Columns with a numeric dtype (float64 / int64 in `df.info()`).
numerical_columns = [
    'Temperature',
    'Humidity',
    'Wind Speed',
    'Precipitation (%)',
    'Atmospheric Pressure',
    'UV Index',
    'Visibility (km)',
]
# Columns with object dtype (string categories).
categorical_columns = [
    'Cloud Cover',
    'Season',
    'Location',
]

## **11. Creating Pipelines**

In [57]:
# Numeric branch: fill gaps with the column mean, then rescale each feature
# with min-max scaling. Step names are kept as 'impute' and 'scale'.
numeric_steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler()),
]
handle_numerical = Pipeline(numeric_steps)

In [58]:
# Categorical branch: fill gaps with the most frequent category, then
# one-hot encode.
# handle_unknown='ignore' makes a category that was not present during fit
# encode as an all-zero row at predict time, instead of raising an error.
handle_categorical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore'))
])

## **12. Combining the Pipelines in a ColumnTransformer**

In [59]:
# Route each group of columns through its own preprocessing pipeline and
# concatenate the results. Columns not listed in either group are dropped
# (the ColumnTransformer default).
column_routes = [
    ('numerical', handle_numerical, numerical_columns),
    ('categorical', handle_categorical, categorical_columns),
]
preprocessing = ColumnTransformer(column_routes)

## **13. Modeling and its Evaluation**

In [60]:
# Gradient-boosted tree classifier with default hyperparameters.
# The seed is pinned to the same value used for the train/test split, so the
# fit stays reproducible if row/column subsampling is enabled later.
model = XGBClassifier(random_state=42)

In [61]:
# Chain preprocessing and the classifier into one estimator, so the same
# transformations are applied at fit and predict time. The step names are the
# lower-cased class names, i.e. the ones `make_pipeline` would generate.
pipe = Pipeline(steps=[
    ('columntransformer', preprocessing),
    ('xgbclassifier', model),
])

In [62]:
# Fit the preprocessing steps and the classifier on the training split only,
# so no statistics from the test rows are used.
pipe.fit(X_train, y_train)

In [63]:
# Predict encoded class labels for the held-out rows; the pipeline applies the
# already-fitted preprocessing before calling the classifier.
y_pred = pipe.predict(X_test)

In [64]:
# Per-class precision / recall / F1 on the held-out split. Rows are labelled
# with the integer codes produced by the label encoder.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.88      0.90      0.89       651
           1       0.90      0.90      0.90       647
           2       0.93      0.95      0.94       701
           3       0.94      0.90      0.92       641

    accuracy                           0.91      2640
   macro avg       0.91      0.91      0.91      2640
weighted avg       0.91      0.91      0.91      2640

