# **XGBoost Regressor**

## **1. Importing Libraries**

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## **2. Reading Data and Previewing the First Rows**

In [3]:
# Read the laptop dataset from a CSV in the working directory, then show the
# first rows as the cell's output.
csv_path = 'Laptop_price.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Brand,Processor_Speed,RAM_Size,Storage_Capacity,Screen_Size,Weight,Price
0,Asus,3.830296,16,512,11.185147,2.641094,17395.093065
1,Acer,2.912833,4,1000,11.311372,3.260012,31607.605919
2,Lenovo,3.241627,4,256,11.853023,2.029061,9291.023542
3,Acer,3.806248,16,512,12.28036,4.573865,17436.728334
4,Acer,3.268097,32,1000,14.990877,4.193472,32917.990718


## **3. Shape of Data**

In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(1000, 7)

## **4. Info about Data**

In [15]:
# Per-column non-null counts and dtypes, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Brand             1000 non-null   object 
 1   Processor_Speed   1000 non-null   float64
 2   RAM_Size          1000 non-null   int64  
 3   Storage_Capacity  1000 non-null   int64  
 4   Screen_Size       1000 non-null   float64
 5   Weight            1000 non-null   float64
 6   Price             1000 non-null   float64
dtypes: float64(4), int64(2), object(1)
memory usage: 54.8+ KB


## **5. Null Values Distribution**

In [16]:
# Count missing values per column.
df.isna().sum()

Brand               0
Processor_Speed     0
RAM_Size            0
Storage_Capacity    0
Screen_Size         0
Weight              0
Price               0
dtype: int64

## **6. Description of Data**

In [17]:
# include='all' summarises the object column (Brand: unique/top/freq) alongside
# the numeric columns rather than the numeric columns only.
df.describe(include='all')

Unnamed: 0,Brand,Processor_Speed,RAM_Size,Storage_Capacity,Screen_Size,Weight,Price
count,1000,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
unique,5,,,,,,
top,Dell,,,,,,
freq,210,,,,,,
mean,,2.750611,15.5,584.576,14.05679,3.466919,19604.187963
std,,0.731796,10.988665,313.438517,1.705882,0.866541,9406.06488
min,,1.51158,4.0,256.0,11.012111,2.00056,8570.01295
25%,,2.089246,8.0,256.0,12.635523,2.717211,10114.012948
50%,,2.760885,16.0,512.0,14.099643,3.46463,17287.241878
75%,,3.36261,32.0,1000.0,15.52859,4.212583,31566.214754


## **7. Train Test Split**

In [18]:
# Separate the predictors from the target, then hold out 20% of the rows for
# evaluation. The fixed random_state makes the split repeatable.
features = df.drop(columns=['Price'])
target = df['Price']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

## **8. Separating Numerical and Categorical Columns**

In [23]:
# Select columns by name rather than by position. These names map to the same
# columns the positional indices [1, 2, 3, 4, 5] and [0] pointed at once
# 'Price' is dropped, but a reordered or extended CSV now raises a clear
# "column not found" error instead of silently feeding the wrong column into
# the wrong preprocessing branch.
numerical_columns = [
    'Processor_Speed',
    'RAM_Size',
    'Storage_Capacity',
    'Screen_Size',
    'Weight',
]
categorical_columns = ['Brand']

## **9. Creating Pipelines**

In [24]:
# Numeric branch: replace missing values with the column mean, then apply
# min-max scaling.
numeric_steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler()),
]
handle_numerical = Pipeline(steps=numeric_steps)

In [25]:
# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode.
# handle_unknown='ignore' encodes a category that was not present when the
# encoder was fitted as an all-zero row. The default ('error') would make
# transform/predict raise on any value missing from the training split.
handle_categorical = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore'))
])

## **10. Combining Pipelines in Column Transformer**

In [26]:
# Route each column group through its own pipeline; every entry is
# (branch name, transformer, columns it applies to).
column_branches = [
    ('numerical', handle_numerical, numerical_columns),
    ('categorical', handle_categorical, categorical_columns),
]
preprocessing = ColumnTransformer(transformers=column_branches)

## **11. Modeling and its Evaluation**

In [27]:
# XGBoost regressor constructed with no arguments, i.e. library defaults.
model = XGBRegressor()

In [28]:
# Chain the preprocessing and the regressor into a single estimator.
pipe = make_pipeline(preprocessing, model)

In [29]:
# Fit the preprocessing steps and the regressor on the training split.
pipe.fit(X_train, y_train)

In [30]:
# Predict the target for the held-out test split.
y_pred = pipe.predict(X_test)

In [31]:
# Score the held-out predictions with three regression metrics.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report each metric on its own line, in the order MAE, MSE, R2.
for label, value in (
    ('Mean Absolute Error: ', mae),
    ('Mean Squared Error: ', mse),
    ('R2 Score: ', r2),
):
    print(label, value)

Mean Absolute Error:  168.8684188807837
Mean Squared Error:  43257.62310296692
R2 Score:  0.9995235763622714
