## Import Libraries

In [228]:
# ! pip install xgboost
# ! pip install gzip

In [229]:
# !pip install feature-engine

In [230]:
import os

import pickle

import warnings

import numpy as np

import pandas as pd

import xgboost as xgb

import boto3

import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer,
    
)

from feature_engine.outliers import Winsorizer
from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder
)
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance

import matplotlib.pyplot as plt

# we import sagemaker for hyper paramete tuning
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
    )


## Display Settings

In [231]:
# Now we do display settingab
# This pandas function show the all columns forcefully
pd.set_option('display.max_columns',None)

# We know sciket learn transformers return numpy array so we use set config funtion to convert this array into a dataframe
sklearn.set_config(transform_output = 'pandas')

# This line of code ignore all thee warnings
warnings.filterwarnings('ignore')

## Read the Datasets

In [232]:
train = pd.read_csv('train_data.csv')
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-21,Banglore,New Delhi,08:55:00,19:10:00,615,1.0,In-flight meal not included,7832
1,Jet Airways,2019-03-27,Delhi,Cochin,17:30:00,04:25:00,655,1.0,In-flight meal not included,6540
2,Goair,2019-03-09,Banglore,New Delhi,11:40:00,14:35:00,175,0.0,No Info,7305
3,Air India,2019-06-12,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info,8366
4,Jet Airways,2019-03-12,Banglore,New Delhi,22:55:00,07:40:00,525,1.0,In-flight meal not included,11087
...,...,...,...,...,...,...,...,...,...,...
6690,Jet Airways,2019-03-21,Delhi,Cochin,10:45:00,18:50:00,1925,2.0,No Info,11093
6691,Air India,2019-05-01,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info,8891
6692,Jet Airways,2019-06-01,Delhi,Cochin,14:00:00,19:00:00,300,1.0,In-flight meal not included,10262
6693,Air Asia,2019-06-24,Delhi,Cochin,07:55:00,13:25:00,330,1.0,No Info,6152


In [233]:
validation = pd.read_csv('validation_data.csv')
validation

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-06-24,Delhi,Cochin,20:25:00,01:30:00,305,1.0,No Info,5054
1,Multiple Carriers,2019-06-12,Delhi,Cochin,09:45:00,22:30:00,765,1.0,No Info,9646
2,Jet Airways,2019-03-12,Banglore,New Delhi,22:55:00,15:15:00,980,1.0,In-flight meal not included,11087
3,Multiple Carriers,2019-06-06,Delhi,Cochin,13:00:00,21:00:00,480,1.0,No Info,13587
4,Jet Airways,2019-05-18,Delhi,Cochin,23:05:00,04:25:00,1760,2.0,No Info,16704
...,...,...,...,...,...,...,...,...,...,...
1669,Spicejet,2019-05-01,Chennai,Kolkata,09:45:00,12:00:00,135,0.0,No Info,3597
1670,Indigo,2019-05-01,Kolkata,Banglore,08:10:00,13:00:00,290,1.0,No Info,5069
1671,Jet Airways,2019-05-27,Delhi,Cochin,05:30:00,12:35:00,425,2.0,In-flight meal not included,15544
1672,Jet Airways,2019-06-12,Mumbai,Hyderabad,19:35:00,21:05:00,90,0.0,In-flight meal not included,3210


In [234]:
test = pd.read_csv('test_data.csv')
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-06,Banglore,New Delhi,08:00:00,08:15:00,1455,1.0,No Info,17996
1,Spicejet,2019-06-06,Kolkata,Banglore,22:20:00,00:40:00,140,0.0,No Info,3873
2,Indigo,2019-03-18,Kolkata,Banglore,05:30:00,08:20:00,170,0.0,No Info,4462
3,Indigo,2019-06-27,Chennai,Kolkata,19:35:00,21:55:00,140,0.0,No Info,3597
4,Indigo,2019-05-06,Kolkata,Banglore,15:15:00,17:45:00,150,0.0,No Info,4804
...,...,...,...,...,...,...,...,...,...,...
2088,Jet Airways,2019-05-27,Delhi,Cochin,19:15:00,12:35:00,1040,1.0,In-flight meal not included,12898
2089,Multiple Carriers,2019-06-27,Delhi,Cochin,11:25:00,19:15:00,470,1.0,No Info,7155
2090,Jet Airways,2019-06-03,Delhi,Cochin,02:15:00,04:25:00,1570,1.0,In-flight meal not included,11627
2091,Multiple Carriers,2019-06-06,Delhi,Cochin,15:15:00,01:30:00,615,1.0,No Info,6795


## Preprocessing Operations

In [235]:
# airline

air_tranformer = Pipeline(steps=[
    # in this step we handle the all missing values with most frequent categorey
    ('imputer',SimpleImputer(strategy='most_frequent')),
    # in this step we group the rare categories with name other
    ('grouper',RareLabelEncoder(tol=0.1,replace_with='other',n_categories=2)),
    # in this step we apply the onehot encoder to convert the categories into numbers
    ('encoder',OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
])
# air_tranformer.fit_transform(x_train.loc[:,['airline']])

# date_of_journy

feature_to_extract = ['month','week','day_of_week','day_of_year']

doj_transformer = Pipeline(steps=[
    ('dt',DatetimeFeatures(features_to_extract=feature_to_extract,yearfirst=True,format='mixed')),
    ('scaler',MinMaxScaler())
])
# doj_transformer.fit_transform(x_train.loc[:,['date_of_journey']])

# source & destination

location_pipe = Pipeline(steps=[
    # we label the rare values with name other
    ('grouper',RareLabelEncoder(tol=0.1,replace_with='other',n_categories = 2)),
    # Now we perform mean encoding it calculate the mean value of each category regarding target column
    ('encoder',MeanEncoder()),
    ('transformer',PowerTransformer())
])
# location_pipe.fit_transform(location_subset,y_train)

def is_north(data):
    columns = data.columns.to_list()
    north_cities = ['Delhi','Kolkata','Mumbai','New Delhi']
    return(
        data
        .assign(**{
            f'{col}_is_north' : data.loc[:,col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns = columns)
    )

# is_north(location_subset)
# FunctionTransformer(func=is_north).fit_transform(location_subset)

location_transformer = FeatureUnion(transformer_list=[
    ('part1',location_pipe),
    ('part2',FunctionTransformer(func=is_north))
])
# location_transformer.fit_transform(location_subset,y_train)

# dept_time & arrival_time

time_pipe = Pipeline(steps=[
    ('dt',DatetimeFeatures(features_to_extract=['hour','minute'])),
    ('scaler',MinMaxScaler())
])
# time_pipe.fit_transform(time_subset)

def part_of_day(data,morning=4,noon=12,evening=16,night=20):
    columns = data.columns.to_list()
    x_temp = data.assign(**{
        col :pd.to_datetime(data.loc[:,col]).dt.hour
        for col in columns
    })
    return (
        x_temp
        .assign(**{
            f'{col}_part_of_day' :np.select(
                [x_temp.loc[:,col].between(morning,noon,inclusive='left'),
                x_temp.loc[:,col].between(noon,evening,inclusive='left'),
                x_temp.loc[:,col].between(evening,night,inclusive='left')],
                ['morning','afternoon','evening'],
                default='night'
                
            )
            for col in columns
        })
        .drop(columns=columns)
    )
# part_of_day(time_subset)
# FunctionTransformer(func=part_of_day).fit_transform(time_subset)

time_pipe2 = Pipeline(steps=[
    ('part',FunctionTransformer(func=part_of_day)),
    ('encoder',CountFrequencyEncoder()),
    ('scaler',MinMaxScaler())
])
# time_pipe2.fit_transform(time_subset)

time_transformer = FeatureUnion(transformer_list=[
    ('part1',time_pipe),
    ('part2',time_pipe2)
])
# time_transformer.fit_transform(time_subset)

# duration

class RBFPercentileSimilarity(BaseEstimator,TransformerMixin):
    def __init__(self,variables=None,percentile=[.25,.50,.75],gamma=0.1):
        self.variables = variables
        self.percentile = percentile
        self.gamma = gamma


    def fit(self,X,y=None):
        if not self.variables:
            self.variables = X.select_dtypes(include='number').columns.to_list()

        self.refrence_values_ = {
            col:X.loc[:,col].quantile(self.percentile).values.reshape(-1,1)
            for col in self.variables
        }
        return self


    def transform(self,X):
        objects = []
        for col in self.variables:
            columns = [f"{col}_rbf_{int(percentile*100)}" for percentile in self.percentile ]
            obj = pd.DataFrame(
                data = rbf_kernel(X.loc[:,[col]],Y=self.refrence_values_[col],gamma = self.gamma),
                columns = columns
            )
            objects.append(obj)
        return pd.concat(objects,axis=1)
    
    
def duration_catgory(data,short=180,med=400):
    return (
        data
        .assign(
            duration_cat=np.select([data.duration.lt(short),
                                    data.duration.between(short,med,inclusive='left')],
                                    ['short','medium'],
                                    default='long')
        )
        .drop(columns='duration')
    )


duration_pipe1 = Pipeline(steps=[
    ('rbf',RBFPercentileSimilarity()),
    ('scaler',PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
    ('cat',FunctionTransformer(func=duration_catgory)),
    ('encoder',OrdinalEncoder(categories=[['short','medium','long']]))
])
duration_union = FeatureUnion(transformer_list=[
    ('part1',duration_pipe1),
    ('part2',duration_pipe2),
    ('part3',StandardScaler()),
])

duration_transformer = Pipeline(steps=[
    ('outliers',Winsorizer(capping_method='iqr',fold=1.5)),
    ('imputer',SimpleImputer(strategy='median')),
    ('union',duration_union)
])
# duration_transformer.fit_transform(x_train.loc[:,['duration']])

# total_stops

def is_direct(X):
    return(
        X
        .assign(
            is_direct_flight = X.total_stops.eq(0).astype(int)
        )
    )

total_stop_transformer = Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('fun',FunctionTransformer(func=is_direct))
])
# total_stop_transformer.fit_transform(x_train.loc[:,['total_stops']])

# additiona_info

info_pipe1 = Pipeline(steps=[
    ('group',RareLabelEncoder(tol=0.1,n_categories=2,replace_with='other')),
    ('encoder',OneHotEncoder(handle_unknown='ignore',sparse_output=False))
])
# info_pipe1.fit_transform(x_train.loc[:,['additional_info']])

def have_info(data):
    return (
        data
        .assign(additional_info = data.additional_info.ne('No Info').astype(int))
    )

info_union = FeatureUnion(transformer_list=[
    ('part1',info_pipe1),
    ('part2',FunctionTransformer(func=have_info))
])


info_transformer = Pipeline(steps=[
    ('impute',SimpleImputer(strategy='constant',fill_value='unknown')),
    ('union',info_union)
])
# info_transformer.fit_transform(x_train.loc[:,['additional_info']])

# colummn transformer

column_transformer = ColumnTransformer(transformers=[
    ('air',air_tranformer,['airline']),
    ('doj',doj_transformer,['date_of_journey']),
    ('location',location_transformer,['source','destination']),
    ('time',time_transformer,['dep_time','arrival_time']),
    ('duration',duration_transformer,['duration']),
    ('stops',total_stop_transformer,['total_stops']),
    ('info',info_transformer,['additional_info'])
],remainder='passthrough')

# feature selector

estimator = RandomForestRegressor(n_estimators=10,max_depth=3,random_state=42)

selector = SelectBySingleFeaturePerformance(
    estimator = estimator,
    scoring = 'r2',
    threshold = 0.1
)

# preprocessor

preprocessor = Pipeline(steps=[
    ('ct',column_transformer),
    ('select',selector)
])

In [236]:
preprocessor.fit(
    train.drop(columns = 'price'),
    train.price.copy()
)

In [237]:
preprocessor.transform(train.drop(columns = 'price'))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_other,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,duration__duration_rbf_25,duration__duration_cat,duration__duration,stops__total_stops,stops__is_direct_flight
0,0.0,1.0,0.0,0.176471,0.169492,-0.857930,-0.736484,-0.364262,2.0,-0.033916,1.0,0
1,0.0,1.0,0.0,0.235294,0.220339,1.065418,1.061694,-0.364262,2.0,0.046422,1.0,0
2,0.0,0.0,1.0,0.058824,0.067797,-0.857930,-0.736484,2.373008,0.0,-0.917631,0.0,1
3,0.0,0.0,0.0,0.882353,0.872881,-0.203928,-0.224351,-0.364262,2.0,-0.174507,1.0,0
4,0.0,1.0,0.0,0.117647,0.093220,-0.857930,-0.736484,-0.364262,2.0,-0.214676,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6690,0.0,1.0,0.0,0.176471,0.169492,1.065418,1.061694,-0.364262,2.0,2.597145,2.0,0
6691,0.0,0.0,0.0,0.529412,0.516949,-0.203928,-0.224351,-0.364262,2.0,-0.174507,1.0,0
6692,0.0,1.0,0.0,0.764706,0.779661,1.065418,1.061694,-0.364262,1.0,-0.666576,1.0,0
6693,0.0,0.0,1.0,1.000000,0.974576,1.065418,1.061694,-0.364262,1.0,-0.606322,1.0,0


## Preprcess Data and Upload to Bucket

In [238]:
BUCKET_NAME = 'sagemaker-flight-prediction-bucket'

DATA_PREFIX = 'data'

In [239]:
def get_file_name(name):
    return f'{name}-pre.csv'

In [240]:
def export_data(data,name,pre):
#     Split data into X and Y subsets
    X = data.drop(columns='price')
    y = data.price.copy()
#     Now preprocess the data
#     transformation
    X_pre = pre.transform(X)
#     exporting
    file_name = get_file_name(name)
    (
#         sagemaker need the output column first so we convert our data in this format
        y
        .to_frame()
        .join(X_pre)
#         sagemaker needs the data in this fomat so we remove the header
        .to_csv(file_name,index=False,header=False)
    )
    

In [241]:
# This functio use to upload a file

def upload_to_bucket(name):
#     file name to upload
    file_name = get_file_name(name)
    (
#         boto3 is alibrary that are use to aws for python programming
        boto3
#         session is used for this current file session
        .Session()
#         resource we use what resource we want to use of aws
        .resource('s3')
#         bucket is reource where we want to store our data in s3 bucket
        .Bucket(BUCKET_NAME)
#         object mention the folder or path wher we store our data
        .Object(os.path.join(DATA_PREFIX,f'{name}/{name}.csv'))
#         uplod use to upload a file
        .upload_file(file_name)
    )

In [242]:
# function for exporting and uploading together
def export_upload_bucket(data,name,pre):
    export_data(data,name,pre)
    upload_to_bucket(name)

In [243]:
export_upload_bucket(train,'train',preprocessor)

In [244]:
pd.read_csv('train-pre.csv')

Unnamed: 0,7832,0.0,1.0,0.0.1,0.17647058823529405,0.1694915254237288,-0.8579302780061735,-0.7364841788678915,-0.36426219153366024,2.0,-0.03391591517927909,1.0.1,0
0,6540,0.0,1.0,0.0,0.235294,0.220339,1.065418,1.061694,-0.364262,2.0,0.046422,1.0,0
1,7305,0.0,0.0,1.0,0.058824,0.067797,-0.857930,-0.736484,2.373008,0.0,-0.917631,0.0,1
2,8366,0.0,0.0,0.0,0.882353,0.872881,-0.203928,-0.224351,-0.364262,2.0,-0.174507,1.0,0
3,11087,0.0,1.0,0.0,0.117647,0.093220,-0.857930,-0.736484,-0.364262,2.0,-0.214676,1.0,0
4,4544,0.0,1.0,0.0,0.411765,0.432203,-0.857930,-1.838895,-0.364262,1.0,-0.897546,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6689,11093,0.0,1.0,0.0,0.176471,0.169492,1.065418,1.061694,-0.364262,2.0,2.597145,2.0,0
6690,8891,0.0,0.0,0.0,0.529412,0.516949,-0.203928,-0.224351,-0.364262,2.0,-0.174507,1.0,0
6691,10262,0.0,1.0,0.0,0.764706,0.779661,1.065418,1.061694,-0.364262,1.0,-0.666576,1.0,0
6692,6152,0.0,0.0,1.0,1.000000,0.974576,1.065418,1.061694,-0.364262,1.0,-0.606322,1.0,0


In [245]:
export_upload_bucket(test,'test',preprocessor)

In [246]:
export_upload_bucket(validation,'val',preprocessor)

In [247]:
export_upload_bucket(test,'test',preprocessor)

## Model and Hyper Parameter Tuning Setup

In [248]:
# we create session that tell which environment we use
# we use this information to use the sagemaker builtin model
session = sagemaker.Session()  #this is the working session

region_name = session.boto_region_name  #region name

In [249]:
# After training where we save our model
# we save our model inside the bucket
# we write code for sving our model inside the bucket
# we specify the s3 location likes3://name of bucket/name of folder/inside another folder
# in this code model is new folder created inside the bucket
# output is the folder that is created inside the model folder
output_path = f's3://{BUCKET_NAME}/model/output'  #this is the address of s3 bucket where we store our model

## Model Setup

In [250]:
model = Estimator(
#     sagemaker has a lot of algortihm these algorithm are store in the form of docker container as images
#     so we call our algorithm in the form of image
#     when we use algorithms we call it 
#     uri satnd for unifrom resource identifier
#     this is the method to call algorithm
    image_uri = sagemaker.image_uris.retrieve('xgboost',region_name,'1.2-1'), #this line of code inform sagemaker we want to use xgboost
#     this line of code mention the i am role that we created when we create the nootebook inside the sagemaker
    role = sagemaker.get_execution_role(),
#     this line of code represent how many EC2 instances require to train our model
    instance_count = 1,
#     this line of code represent which type of EC2 instance we want to use for training and tuning
    instance_type = 'ml.m4.xlarge',
#     this line of code represent the s3 storage size
    volume_size = 5,
#     this line of code represent whre we want to save our model
    output_path = output_path,
#     following these three parameters are used to save our cost
    use_spot_instances = True,
    max_run = 300,  #this line of code means runs it maximum 300 secode
    max_wait= 600,
#     this line represent the environment which we working
    sagemaker_session = session
)

## Hyper Parameters 

In [251]:
# this line of tune the parameters of our model
model.set_hyperparameters(
#     in this line reg linear mean simply mean squared error 
    objective = 'reg:linear',
#     number of round means number of base estimators
    num_round = 10,
#     eta means learning rate
    eta = 0.1,
#     max depth represent the depth of the tree
    max_depth = 5,
#     sub sample mean xgoboost have multiple tree we want to train 
#     eache tree with random data set so we set the value of sample subset
#     we use random subsets of data for each tree to avoid overfitting
    subsample = 0.8,
#     this line randomly select the columns as subsample of data
    colsample_bytree = 0.8,
#     alpha value is the regularization value
    alpha = 0.1
    
)

## Hyper Parameters Tuner

In [252]:
hyperparameter_ranges = {
#     this line tune the learning rate value it is floating value so we use ContinuousParameter
    'eta' : ContinuousParameter(0.05,0.2),
#     this line tune the L2 regularization value it is floating value so we use ContinuousParameter
    'alpha' : ContinuousParameter(0,1),
#     this line set the tree depth it is an integer value so we use Integerparameter
    'max_depth' : IntegerParameter(3,5)
}

In [253]:
# now we write the code how we use hyper parameter tuner
# we call the HyperparameterTuner class
tuner = HyperparameterTuner(
#     estimator means what to tune we want to tune our model so pass the model to estimator
    estimator = model,
#     we want to check the performance on validation data so we use validation rmse final result will be reported on test set
    objective_metric_name = 'validation:rmse',
#     this line represent which hyper parameter ranges we want to use
    hyperparameter_ranges = hyperparameter_ranges,
#     Bayesian rndomly select our hyperparameters and then evaluate it 
#     and based on the score it will assign probability to each parameter
#     and then it decide what next parameter we use
#     in each iteration it will test each parameter and evaluate it result and decide which value can improve the result
    strategy = 'Bayesian',
#     objective means we want to minimize the rmse error
    objective_type = 'Minimize'
)

## Data Channels

In [254]:
# channel mean we inform the sagemaker training date where to place
# we show the path of s3 bucket to our model that is called channel
# every data has different channel training data has different channel and test data and validation data as well
def get_data_channel(name): 
#     we give the bucket path to our model so
#     we tell the data set location where is it place by using this line of code
    bucket_path = f's3://{BUCKET_NAME}/{DATA_PREFIX}/{name}'
#     this line returb the address
#     this line also show the content type of file in the s3 bucket location 
    return TrainingInput(bucket_path,content_type ='csv' )

In [255]:
# lets we create train data channel
train_data_channel = get_data_channel('train')
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7f2a4ec8f580>

In [256]:
val_data_channel = get_data_channel('val')
val_data_channel

<sagemaker.inputs.TrainingInput at 0x7f2a4ec8e5f0>

In [257]:
# Now we put these data channels together
# our model tuning test on training and evaluate on validation data
# so we give differnt channel to our model for training and validation
data_channels = {
    'train' : train_data_channel,
    'validation' : val_data_channel
}

## Train and Tune the Model

In [258]:
# Now we train and tune our model
# Normally we give the x_train and y_train but now we give our model data channels
tuner.fit(data_channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


.......................................!


In [259]:
# if we wnt to deploy our model on sagemaker we simply write this code
# in our case we will not deploy model like this we dploy on streamlit
# tuner.best_estimator().deploy()

## Model Evaluation

In [260]:
# model evaluation show the

with open('xgboost-model','rb') as f :
    best_model = pickle.load(f)

best_model

<xgboost.core.Booster at 0x7f2a4ec56470>

In [261]:
pd.read_csv('train-pre.csv').iloc[:,1:]

Unnamed: 0,0.0,1.0,0.0.1,0.17647058823529405,0.1694915254237288,-0.8579302780061735,-0.7364841788678915,-0.36426219153366024,2.0,-0.03391591517927909,1.0.1,0
0,0.0,1.0,0.0,0.235294,0.220339,1.065418,1.061694,-0.364262,2.0,0.046422,1.0,0
1,0.0,0.0,1.0,0.058824,0.067797,-0.857930,-0.736484,2.373008,0.0,-0.917631,0.0,1
2,0.0,0.0,0.0,0.882353,0.872881,-0.203928,-0.224351,-0.364262,2.0,-0.174507,1.0,0
3,0.0,1.0,0.0,0.117647,0.093220,-0.857930,-0.736484,-0.364262,2.0,-0.214676,1.0,0
4,0.0,1.0,0.0,0.411765,0.432203,-0.857930,-1.838895,-0.364262,1.0,-0.897546,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6689,0.0,1.0,0.0,0.176471,0.169492,1.065418,1.061694,-0.364262,2.0,2.597145,2.0,0
6690,0.0,0.0,0.0,0.529412,0.516949,-0.203928,-0.224351,-0.364262,2.0,-0.174507,1.0,0
6691,0.0,1.0,0.0,0.764706,0.779661,1.065418,1.061694,-0.364262,1.0,-0.666576,1.0,0
6692,0.0,0.0,1.0,1.000000,0.974576,1.065418,1.061694,-0.364262,1.0,-0.606322,1.0,0


In [262]:
def evaluate_model(name):
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
#     xgboost receive data in form of matrix so we write this code
    X = xgb.DMatrix(data.iloc[:,1:],enable_categorical=True)
    y = data.iloc[:,0].copy()
    
    pred = best_model.predict(X)
    
    return r2_score(y,pred)

In [263]:
evaluate_model('train')

0.5016905069351196

In [264]:
evaluate_model('val')

0.454595148563385

In [265]:
evaluate_model('test')

0.49679362773895264