In [1]:
import re
import pickle
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from xgboost import XGBRegressor
from datetime import date, datetime
from sklearn.impute import SimpleImputer
from IPython.core.display import display, HTML
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
%matplotlib inline
warnings.filterwarnings('ignore')
pd.pandas.set_option('display.max_rows', None)
pd.pandas.set_option('display.max_columns', None)
display(HTML('<style>.container{width : 100% ! important;}</style>'))

In [2]:
df = pd.read_csv('IndianPremierLeague.csv')
df.head()

Unnamed: 0,id,date,city,venue,1st_team,neutral_venue,2nd_team,toss_winner,toss_decision,inning,batting_team,bowling_team,batsman,non_striker,bowler,over,ball,batsman_runs,extra_runs,total_runs,non_boundary,is_wicket,dismissal_kind,player_dismissed,fielder,extras_type,runs,wickets,runs_last_5_overs,wickets_last_5_overs,winner,result,result_margin,player_of_match,eliminator,method,umpire1,umpire2,final_score,total_wickets
0,335982,18-04-2008,Bangalore,M Chinnaswamy Stadium,Royal Challengers Bangalore,0,Kolkata Knight Riders,Royal Challengers Bangalore,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,0,1,0,1,1,0,0,,,,legbyes,1,0,0,0,Kolkata Knight Riders,runs,140.0,BB McCullum,N,,Asad Rauf,RE Koertzen,222,3
1,335982,18-04-2008,Bangalore,M Chinnaswamy Stadium,Royal Challengers Bangalore,0,Kolkata Knight Riders,Royal Challengers Bangalore,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,2,0,0,0,0,0,,,,,1,0,1,0,Kolkata Knight Riders,runs,140.0,BB McCullum,N,,Asad Rauf,RE Koertzen,222,3
2,335982,18-04-2008,Bangalore,M Chinnaswamy Stadium,Royal Challengers Bangalore,0,Kolkata Knight Riders,Royal Challengers Bangalore,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,3,0,1,1,0,0,,,,wides,2,0,2,0,Kolkata Knight Riders,runs,140.0,BB McCullum,N,,Asad Rauf,RE Koertzen,222,3
3,335982,18-04-2008,Bangalore,M Chinnaswamy Stadium,Royal Challengers Bangalore,0,Kolkata Knight Riders,Royal Challengers Bangalore,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,4,0,0,0,0,0,,,,,2,0,2,0,Kolkata Knight Riders,runs,140.0,BB McCullum,N,,Asad Rauf,RE Koertzen,222,3
4,335982,18-04-2008,Bangalore,M Chinnaswamy Stadium,Royal Challengers Bangalore,0,Kolkata Knight Riders,Royal Challengers Bangalore,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,5,0,0,0,0,0,,,,,2,0,2,0,Kolkata Knight Riders,runs,140.0,BB McCullum,N,,Asad Rauf,RE Koertzen,222,3


In [3]:
df.columns

Index(['id', 'date', 'city', 'venue', '1st_team', 'neutral_venue', '2nd_team',
       'toss_winner', 'toss_decision', 'inning', 'batting_team',
       'bowling_team', 'batsman', 'non_striker', 'bowler', 'over', 'ball',
       'batsman_runs', 'extra_runs', 'total_runs', 'non_boundary', 'is_wicket',
       'dismissal_kind', 'player_dismissed', 'fielder', 'extras_type', 'runs',
       'wickets', 'runs_last_5_overs', 'wickets_last_5_overs', 'winner',
       'result', 'result_margin', 'player_of_match', 'eliminator', 'method',
       'umpire1', 'umpire2', 'final_score', 'total_wickets'],
      dtype='object')

In [4]:
df = df[['date', 'venue', 'toss_decision', 'inning', 'batting_team', 'bowling_team', 'over', 'ball', 'runs', 'wickets', 'runs_last_5_overs', 
         'wickets_last_5_overs', 'final_score']]

In [5]:
df.head()

Unnamed: 0,date,venue,toss_decision,inning,batting_team,bowling_team,over,ball,runs,wickets,runs_last_5_overs,wickets_last_5_overs,final_score
0,18-04-2008,M Chinnaswamy Stadium,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,1,0,0,0,222
1,18-04-2008,M Chinnaswamy Stadium,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,1,0,1,0,222
2,18-04-2008,M Chinnaswamy Stadium,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,2,0,2,0,222
3,18-04-2008,M Chinnaswamy Stadium,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,2,0,2,0,222
4,18-04-2008,M Chinnaswamy Stadium,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,2,0,2,0,222


In [6]:
df['year'] = df['date'].apply(lambda dateString : datetime.strptime(dateString,'%d-%m-%Y').year)
df.drop('date', axis = 1, inplace = True)

In [7]:
df.head()

Unnamed: 0,venue,toss_decision,inning,batting_team,bowling_team,over,ball,runs,wickets,runs_last_5_overs,wickets_last_5_overs,final_score,year
0,M Chinnaswamy Stadium,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,1,0,0,0,222,2008
1,M Chinnaswamy Stadium,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,1,0,1,0,222,2008
2,M Chinnaswamy Stadium,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,2,0,2,0,222,2008
3,M Chinnaswamy Stadium,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,2,0,2,0,222,2008
4,M Chinnaswamy Stadium,field,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,2,0,2,0,222,2008


In [8]:
df.columns

Index(['venue', 'toss_decision', 'inning', 'batting_team', 'bowling_team',
       'over', 'ball', 'runs', 'wickets', 'runs_last_5_overs',
       'wickets_last_5_overs', 'final_score', 'year'],
      dtype='object')

In [9]:
for col in ['batting_team', 'bowling_team']:
    t_list = []
    for i in df[col]:
        if i == 'Delhi Daredevils':
            t_list.append('Delhi Capitals')
        elif i == 'Rising Pune Supergiants':
            t_list.append('Rising Pune Supergiant')
        else:
            t_list.append(i)
    df[col] = t_list

In [10]:
t_list = []
for i in df['venue']:
    if i == 'M.Chinnaswamy Stadium':
        t_list.append('M Chinnaswamy Stadium')
    else:
        t_list.append(i)
df['venue'] = t_list

In [11]:
df = df[df['year'] >= 2016]

In [12]:
df.head()

Unnamed: 0,venue,toss_decision,inning,batting_team,bowling_team,over,ball,runs,wickets,runs_last_5_overs,wickets_last_5_overs,final_score,year
122434,Wankhede Stadium,bat,1,Mumbai Indians,Rising Pune Supergiant,0,1,1,0,1,0,121,2016
122435,Wankhede Stadium,bat,1,Mumbai Indians,Rising Pune Supergiant,0,2,1,0,1,0,121,2016
122436,Wankhede Stadium,bat,1,Mumbai Indians,Rising Pune Supergiant,0,3,3,0,3,0,121,2016
122437,Wankhede Stadium,bat,1,Mumbai Indians,Rising Pune Supergiant,0,4,7,0,7,0,121,2016
122438,Wankhede Stadium,bat,1,Mumbai Indians,Rising Pune Supergiant,0,5,8,0,8,0,121,2016


In [13]:
dummy = ['venue', 'toss_decision', 'batting_team', 'bowling_team']
df_dummy = pd.get_dummies(df[dummy], prefix_sep = '_', drop_first = False)
df = df.drop(dummy, axis = 1)
df = pd.concat([df, df_dummy], axis = 1)

In [14]:
df.head()

Unnamed: 0,inning,over,ball,runs,wickets,runs_last_5_overs,wickets_last_5_overs,final_score,year,venue_Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium,venue_Dubai International Cricket Stadium,venue_Eden Gardens,venue_Feroz Shah Kotla,venue_Green Park,venue_Holkar Cricket Stadium,venue_M Chinnaswamy Stadium,"venue_MA Chidambaram Stadium, Chepauk",venue_Maharashtra Cricket Association Stadium,"venue_Punjab Cricket Association IS Bindra Stadium, Mohali","venue_Rajiv Gandhi International Stadium, Uppal",venue_Saurashtra Cricket Association Stadium,venue_Sawai Mansingh Stadium,venue_Shaheed Veer Narayan Singh International Stadium,venue_Sharjah Cricket Stadium,venue_Sheikh Zayed Stadium,venue_Wankhede Stadium,toss_decision_bat,toss_decision_field,batting_team_Chennai Super Kings,batting_team_Delhi Capitals,batting_team_Gujarat Lions,batting_team_Kings XI Punjab,batting_team_Kolkata Knight Riders,batting_team_Mumbai Indians,batting_team_Rajasthan Royals,batting_team_Rising Pune Supergiant,batting_team_Royal Challengers Bangalore,batting_team_Sunrisers Hyderabad,bowling_team_Chennai Super Kings,bowling_team_Delhi Capitals,bowling_team_Gujarat Lions,bowling_team_Kings XI Punjab,bowling_team_Kolkata Knight Riders,bowling_team_Mumbai Indians,bowling_team_Rajasthan Royals,bowling_team_Rising Pune Supergiant,bowling_team_Royal Challengers Bangalore,bowling_team_Sunrisers Hyderabad
122434,1,0,1,1,0,1,0,121,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
122435,1,0,2,1,0,1,0,121,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
122436,1,0,3,3,0,3,0,121,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
122437,1,0,4,7,0,7,0,121,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
122438,1,0,5,8,0,8,0,121,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [15]:
df.shape

(71034, 48)

In [16]:
df.columns

Index(['inning', 'over', 'ball', 'runs', 'wickets', 'runs_last_5_overs',
       'wickets_last_5_overs', 'final_score', 'year',
       'venue_Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'venue_Dubai International Cricket Stadium', 'venue_Eden Gardens',
       'venue_Feroz Shah Kotla', 'venue_Green Park',
       'venue_Holkar Cricket Stadium', 'venue_M Chinnaswamy Stadium',
       'venue_MA Chidambaram Stadium, Chepauk',
       'venue_Maharashtra Cricket Association Stadium',
       'venue_Punjab Cricket Association IS Bindra Stadium, Mohali',
       'venue_Rajiv Gandhi International Stadium, Uppal',
       'venue_Saurashtra Cricket Association Stadium',
       'venue_Sawai Mansingh Stadium',
       'venue_Shaheed Veer Narayan Singh International Stadium',
       'venue_Sharjah Cricket Stadium', 'venue_Sheikh Zayed Stadium',
       'venue_Wankhede Stadium', 'toss_decision_bat', 'toss_decision_field',
       'batting_team_Chennai Super Kings', 'batting_team_Delhi Capitals

In [17]:
columns = []
for i in df.columns:
    i = re.sub('\s', '_', i)
    i = re.sub('\.', '', i)
    i = re.sub('\,', '', i)
    i = re.sub('\-', '_', i)
    columns.append(i)
df.columns = columns

In [18]:
df.columns

Index(['inning', 'over', 'ball', 'runs', 'wickets', 'runs_last_5_overs',
       'wickets_last_5_overs', 'final_score', 'year',
       'venue_Dr_YS_Rajasekhara_Reddy_ACA_VDCA_Cricket_Stadium',
       'venue_Dubai_International_Cricket_Stadium', 'venue_Eden_Gardens',
       'venue_Feroz_Shah_Kotla', 'venue_Green_Park',
       'venue_Holkar_Cricket_Stadium', 'venue_M_Chinnaswamy_Stadium',
       'venue_MA_Chidambaram_Stadium_Chepauk',
       'venue_Maharashtra_Cricket_Association_Stadium',
       'venue_Punjab_Cricket_Association_IS_Bindra_Stadium_Mohali',
       'venue_Rajiv_Gandhi_International_Stadium_Uppal',
       'venue_Saurashtra_Cricket_Association_Stadium',
       'venue_Sawai_Mansingh_Stadium',
       'venue_Shaheed_Veer_Narayan_Singh_International_Stadium',
       'venue_Sharjah_Cricket_Stadium', 'venue_Sheikh_Zayed_Stadium',
       'venue_Wankhede_Stadium', 'toss_decision_bat', 'toss_decision_field',
       'batting_team_Chennai_Super_Kings', 'batting_team_Delhi_Capitals',
   

In [19]:
df.head()

Unnamed: 0,inning,over,ball,runs,wickets,runs_last_5_overs,wickets_last_5_overs,final_score,year,venue_Dr_YS_Rajasekhara_Reddy_ACA_VDCA_Cricket_Stadium,venue_Dubai_International_Cricket_Stadium,venue_Eden_Gardens,venue_Feroz_Shah_Kotla,venue_Green_Park,venue_Holkar_Cricket_Stadium,venue_M_Chinnaswamy_Stadium,venue_MA_Chidambaram_Stadium_Chepauk,venue_Maharashtra_Cricket_Association_Stadium,venue_Punjab_Cricket_Association_IS_Bindra_Stadium_Mohali,venue_Rajiv_Gandhi_International_Stadium_Uppal,venue_Saurashtra_Cricket_Association_Stadium,venue_Sawai_Mansingh_Stadium,venue_Shaheed_Veer_Narayan_Singh_International_Stadium,venue_Sharjah_Cricket_Stadium,venue_Sheikh_Zayed_Stadium,venue_Wankhede_Stadium,toss_decision_bat,toss_decision_field,batting_team_Chennai_Super_Kings,batting_team_Delhi_Capitals,batting_team_Gujarat_Lions,batting_team_Kings_XI_Punjab,batting_team_Kolkata_Knight_Riders,batting_team_Mumbai_Indians,batting_team_Rajasthan_Royals,batting_team_Rising_Pune_Supergiant,batting_team_Royal_Challengers_Bangalore,batting_team_Sunrisers_Hyderabad,bowling_team_Chennai_Super_Kings,bowling_team_Delhi_Capitals,bowling_team_Gujarat_Lions,bowling_team_Kings_XI_Punjab,bowling_team_Kolkata_Knight_Riders,bowling_team_Mumbai_Indians,bowling_team_Rajasthan_Royals,bowling_team_Rising_Pune_Supergiant,bowling_team_Royal_Challengers_Bangalore,bowling_team_Sunrisers_Hyderabad
122434,1,0,1,1,0,1,0,121,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
122435,1,0,2,1,0,1,0,121,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
122436,1,0,3,3,0,3,0,121,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
122437,1,0,4,7,0,7,0,121,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
122438,1,0,5,8,0,8,0,121,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [20]:
df = df[['year', 'batting_team_Chennai_Super_Kings', 'batting_team_Delhi_Capitals', 'batting_team_Gujarat_Lions', 'batting_team_Kings_XI_Punjab', 
         'batting_team_Kolkata_Knight_Riders', 'batting_team_Mumbai_Indians', 'batting_team_Rajasthan_Royals', 'batting_team_Rising_Pune_Supergiant', 
         'batting_team_Royal_Challengers_Bangalore', 'batting_team_Sunrisers_Hyderabad', 'bowling_team_Chennai_Super_Kings', 'bowling_team_Delhi_Capitals', 
         'bowling_team_Gujarat_Lions', 'bowling_team_Kings_XI_Punjab', 'bowling_team_Kolkata_Knight_Riders', 'bowling_team_Mumbai_Indians', 'bowling_team_Rajasthan_Royals', 
         'bowling_team_Rising_Pune_Supergiant', 'bowling_team_Royal_Challengers_Bangalore', 'bowling_team_Sunrisers_Hyderabad', 'toss_decision_bat', 'toss_decision_field', 
         'venue_Dr_YS_Rajasekhara_Reddy_ACA_VDCA_Cricket_Stadium', 'venue_Dubai_International_Cricket_Stadium', 'venue_Eden_Gardens', 'venue_Feroz_Shah_Kotla', 
         'venue_Green_Park', 'venue_Holkar_Cricket_Stadium', 'venue_M_Chinnaswamy_Stadium', 'venue_MA_Chidambaram_Stadium_Chepauk', 
         'venue_Maharashtra_Cricket_Association_Stadium', 'venue_Punjab_Cricket_Association_IS_Bindra_Stadium_Mohali', 'venue_Rajiv_Gandhi_International_Stadium_Uppal', 
         'venue_Saurashtra_Cricket_Association_Stadium', 'venue_Sawai_Mansingh_Stadium', 'venue_Shaheed_Veer_Narayan_Singh_International_Stadium', 
         'venue_Sharjah_Cricket_Stadium', 'venue_Sheikh_Zayed_Stadium', 'venue_Wankhede_Stadium', 'inning', 'over', 'ball', 'runs', 'wickets', 'runs_last_5_overs', 
         'wickets_last_5_overs', 'final_score']]

In [21]:
train = df[df['year'] <= 2018]
test = df[df['year'] > 2018]

In [22]:
X_train = pd.DataFrame(train.drop(['year', 'final_score'], axis = 1), columns = train.drop(['year', 'final_score'], axis = 1).columns)
X_test = pd.DataFrame(test.drop(['year', 'final_score'], axis = 1), columns = test.drop(['year', 'final_score'], axis = 1).columns)
y_train = train['final_score']
y_test = test['final_score']

In [23]:
X_train.columns

Index(['batting_team_Chennai_Super_Kings', 'batting_team_Delhi_Capitals',
       'batting_team_Gujarat_Lions', 'batting_team_Kings_XI_Punjab',
       'batting_team_Kolkata_Knight_Riders', 'batting_team_Mumbai_Indians',
       'batting_team_Rajasthan_Royals', 'batting_team_Rising_Pune_Supergiant',
       'batting_team_Royal_Challengers_Bangalore',
       'batting_team_Sunrisers_Hyderabad', 'bowling_team_Chennai_Super_Kings',
       'bowling_team_Delhi_Capitals', 'bowling_team_Gujarat_Lions',
       'bowling_team_Kings_XI_Punjab', 'bowling_team_Kolkata_Knight_Riders',
       'bowling_team_Mumbai_Indians', 'bowling_team_Rajasthan_Royals',
       'bowling_team_Rising_Pune_Supergiant',
       'bowling_team_Royal_Challengers_Bangalore',
       'bowling_team_Sunrisers_Hyderabad', 'toss_decision_bat',
       'toss_decision_field',
       'venue_Dr_YS_Rajasekhara_Reddy_ACA_VDCA_Cricket_Stadium',
       'venue_Dubai_International_Cricket_Stadium', 'venue_Eden_Gardens',
       'venue_Feroz_Shah_K

In [24]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((42231, 46), (28803, 46), (42231,), (28803,))

In [25]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [46]:
y_train_pred_lr = lr.predict(X_train)
y_test_pred_lr = lr.predict(X_test)
print('Linear Regression:')
print('-' * 100)
print('Train r2_score: ', r2_score(y_train, y_train_pred_lr))
print('Test r2_score: ', r2_score(y_test, y_test_pred_lr))
print('-' * 100)
print('Train RMSE: ', np.sqrt(mean_squared_error(y_train, y_train_pred_lr)))
print('Test RMSE: ', np.sqrt(mean_squared_error(y_test, y_test_pred_lr)))
print('-' * 100)
print('Train MAPE: ', np.mean(mean_absolute_error(y_train, y_train_pred_lr) / y_train))
print('Test MAPE: ', np.mean(mean_absolute_error(y_test, y_test_pred_lr) / y_test))
error_lr = (np.mean(mean_absolute_error(y_test, y_test_pred_lr) / y_test) / 2)

Linear Regression:
----------------------------------------------------------------------------------------------------
Train r2_score:  0.44799524585401673
Test r2_score:  0.21869902452777445
----------------------------------------------------------------------------------------------------
Train RMSE:  21.89022296482985
Test RMSE:  25.641664851857062
----------------------------------------------------------------------------------------------------
Train MAPE:  0.102841688430274
Test MAPE:  0.12086034811698398


In [47]:
data_lr = pd.DataFrame()
data_lr['actual'] = y_test
data_lr['predicted'] = y_test_pred_lr
data_lr['lower range'] = y_test_pred_lr - (error_lr * y_test_pred_lr)
data_lr['upper range'] = y_test_pred_lr + (error_lr * y_test_pred_lr)
t_list = []
for i in data_lr.index:
    if data_lr['actual'][i] > data_lr['lower range'][i]:
        if data_lr['actual'][i] < data_lr['upper range'][i]:
            t_list.append('True')
        else:
            t_list.append('False')
    else:
        t_list.append('False')
data_lr['Bool'] = t_list
data_lr['Bool'].value_counts()/data_lr.shape[0]

False    0.624657
True     0.375343
Name: Bool, dtype: float64

In [30]:
la = Lasso()
possible_parameter_values = {'alpha' : [float(x)/1000 for x in np.arange(start = 1, stop = 1001, step = 1)]}
la_rscv = RandomizedSearchCV(estimator = la, param_distributions = possible_parameter_values, cv = 10, scoring = 'neg_mean_squared_error', verbose = 2, random_state = 17, 
                              n_jobs = 1)
la_rscv.fit(X_train, y_train)
la_rscv.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.2s
[CV] alpha=0.624 .....................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] ...................................... alpha=0.624, total=   0.2s
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.2s
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.2s
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.1s
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.2s
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.2s
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.2s
[CV] alpha=0.624 .....................................................
[CV] .

[CV] ...................................... alpha=0.314, total=   0.2s
[CV] alpha=0.753 .....................................................
[CV] ...................................... alpha=0.753, total=   0.1s
[CV] alpha=0.753 .....................................................
[CV] ...................................... alpha=0.753, total=   0.1s
[CV] alpha=0.753 .....................................................
[CV] ...................................... alpha=0.753, total=   0.2s
[CV] alpha=0.753 .....................................................
[CV] ...................................... alpha=0.753, total=   0.2s
[CV] alpha=0.753 .....................................................
[CV] ...................................... alpha=0.753, total=   0.2s
[CV] alpha=0.753 .....................................................
[CV] ...................................... alpha=0.753, total=   0.1s
[CV] alpha=0.753 .....................................................
[CV] .

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   21.9s finished


{'alpha': 0.407}

In [31]:
y_train_pred_la = la_rscv.predict(X_train)
y_test_pred_la = la_rscv.predict(X_test)
print('Lasso Regression:')
print('-' * 100)
print('Train r2_score: ', r2_score(y_train, y_train_pred_la))
print('Test r2_score: ', r2_score(y_test, y_test_pred_la))
print('-' * 100)
print('Train RMSE: ', np.sqrt(mean_squared_error(y_train, y_train_pred_la)))
print('Test RMSE: ', np.sqrt(mean_squared_error(y_test, y_test_pred_la)))
print('-' * 100)
print('Train MAPE: ', np.mean(mean_absolute_error(y_train, y_train_pred_la) / y_train))
print('Test MAPE: ', np.mean(mean_absolute_error(y_test, y_test_pred_la) / y_test))
error_la = (np.mean(mean_absolute_error(y_test, y_test_pred_la) / y_test) / 2)

Lasso Regression:
----------------------------------------------------------------------------------------------------
Train r2_score:  0.40633731647849447
Test r2_score:  0.40482695367255406
----------------------------------------------------------------------------------------------------
Train RMSE:  22.701191460709765
Test RMSE:  22.379926903231446
----------------------------------------------------------------------------------------------------
Train MAPE:  0.10567196631025814
Test MAPE:  0.10658317867546845


In [32]:
data_la = pd.DataFrame()
data_la['actual'] = y_test
data_la['predicted'] = y_test_pred_la
data_la['lower range'] = y_test_pred_la - (error_la * y_test_pred_la)
data_la['upper range'] = y_test_pred_la + (error_la * y_test_pred_la)
t_list = []
for i in data_la.index:
    if data_la['actual'][i] > data_la['lower range'][i]:
        if data_la['actual'][i] < data_la['upper range'][i]:
            t_list.append('True')
        else:
            t_list.append('False')
    else:
        t_list.append('False')
data_la['Bool'] = t_list
data_la['Bool'].value_counts()/data_la.shape[0]

False    0.643579
True     0.356421
Name: Bool, dtype: float64

In [33]:
ri = Ridge()
possible_parameter_values = {'alpha' : [float(x)/1000 for x in np.arange(start = 1, stop = 1001, step = 1)]}
ri_rscv = RandomizedSearchCV(estimator = ri, param_distributions = possible_parameter_values, cv = 10, scoring = 'neg_mean_squared_error', verbose = 2, random_state = 17, 
                              n_jobs = 1)
ri_rscv.fit(X_train, y_train)
ri_rscv.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.1s
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.1s
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.0s
[CV] alpha=0.624 .....................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ...................................... alpha=0.624, total=   0.0s
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.1s
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.0s
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.0s
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.1s
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.0s
[CV] alpha=0.624 .....................................................
[CV] ...................................... alpha=0.624, total=   0.1s
[CV] alpha=0.242 .....................................................
[CV] .

[CV] ...................................... alpha=0.753, total=   0.1s
[CV] alpha=0.753 .....................................................
[CV] ...................................... alpha=0.753, total=   0.1s
[CV] alpha=0.753 .....................................................
[CV] ...................................... alpha=0.753, total=   0.0s
[CV] alpha=0.753 .....................................................
[CV] ...................................... alpha=0.753, total=   0.1s
[CV] alpha=0.753 .....................................................
[CV] ...................................... alpha=0.753, total=   0.1s
[CV] alpha=0.753 .....................................................
[CV] ...................................... alpha=0.753, total=   0.0s
[CV] alpha=0.753 .....................................................
[CV] ...................................... alpha=0.753, total=   0.0s
[CV] alpha=0.753 .....................................................
[CV] .

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.7s finished


{'alpha': 0.942}

In [34]:
y_train_pred_ri = ri_rscv.predict(X_train)
y_test_pred_ri = ri_rscv.predict(X_test)
print('Ridge Regression:')
print('-' * 100)
print('Train r2_score: ', r2_score(y_train, y_train_pred_ri))
print('Test r2_score: ', r2_score(y_test, y_test_pred_ri))
print('-' * 100)
print('Train RMSE: ', np.sqrt(mean_squared_error(y_train, y_train_pred_ri)))
print('Test RMSE: ', np.sqrt(mean_squared_error(y_test, y_test_pred_ri)))
print('-' * 100)
print('Train MAPE: ', np.mean(mean_absolute_error(y_train, y_train_pred_ri) / y_train))
print('Test MAPE: ', np.mean(mean_absolute_error(y_test, y_test_pred_ri) / y_test))
error_ri = (np.mean(mean_absolute_error(y_test, y_test_pred_ri) / y_test) / 2)

Ridge Regression:
----------------------------------------------------------------------------------------------------
Train r2_score:  0.4479951951692921
Test r2_score:  0.21922802562797428
----------------------------------------------------------------------------------------------------
Train RMSE:  21.890223969802985
Test RMSE:  25.63298268828029
----------------------------------------------------------------------------------------------------
Train MAPE:  0.10284139284387749
Test MAPE:  0.1208283701613576


In [35]:
data_ri = pd.DataFrame()
data_ri['actual'] = y_test
data_ri['predicted'] = y_test_pred_ri
data_ri['lower range'] = y_test_pred_ri - (error_ri * y_test_pred_ri)
data_ri['upper range'] = y_test_pred_ri + (error_ri * y_test_pred_ri)
t_list = []
for i in data_ri.index:
    if data_ri['actual'][i] > data_ri['lower range'][i]:
        if data_ri['actual'][i] < data_ri['upper range'][i]:
            t_list.append('True')
        else:
            t_list.append('False')
    else:
        t_list.append('False')
data_ri['Bool'] = t_list
data_ri['Bool'].value_counts()/data_ri.shape[0]

False    0.624622
True     0.375378
Name: Bool, dtype: float64

In [34]:
#dtr = DecisionTreeRegressor()
#possible_parameter_values = {'criterion' : ['mse', 'friedman_mse', 'mae'],
#                             'splitter' : ['best', 'random'],
#                             'max_depth' : [int(x) for x in np.arange(start = 5, stop = 101, step = 5)],
#                             'min_samples_split' : [int(x) for x in np.arange(start = 5, stop = 101, step = 1)],
#                             'min_samples_leaf' : [int(x) for x in np.arange(start = 1, stop = 51, step = 1)]}
#dtr_rscv = RandomizedSearchCV(estimator = dtr, param_distributions = possible_parameter_values, cv = 10, scoring = 'neg_mean_squared_error', verbose = 2, random_state = 17, 
#                              n_jobs = 1)
#dtr_rscv.fit(X_train, y_train)
#dtr_rscv.best_params_

In [35]:
#y_train_pred_dtr = dtr_rscv.predict(X_train)
#y_test_pred_dtr = dtr_rscv.predict(X_test)
#print('Decision Tree Regressor:')
#print('-' * 100)
#print('Train r2_score: ', r2_score(y_train, y_train_pred_dtr))
#print('Test r2_score: ', r2_score(y_test, y_test_pred_dtr))
#print('-' * 100)
#print('Train RMSE: ', np.sqrt(mean_squared_error(y_train, y_train_pred_dtr)))
#print('Test RMSE: ', np.sqrt(mean_squared_error(y_test, y_test_pred_dtr)))
#print('-' * 100)
#print('Train MAPE: ', np.mean(mean_absolute_error(y_train, y_train_pred_dtr) / y_train))
#print('Test MAPE: ', np.mean(mean_absolute_error(y_test, y_test_pred_dtr) / y_test))
#error_dtr = np.mean(mean_absolute_error(y_test, y_test_pred_dtr) / y_test)

In [36]:
#data_dtr = pd.DataFrame()
#data_dtr['actual'] = y_test
#data_dtr['predicted'] = y_test_pred_dtr
#data_dtr['lower range'] = y_test_pred_dtr - (error_dtr * y_test_pred_dtr)
#data_dtr['upper range'] = y_test_pred_dtr + (error_dtr * y_test_pred_dtr)
#t_list = []
#for i in data_dtr.index:
#    if data_dtr['actual'][i] > data_dtr['lower range'][i]:
#        if data_dtr['actual'][i] < data_dtr['upper range'][i]:
#            t_list.append('True')
#        else:
#            t_list.append('False')
#    else:
#        t_list.append('False')
#data_dtr['Bool'] = t_list
#data_dtr['Bool'].value_counts()/data_dtr.shape[0]

In [37]:
#rfr = RandomForestRegressor()
#possible_parameter_values = {'n_estimators' : [int(x) for x in np.arange(start = 50, stop = 1001, step = 50)],
#                             'max_depth' : [int(x) for x in np.arange(start = 5, stop = 101, step = 5)],
#                             'min_samples_split' : [int(x) for x in np.arange(start = 5, stop = 101, step = 1)],
#                             'min_samples_leaf' : [int(x) for x in np.arange(start = 1, stop = 51, step = 1)]}
#rfr_rscv = RandomizedSearchCV(estimator = rfr, param_distributions = possible_parameter_values, cv = 10, scoring = 'neg_mean_squared_error', verbose = 2, random_state = 17, 
#                              n_jobs = 1)
#rfr_rscv.fit(X_train, y_train)
#rfr_rscv.best_params_

In [38]:
#y_train_pred_rfr = rfr_rscv.predict(X_train)
#y_test_pred_rfr = rfr_rscv.predict(X_test)
#print('Random Forest Regressor:')
#print('-' * 100)
#print('Train r2_score: ', r2_score(y_train, y_train_pred_rfr))
#print('Test r2_score: ', r2_score(y_test, y_test_pred_rfr))
#print('-' * 100)
#print('Train RMSE: ', np.sqrt(mean_squared_error(y_train, y_train_pred_rfr)))
#print('Test RMSE: ', np.sqrt(mean_squared_error(y_test, y_test_pred_rfr)))
#print('-' * 100)
#print('Train MAPE: ', np.mean(mean_absolute_error(y_train, y_train_pred_rfr) / y_train))
#print('Test MAPE: ', np.mean(mean_absolute_error(y_test, y_test_pred_rfr) / y_test))
#error_rfr = np.mean(mean_absolute_error(y_test, y_test_pred_rfr) / y_test)

In [39]:
#data_rfr = pd.DataFrame()
#data_rfr['actual'] = y_test
#data_rfr['predicted'] = y_test_pred_rfr
#data_rfr['lower range'] = y_test_pred_rfr - (error_rfr * y_test_pred_rfr)
#data_rfr['upper range'] = y_test_pred_rfr + (error_rfr * y_test_pred_rfr)
#t_list = []
#for i in data_rfr.index:
#    if data_rfr['actual'][i] > data_rfr['lower range'][i]:
#        if data_rfr['actual'][i] < data_rfr['upper range'][i]:
#            t_list.append('True')
#        else:
#            t_list.append('False')
#    else:
#        t_list.append('False')
#data_rfr['Bool'] = t_list
#data_rfr['Bool'].value_counts()/data_rfr.shape[0]

In [36]:
abr = AdaBoostRegressor()
possible_parameter_values = {'n_estimators' : [int(x) for x in np.arange(start = 50, stop = 1001, step = 50)],
                             'learning_rate' : [float(x)/1000 for x in np.arange(start = 1, stop = 1001, step = 1)]}
abr_rscv = RandomizedSearchCV(estimator = abr, param_distributions = possible_parameter_values, cv = 10, scoring = 'neg_mean_squared_error', verbose = 2, random_state = 17, 
                              n_jobs = 1)
abr_rscv.fit(X_train, y_train)
abr_rscv.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] n_estimators=200, learning_rate=0.544 ...........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............ n_estimators=200, learning_rate=0.544, total=   3.1s
[CV] n_estimators=200, learning_rate=0.544 ...........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.0s remaining:    0.0s


[CV] ............ n_estimators=200, learning_rate=0.544, total=   2.2s
[CV] n_estimators=200, learning_rate=0.544 ...........................
[CV] ............ n_estimators=200, learning_rate=0.544, total=   2.7s
[CV] n_estimators=200, learning_rate=0.544 ...........................
[CV] ............ n_estimators=200, learning_rate=0.544, total=   2.4s
[CV] n_estimators=200, learning_rate=0.544 ...........................
[CV] ............ n_estimators=200, learning_rate=0.544, total=   2.4s
[CV] n_estimators=200, learning_rate=0.544 ...........................
[CV] ............ n_estimators=200, learning_rate=0.544, total=   2.9s
[CV] n_estimators=200, learning_rate=0.544 ...........................
[CV] ............ n_estimators=200, learning_rate=0.544, total=   2.8s
[CV] n_estimators=200, learning_rate=0.544 ...........................
[CV] ............ n_estimators=200, learning_rate=0.544, total=   2.6s
[CV] n_estimators=200, learning_rate=0.544 ...........................
[CV] .

[CV] ............ n_estimators=250, learning_rate=0.229, total=   8.4s
[CV] n_estimators=650, learning_rate=0.713 ...........................
[CV] ............ n_estimators=650, learning_rate=0.713, total=   1.7s
[CV] n_estimators=650, learning_rate=0.713 ...........................
[CV] ............ n_estimators=650, learning_rate=0.713, total=   2.0s
[CV] n_estimators=650, learning_rate=0.713 ...........................
[CV] ............ n_estimators=650, learning_rate=0.713, total=   3.0s
[CV] n_estimators=650, learning_rate=0.713 ...........................
[CV] ............ n_estimators=650, learning_rate=0.713, total=   2.4s
[CV] n_estimators=650, learning_rate=0.713 ...........................
[CV] ............ n_estimators=650, learning_rate=0.713, total=   2.5s
[CV] n_estimators=650, learning_rate=0.713 ...........................
[CV] ............ n_estimators=650, learning_rate=0.713, total=   2.6s
[CV] n_estimators=650, learning_rate=0.713 ...........................
[CV] .

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 12.9min finished


{'n_estimators': 350, 'learning_rate': 0.021}

In [37]:
y_train_pred_abr = abr_rscv.predict(X_train)
y_test_pred_abr = abr_rscv.predict(X_test)
print('Ada Boost Regressor:')
print('-' * 100)
print('Train r2_score: ', r2_score(y_train, y_train_pred_abr))
print('Test r2_score: ', r2_score(y_test, y_test_pred_abr))
print('-' * 100)
print('Train RMSE: ', np.sqrt(mean_squared_error(y_train, y_train_pred_abr)))
print('Test RMSE: ', np.sqrt(mean_squared_error(y_test, y_test_pred_abr)))
print('-' * 100)
print('Train MAPE: ', np.mean(mean_absolute_error(y_train, y_train_pred_abr) / y_train))
print('Test MAPE: ', np.mean(mean_absolute_error(y_test, y_test_pred_abr) / y_test))
error_abr = (np.mean(mean_absolute_error(y_test, y_test_pred_abr) / y_test) / 2)

Ada Boost Regressor:
----------------------------------------------------------------------------------------------------
Train r2_score:  0.28982585228982705
Test r2_score:  0.27012317276344444
----------------------------------------------------------------------------------------------------
Train RMSE:  24.829113025097474
Test RMSE:  24.783453503284647
----------------------------------------------------------------------------------------------------
Train MAPE:  0.12298852315817947
Test MAPE:  0.12201728177681342


In [52]:
error_abr

0.06100864088840671

In [38]:
data_abr = pd.DataFrame()
data_abr['actual'] = y_test
data_abr['predicted'] = y_test_pred_abr
data_abr['lower range'] = y_test_pred_abr - (error_abr * y_test_pred_abr)
data_abr['upper range'] = y_test_pred_abr + (error_abr * y_test_pred_abr)
t_list = []
for i in data_abr.index:
    if data_abr['actual'][i] > data_abr['lower range'][i]:
        if data_abr['actual'][i] < data_abr['upper range'][i]:
            t_list.append('True')
        else:
            t_list.append('False')
    else:
        t_list.append('False')
data_abr['Bool'] = t_list
data_abr['Bool'].value_counts()/data_abr.shape[0]

False    0.664549
True     0.335451
Name: Bool, dtype: float64

In [48]:
knr = KNeighborsRegressor()
possible_parameter_values = {'n_neighbors' : [int(x) for x in np.arange(start = 1, stop = 11, step = 1)],
                             'weights' : ['uniform', 'distance']}
knr_rscv = RandomizedSearchCV(estimator = knr, param_distributions = possible_parameter_values, cv = 10, scoring = 'neg_mean_squared_error', verbose = 2, random_state = 17, 
                              n_jobs = 1)
knr_rscv.fit(X_train, y_train)
knr_rscv.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] weights=uniform, n_neighbors=8 ..................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... weights=uniform, n_neighbors=8, total=   0.8s
[CV] weights=uniform, n_neighbors=8 ..................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV] ................... weights=uniform, n_neighbors=8, total=   0.7s
[CV] weights=uniform, n_neighbors=8 ..................................
[CV] ................... weights=uniform, n_neighbors=8, total=   0.7s
[CV] weights=uniform, n_neighbors=8 ..................................
[CV] ................... weights=uniform, n_neighbors=8, total=   0.7s
[CV] weights=uniform, n_neighbors=8 ..................................
[CV] ................... weights=uniform, n_neighbors=8, total=   0.8s
[CV] weights=uniform, n_neighbors=8 ..................................
[CV] ................... weights=uniform, n_neighbors=8, total=   0.7s
[CV] weights=uniform, n_neighbors=8 ..................................
[CV] ................... weights=uniform, n_neighbors=8, total=   0.7s
[CV] weights=uniform, n_neighbors=8 ..................................
[CV] ................... weights=uniform, n_neighbors=8, total=   0.7s
[CV] weights=uniform, n_neighbors=8 ..................................
[CV] .

[CV] ................... weights=uniform, n_neighbors=5, total=   0.7s
[CV] weights=uniform, n_neighbors=7 ..................................
[CV] ................... weights=uniform, n_neighbors=7, total=   0.7s
[CV] weights=uniform, n_neighbors=7 ..................................
[CV] ................... weights=uniform, n_neighbors=7, total=   0.7s
[CV] weights=uniform, n_neighbors=7 ..................................
[CV] ................... weights=uniform, n_neighbors=7, total=   0.7s
[CV] weights=uniform, n_neighbors=7 ..................................
[CV] ................... weights=uniform, n_neighbors=7, total=   0.7s
[CV] weights=uniform, n_neighbors=7 ..................................
[CV] ................... weights=uniform, n_neighbors=7, total=   0.7s
[CV] weights=uniform, n_neighbors=7 ..................................
[CV] ................... weights=uniform, n_neighbors=7, total=   0.7s
[CV] weights=uniform, n_neighbors=7 ..................................
[CV] .

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.2min finished


{'weights': 'uniform', 'n_neighbors': 8}

In [49]:
y_train_pred_knr = knr_rscv.predict(X_train)
y_test_pred_knr = knr_rscv.predict(X_test)
print('K Neighbors Regressor:')
print('-' * 100)
print('Train r2_score: ', r2_score(y_train, y_train_pred_knr))
print('Test r2_score: ', r2_score(y_test, y_test_pred_knr))
print('-' * 100)
print('Train RMSE: ', np.sqrt(mean_squared_error(y_train, y_train_pred_knr)))
print('Test RMSE: ', np.sqrt(mean_squared_error(y_test, y_test_pred_knr)))
print('-' * 100)
print('Train MAPE: ', np.mean(mean_absolute_error(y_train, y_train_pred_knr) / y_train))
print('Test MAPE: ', np.mean(mean_absolute_error(y_test, y_test_pred_knr) / y_test))
error_knr = (np.mean(mean_absolute_error(y_test, y_test_pred_knr) / y_test) / 2)

K Neighbors Regressor:
----------------------------------------------------------------------------------------------------
Train r2_score:  0.6607275192843649
Test r2_score:  0.275552754445054
----------------------------------------------------------------------------------------------------
Train RMSE:  17.16141498489965
Test RMSE:  24.691098895226503
----------------------------------------------------------------------------------------------------
Train MAPE:  0.07843920673906425
Test MAPE:  0.11754539317402085


In [50]:
data_knr = pd.DataFrame()
data_knr['actual'] = y_test
data_knr['predicted'] = y_test_pred_knr
data_knr['lower range'] = y_test_pred_knr - (error_knr * y_test_pred_knr)
data_knr['upper range'] = y_test_pred_knr + (error_knr * y_test_pred_knr)
t_list = []
for i in data_knr.index:
    if data_knr['actual'][i] > data_knr['lower range'][i]:
        if data_knr['actual'][i] < data_knr['upper range'][i]:
            t_list.append('True')
        else:
            t_list.append('False')
    else:
        t_list.append('False')
data_knr['Bool'] = t_list
data_knr['Bool'].value_counts()/data_knr.shape[0]

False    0.63476
True     0.36524
Name: Bool, dtype: float64

In [46]:
#svr = SVR()
#possible_parameter_values = {'gamma' : [float(x)/10000 for x in range(100001)],
#                             'C' : [float(x)/10 for x in range(1001)]}
#svr_rscv = RandomizedSearchCV(estimator = svr, param_distributions = possible_parameter_values, cv = 10, scoring = 'neg_mean_squared_error', verbose = 2, random_state = 17, 
#                              n_jobs = 1)
#svr_rscv.fit(X_train, y_train)
#svr_rscv.best_params_

In [47]:
#y_train_pred_svr = svr_rscv.predict(X_train)
#y_test_pred_svr = svr_rscv.predict(X_test)
#print('Support Vector Regressor:')
#print('-' * 100)
#print('Train r2_score: ', r2_score(y_train, y_train_pred_svr))
#print('Test r2_score: ', r2_score(y_test, y_test_pred_svr))
#print('-' * 100)
#print('Train RMSE: ', np.sqrt(mean_squared_error(y_train, y_train_pred_svr)))
#print('Test RMSE: ', np.sqrt(mean_squared_error(y_test, y_test_pred_svr)))
#print('-' * 100)
#print('Train MAPE: ', np.mean(mean_absolute_error(y_train, y_train_pred_svr) / y_train))
#print('Test MAPE: ', np.mean(mean_absolute_error(y_test, y_test_pred_svr) / y_test))
#error_svr = np.mean(mean_absolute_error(y_test, y_test_pred_svr) / y_test)

In [48]:
#data_svr = pd.DataFrame()
#data_svr['actual'] = y_test
#data_svr['predicted'] = y_test_pred_svr
#data_svr['lower range'] = y_test_pred_svr - (error_svr * y_test_pred_svr)
#data_svr['upper range'] = y_test_pred_svr + (error_svr * y_test_pred_svr)
#t_list = []
#for i in data_svr.index:
#    if data_svr['actual'][i] > data_svr['lower range'][i]:
#        if data_svr['actual'][i] < data_svr['upper range'][i]:
#            t_list.append('True')
#        else:
#            t_list.append('False')
#    else:
#        t_list.append('False')
#data_svr['Bool'] = t_list
#data_svr['Bool'].value_counts()/data_svr.shape[0]

In [49]:
xgbr = XGBRegressor()
possible_parameter_values = {'n_estimators' : [int(x) for x in np.arange(start = 50, stop = 1001, step = 50)], 
                             'use_label_encoder' : [True, False], 
                             'max_depth' : [int(x) for x in np.arange(start = 5, stop = 101, step = 5)], 
                             'learning_rate' : [float(x)/1000 for x in np.arange(start = 1, stop = 1001, step = 1)]}
xgbr_rscv = RandomizedSearchCV(estimator = xgbr, param_distributions = possible_parameter_values, cv = 10, scoring = 'neg_mean_squared_error', verbose = 2, 
                               random_state = 17, n_jobs = 1)
xgbr_rscv.fit(X_train, y_train)
xgbr_rscv.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] use_label_encoder=False, n_estimators=50, max_depth=35, learning_rate=0.737 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  use_label_encoder=False, n_estimators=50, max_depth=35, learning_rate=0.737, total=   7.1s
[CV] use_label_encoder=False, n_estimators=50, max_depth=35, learning_rate=0.737 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.0s remaining:    0.0s


[CV]  use_label_encoder=False, n_estimators=50, max_depth=35, learning_rate=0.737, total=   5.9s
[CV] use_label_encoder=False, n_estimators=50, max_depth=35, learning_rate=0.737 
[CV]  use_label_encoder=False, n_estimators=50, max_depth=35, learning_rate=0.737, total=   5.9s
[CV] use_label_encoder=False, n_estimators=50, max_depth=35, learning_rate=0.737 
[CV]  use_label_encoder=False, n_estimators=50, max_depth=35, learning_rate=0.737, total=   6.1s
[CV] use_label_encoder=False, n_estimators=50, max_depth=35, learning_rate=0.737 
[CV]  use_label_encoder=False, n_estimators=50, max_depth=35, learning_rate=0.737, total=   6.0s
[CV] use_label_encoder=False, n_estimators=50, max_depth=35, learning_rate=0.737 
[CV]  use_label_encoder=False, n_estimators=50, max_depth=35, learning_rate=0.737, total=   5.8s
[CV] use_label_encoder=False, n_estimators=50, max_depth=35, learning_rate=0.737 
[CV]  use_label_encoder=False, n_estimators=50, max_depth=35, learning_rate=0.737, total=   5.9s
[CV] use

[CV]  use_label_encoder=False, n_estimators=750, max_depth=15, learning_rate=0.971, total=   7.2s
[CV] use_label_encoder=False, n_estimators=750, max_depth=15, learning_rate=0.971 
[CV]  use_label_encoder=False, n_estimators=750, max_depth=15, learning_rate=0.971, total=   7.4s
[CV] use_label_encoder=False, n_estimators=750, max_depth=15, learning_rate=0.971 
[CV]  use_label_encoder=False, n_estimators=750, max_depth=15, learning_rate=0.971, total=   7.2s
[CV] use_label_encoder=True, n_estimators=400, max_depth=50, learning_rate=0.646 
[CV]  use_label_encoder=True, n_estimators=400, max_depth=50, learning_rate=0.646, total=   8.0s
[CV] use_label_encoder=True, n_estimators=400, max_depth=50, learning_rate=0.646 
[CV]  use_label_encoder=True, n_estimators=400, max_depth=50, learning_rate=0.646, total=   8.7s
[CV] use_label_encoder=True, n_estimators=400, max_depth=50, learning_rate=0.646 
[CV]  use_label_encoder=True, n_estimators=400, max_depth=50, learning_rate=0.646, total=   9.1s
[CV

[CV]  use_label_encoder=True, n_estimators=750, max_depth=75, learning_rate=0.1, total= 1.1min
[CV] use_label_encoder=True, n_estimators=750, max_depth=75, learning_rate=0.1 
[CV]  use_label_encoder=True, n_estimators=750, max_depth=75, learning_rate=0.1, total= 1.0min
[CV] use_label_encoder=True, n_estimators=750, max_depth=75, learning_rate=0.1 
[CV]  use_label_encoder=True, n_estimators=750, max_depth=75, learning_rate=0.1, total= 1.1min
[CV] use_label_encoder=True, n_estimators=750, max_depth=75, learning_rate=0.1 
[CV]  use_label_encoder=True, n_estimators=750, max_depth=75, learning_rate=0.1, total= 1.0min
[CV] use_label_encoder=True, n_estimators=750, max_depth=75, learning_rate=0.1 
[CV]  use_label_encoder=True, n_estimators=750, max_depth=75, learning_rate=0.1, total= 1.0min
[CV] use_label_encoder=True, n_estimators=750, max_depth=75, learning_rate=0.1 
[CV]  use_label_encoder=True, n_estimators=750, max_depth=75, learning_rate=0.1, total= 1.1min
[CV] use_label_encoder=True, n

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 25.1min finished


{'use_label_encoder': True,
 'n_estimators': 50,
 'max_depth': 15,
 'learning_rate': 0.158}

In [50]:
y_train_pred_xgbr = xgbr_rscv.predict(X_train)
y_test_pred_xgbr = xgbr_rscv.predict(X_test)
print('XGBoost Regressor:')
print('-' * 100)
print('Train r2_score: ', r2_score(y_train, y_train_pred_xgbr))
print('Test r2_score: ', r2_score(y_test, y_test_pred_xgbr))
print('-' * 100)
print('Train RMSE: ', np.sqrt(mean_squared_error(y_train, y_train_pred_xgbr)))
print('Test RMSE: ', np.sqrt(mean_squared_error(y_test, y_test_pred_xgbr)))
print('-' * 100)
print('Train MAPE: ', np.mean(mean_absolute_error(y_train, y_train_pred_xgbr) / y_train))
print('Test MAPE: ', np.mean(mean_absolute_error(y_test, y_test_pred_xgbr) / y_test))
error_xgbr = np.mean(mean_absolute_error(y_test, y_test_pred_xgbr) / y_test)

XGBoost Regressor:
----------------------------------------------------------------------------------------------------
Train r2_score:  0.9911279076048587
Test r2_score:  0.004897164918433616
----------------------------------------------------------------------------------------------------
Train RMSE:  2.775184134805433
Test RMSE:  28.938166958722093
----------------------------------------------------------------------------------------------------
Train MAPE:  0.009858027822225404
Test MAPE:  0.13397964417977942


In [51]:
data_xgbr = pd.DataFrame()
data_xgbr['actual'] = y_test
data_xgbr['predicted'] = y_test_pred_xgbr
data_xgbr['lower range'] = y_test_pred_xgbr - (error_xgbr * y_test_pred_xgbr)
data_xgbr['upper range'] = y_test_pred_xgbr + (error_xgbr * y_test_pred_xgbr)
t_list = []
for i in data_xgbr.index:
    if data_xgbr['actual'][i] > data_xgbr['lower range'][i]:
        if data_xgbr['actual'][i] < data_xgbr['upper range'][i]:
            t_list.append('True')
        else:
            t_list.append('False')
    else:
        t_list.append('False')
data_xgbr['Bool'] = t_list
data_xgbr['Bool'].value_counts()/data_xgbr.shape[0]

True     0.641947
False    0.358053
Name: Bool, dtype: float64

In [53]:
file = open(r'D:\\IPL\\ipl_score_prediction.pkl', 'wb')
pickle.dump(abr_rscv, file)
file.close()

In [59]:
'batting_team_Chennai_Super_Kings', 'batting_team_Delhi_Capitals',
'batting_team_Gujarat_Lions', 'batting_team_Kings_XI_Punjab',
'batting_team_Kolkata_Knight_Riders', 'batting_team_Mumbai_Indians',
'batting_team_Rajasthan_Royals', 'batting_team_Rising_Pune_Supergiant',
'batting_team_Royal_Challengers_Bangalore',
'batting_team_Sunrisers_Hyderabad', 'bowling_team_Chennai_Super_Kings',
'bowling_team_Delhi_Capitals', 'bowling_team_Gujarat_Lions',
'bowling_team_Kings_XI_Punjab', 'bowling_team_Kolkata_Knight_Riders',
'bowling_team_Mumbai_Indians', 'bowling_team_Rajasthan_Royals',
'bowling_team_Rising_Pune_Supergiant',
'bowling_team_Royal_Challengers_Bangalore',
'bowling_team_Sunrisers_Hyderabad', 'toss_decision_bat',
'toss_decision_field',
'venue_Dr_YS_Rajasekhara_Reddy_ACA_VDCA_Cricket_Stadium',
'venue_Dubai_International_Cricket_Stadium', 'venue_Eden_Gardens',
'venue_Feroz_Shah_Kotla', 'venue_Green_Park',
'venue_Holkar_Cricket_Stadium', 'venue_M_Chinnaswamy_Stadium',
'venue_MA_Chidambaram_Stadium_Chepauk',
'venue_Maharashtra_Cricket_Association_Stadium',
'venue_Punjab_Cricket_Association_IS_Bindra_Stadium_Mohali',
'venue_Rajiv_Gandhi_International_Stadium_Uppal',
'venue_Saurashtra_Cricket_Association_Stadium',
'venue_Sawai_Mansingh_Stadium',
'venue_Shaheed_Veer_Narayan_Singh_International_Stadium',
'venue_Sharjah_Cricket_Stadium', 'venue_Sheikh_Zayed_Stadium',
'venue_Wankhede_Stadium', 'inning', 'over', 'ball', 'runs', 'wickets',
'runs_last_5_overs', 'wickets_last_5_overs'

('runs_last_5_overs', 'wickets_last_5_overs')

In [60]:
arr = np.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, 30, 1, 30, 1])

In [61]:
arr = arr.reshape(1, -1)

In [62]:
s = knr_rscv.predict(arr)[0]
s

165.5

In [63]:
s - (0.09185259348772307 * s)

150.29839577778182

In [64]:
s + (0.09185259348772307 * s)

180.70160422221818

In [31]:
int(5.3/1)

5

In [40]:
np.round((5.3%1)*10, 0)

3.0

In [35]:
np.round(7.4%1, 1)

0.4