# MLR model for predicting the auction price of IPL players

In [None]:
import warnings
# Silence every warning for the rest of the session. This keeps cell output tidy,
# but it also hides deprecation and other library warnings.
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
# precision=2: print floats in NumPy arrays with 2 decimal digits (NumPy default is 8).
# linewidth=100: wrap printed arrays at 100 characters per line (NumPy default is 75).
np.set_printoptions(precision=2, linewidth=100)

In [None]:
import statsmodels.api as sm
#statsmodels provides classes and functions for the estimation 
#of many different statistical models

In [None]:
#sklearn features various classification, regression and clustering algorithms
from sklearn.model_selection import train_test_split
#for splitting data arrays into two subsets: for training data and for testing data.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline
#%matplotlib inline makes your plot outputs appear and be stored within the notebook.

### Loading the dataset

In [None]:
# Dataset source:
# https://github.com/Foridur3210/IPL-Dataset-Player-price-prediction/blob/master/IPL%20IMB381IPL2013.csv
# The CSV is read through a relative path, so it must be in the notebook's working directory.
ipl_df = pd.read_csv( 'IPL IMB381IPL2013.csv' )

In [None]:
# (number of rows, number of columns) of the loaded data
ipl_df.shape

In [None]:
# Column names, non-null counts and dtypes
ipl_df.info()

In [None]:
# First five rows, to eyeball the raw values
ipl_df.head()

In [None]:
# Every column name available in the raw data
ipl_df.columns

In [None]:
# Candidate predictor columns, listed explicitly. 'SOLD PRICE' (the response
# variable used later) is deliberately not part of this list.
X_features = [
    'AGE', 'COUNTRY', 'PLAYING ROLE',
    'T-RUNS', 'T-WKTS', 'ODI-RUNS-S', 'ODI-SR-B',
    'ODI-WKTS', 'ODI-SR-BL', 'CAPTAINCY EXP', 'RUNS-S',
    'HS', 'AVE', 'SR-B', 'SIXERS', 'RUNS-C', 'WKTS',
    'AVE-BL', 'ECON', 'SR-BL',
]

In [None]:
# Show the selected predictor names
X_features

### Encoding using dummy variables

In [None]:
# Distinct AGE values (AGE is dummy-encoded as a categorical feature below)
ipl_df['AGE'].unique()

In [None]:
# Distinct COUNTRY values (dummy-encoded as a categorical feature below)
ipl_df['COUNTRY'].unique()

In [None]:
# Distinct PLAYING ROLE values (dummy-encoded as a categorical feature below)
ipl_df['PLAYING ROLE'].unique()

In [None]:
# Distinct CAPTAINCY EXP values (dummy-encoded as a categorical feature below)
ipl_df['CAPTAINCY EXP'].unique()

In [None]:
# Preview: one indicator column per PLAYING ROLE level, first ten players only.
pd.get_dummies(ipl_df['PLAYING ROLE']).head(10)

In [None]:
# Columns treated as categorical and expanded into 0/1 indicator columns.
category_features = ['AGE', 'COUNTRY', 'PLAYING ROLE', 'CAPTAINCY EXP']
# drop_first=True drops one level per feature, which becomes the reference level.
# dtype=int keeps the indicators numeric: pandas >= 2.0 produces boolean dummies
# by default, and mixing those with the numeric columns turns the design matrix
# into an object array that statsmodels' OLS refuses to fit.
ipl_encoded_df = pd.get_dummies(
    ipl_df[X_features],
    columns=category_features,
    drop_first=True,
    dtype=int,
)
ipl_encoded_df.columns

In [None]:
# First five rows of the dummy-encoded feature frame
ipl_encoded_df.head()

### Dividing data into training and test sets

`train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)`: when given as a float, `test_size` (or `train_size`) should be between 0.0 and 1.0 and represents the proportion of the dataset to include in the test (or train) split. Only one of the two needs to be specified.
If `train_size` is 0.8, `test_size` is set to 0.2. If both are `None`, `test_size` (`train_size`) defaults to 0.25 (0.75).
Passing an int as `random_state` makes the split reproducible.

In [None]:
# Names of the encoded predictors, captured before the intercept column is added.
X_features = ipl_encoded_df.columns
# Add a constant column so OLS can estimate an intercept.
X = sm.add_constant(ipl_encoded_df)
# Response (dependent) variable.
Y = ipl_df['SOLD PRICE']
# 80/20 train/test split (train_size would default to 0.75 if neither size were given).
# random_state=41 is the seed that makes the shuffle reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=41,
)

In [None]:
# Baseline model: ordinary least squares on every column of the training design matrix.
ipl_model1 = sm.OLS(endog=train_y, exog=train_X).fit()
ipl_model1.summary2()

### Handling Multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def compute_vif_factors(X):
    """Compute the variance inflation factor (VIF) of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Design matrix whose columns are the candidate predictors. Every
        column must be convertible to float (numeric or boolean).

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` with two columns: ``'column'`` (the
        predictor name) and ``'vif'`` (its variance inflation factor).
    """
    # Convert once, up front, rather than once per column. Forcing float also
    # lets boolean indicator columns through; left as-is they would make the
    # array object-typed.
    design = X.to_numpy(dtype=float)
    return pd.DataFrame({
        'column': X.columns,
        'vif': [
            variance_inflation_factor(design, position)
            for position in range(design.shape[1])
        ],
    })

In [None]:
# VIF of every encoded predictor. X_features was captured before sm.add_constant,
# so the constant column is not part of this calculation.
vif_factors = compute_vif_factors( X[X_features] )
vif_factors

In [None]:
# Predictors with a VIF above 4 are the multicollinearity suspects.
large_vif_features = vif_factors.loc[vif_factors['vif'] > 4, 'column']
# Pairwise correlations among just those predictors.
plt.figure(figsize=(12, 10))
sn.heatmap(data=X[large_vif_features].corr(), annot=True)
plt.title("Heatmap to check multicollinearity");

In [None]:
# Predictors dropped to bring the remaining VIF values down.
columns_removed = ['T-RUNS', 'T-WKTS', 'RUNS-S', 'HS', 'AVE', 'RUNS-C', 'SR-B', 'AVE-BL',
                   'ECON', 'ODI-SR-B', 'ODI-RUNS-S', 'AGE_2', 'SR-BL']
# Filter in the original column order. A set difference would return the names
# in an order that can change between kernel sessions (string hashing is
# randomised), which reshuffles the rows of every later VIF table and model summary.
X_revised_features = [col for col in X_features if col not in columns_removed]

In [None]:
# Recompute the VIFs on the reduced feature set
compute_vif_factors( X[X_revised_features] )

All VIF values are less than 4.
Now build a new model with these features.

In [None]:
# Refit on the reduced feature set.
# NOTE(review): X_revised_features is derived from the encoded column names and
# has no constant column, so this model is fitted without an intercept — confirm
# that is intended.
train_X = train_X.loc[:, X_revised_features]
ipl_model2 = sm.OLS(endog=train_y, exog=train_X).fit()
ipl_model2.summary2()

In [None]:
# Narrow the design matrix to four hand-picked predictors
# (presumably those significant in the previous model's summary — verify).
significant_features = ['COUNTRY_IND', 'COUNTRY_ENG', 'SIXERS', 'ODI-WKTS']
train_X = train_X.loc[:, significant_features]
ipl_model3 = sm.OLS(endog=train_y, exog=train_X).fit()
ipl_model3.summary2()

### Test for Normality

In [None]:
# P-P plot of model 3's residuals against a fitted normal distribution.
ppplot = sm.ProbPlot(ipl_model3.resid, fit=True)
# Create the axes ourselves and hand them over: when ax is None, ppplot() opens
# a new figure of its own, which would leave the 8x6 figure empty and draw the
# plot at the default size instead.
fig, ax = plt.subplots(figsize=(8, 6))
ppplot.ppplot(line='45', ax=ax)
ax.set_title("Normal p-p plot")
plt.show()

### Test for Homoscedasticity

In [None]:
# Residuals vs. fitted values for model 3. Both series are z-scored
# ((value - mean) / std) so the data actually matches the "Standardized" axis labels.
plt.scatter(
    (ipl_model3.fittedvalues - ipl_model3.fittedvalues.mean()) / ipl_model3.fittedvalues.std(),
    (ipl_model3.resid - ipl_model3.resid.mean()) / ipl_model3.resid.std(),
    marker="o",
)
plt.xlabel("Standardized predicted values")
plt.ylabel("Standardized residual values");

### Test for Autocorrelation between error terms

### Making the Predictions and Measurements

In [None]:
# ipl_model3 was fitted on the untransformed SOLD PRICE, so its predictions are
# already on the price scale. Squaring them is only valid for a model fitted on
# the square root of the price.
pred_y = ipl_model3.predict( test_X[train_X.columns] )

In [None]:
from sklearn import metrics
import numpy
# RMSE on the hold-out set. The function is reached through `metrics` because
# the bare name mean_squared_error is only imported in a later cell, so calling
# it here raised NameError on a fresh kernel.
np.sqrt(metrics.mean_squared_error(test_y, pred_y))

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
# r2_score expects (y_true, y_pred) and is not symmetric, so the observed prices
# go first. The sign is kept: a negative R² means the model does worse on the
# hold-out set than predicting the mean, and must not be hidden by abs().
r2_score(test_y, pred_y)

### Detecting Influential Observations

In [None]:
# shape is (rows, columns): n observations and k columns in the current design matrix.
n, k = train_X.shape
print( "Number of variables:", k, " and number of observations:", n)

In [None]:
# Cut-off for flagging high-leverage observations: three times (k + 1)/n.
# NOTE(review): the "+ 1" counts an intercept, but train_X at this point holds
# only the selected predictors and no constant column — confirm whether 3*k/n
# was intended.
leverage_threshold = 3*((k + 1)/n)
print( "Threshold for leverage value: ", round(leverage_threshold, 3) )

In [None]:
from statsmodels.graphics.regressionplots import influence_plot
# Influence plot for model 3, drawn on an explicitly sized axes.
fig, ax = plt.subplots(figsize=(8, 6))
influence_plot(ipl_model3, ax=ax)
ax.set_title("Leverage Value Vs Residuals")
plt.show()

In [None]:
# Inspect the players at row labels 26, 58 and 83
# (presumably the points flagged in the influence plot — verify).
ipl_df.loc[ipl_df.index.isin([26, 58, 83])]

In [None]:
# Remove the three inspected observations from the training data.
# drop() raises KeyError if any of these labels is not in the training split.
train_X_revised = train_X.drop(index=[26, 58, 83])
train_y_revised = train_y.drop(index=[26, 58, 83])

In [None]:
# From here on the training data is the version without the dropped observations.
train_X, train_y = train_X_revised, train_y_revised
ipl_model4 = sm.OLS(endog=train_y, exog=train_X).fit()
ipl_model4.summary2()

In [None]:
# ipl_model4 is fitted on the untransformed SOLD PRICE (the square-root
# transform is applied only later), so its predictions are already on the price
# scale and must not be squared.
pred_y = ipl_model4.predict( test_X[train_X.columns] )

In [None]:
import numpy
# RMSE of the current predictions against the hold-out prices.
np.sqrt(mean_squared_error(y_true=test_y, y_pred=pred_y))

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
# r2_score expects (y_true, y_pred) and is not symmetric, so the observed prices
# go first. The sign is kept: a negative R² means the model does worse on the
# hold-out set than predicting the mean, and must not be hidden by abs().
r2_score(test_y, pred_y)

### Transforming the Response Variable

In [None]:
# Replace the training response with its square root.
# Run this cell exactly once: it overwrites train_y, so running it again takes
# the root of the already-transformed values.
train_y = np.sqrt( train_y )

In [None]:
# Refit on the same predictors with the square-root-transformed response.
ipl_model5 = sm.OLS(endog=train_y, exog=train_X).fit()
ipl_model5.summary2()

In [None]:
# P-P plot of model 5's residuals against a fitted normal distribution.
ppplot = sm.ProbPlot(ipl_model5.resid, fit=True)
# Create the axes ourselves and hand them over: when ax is None, ppplot() opens
# a new figure of its own, which would leave the 8x6 figure empty and draw the
# plot at the default size instead.
fig, ax = plt.subplots(figsize=(8, 6))
ppplot.ppplot(line='45', ax=ax)
ax.set_title("Normal p-p plot of Regression Standardized Residuals")
plt.show()

In [None]:
# ipl_model5 predicts the square root of the price; squaring maps the
# predictions back to the price scale.
pred_y = np.power(ipl_model5.predict(exog=test_X.loc[:, train_X.columns]), 2)

In [None]:
from sklearn import metrics
# RMSE of the back-transformed predictions against the hold-out prices.
np.sqrt(metrics.mean_squared_error(y_true=test_y, y_pred=pred_y))

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
# r2_score expects (y_true, y_pred) and is not symmetric, so the observed prices
# go first. The sign is kept: a negative R² means the model does worse on the
# hold-out set than predicting the mean, and must not be hidden by abs().
r2_score(test_y, pred_y)