In [None]:
import pandas as pd
# `plt` is the conventional alias for the pyplot interface; binding it to the
# bare `matplotlib` package would leave plt.plot / plt.show / plt.subplots undefined.
import matplotlib.pyplot as plt

In [None]:
# Read the IPL scores CSV into a DataFrame and preview its first rows
csv_path = '../../static/data/ipl_scores.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Print the frame's shape tuple, then pandas' per-column summary
# (column names, non-null counts and dtypes).
print('Shape => ', data.shape)
data.info()

In [None]:
# Count the missing values in each column
missing_per_column = data.isna().sum()
missing_per_column

In [None]:
# Check Summary Statistics
# describe() with default arguments reports count, mean, std, min, quartiles and max;
# for a frame with mixed dtypes pandas restricts this to the numeric columns.
data.describe()

##### Data Cleaning


In [None]:
# Remove the columns that are not used by the rest of the notebook.
# The result is rebound to `data` instead of using `inplace=True`: in-place
# mutation brings no performance benefit and prevents method chaining.
# `columns=` states the intent directly, replacing `labels=..., axis=1`.
features_to_remove = ['mid', 'venue', 'batsman', 'bowler', 'striker', 'non-striker']
data = data.drop(columns=features_to_remove)
data.head()

In [None]:
# Collect the distinct values of the 'bat_team' column and display them
teams = data['bat_team'].unique()
teams

In [None]:
# Teams to keep for modelling (the currently active, consistent franchises)
consistent_team = [
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Kings XI Punjab',
    'Royal Challengers Bangalore',
    'Delhi Daredevils',
    'Sunrisers Hyderabad',
]

In [None]:
# Keep only the rows whose batting AND bowling team are both in consistent_team,
# then print the remaining distinct teams on each side as a sanity check.
batting_ok = data['bat_team'].isin(consistent_team)
bowling_ok = data['bowl_team'].isin(consistent_team)
data = data[batting_ok & bowling_ok]
for team_col in ('bat_team', 'bowl_team'):
    print(data[team_col].unique())

In [None]:
# Keep only the rows where at least 5 overs have been recorded
min_overs = 5.0
data = data.loc[data['overs'].ge(min_overs)]
data.head()

In [None]:
# Change Date Column Type
# Convert the 'date' column from 'YYYY-MM-DD' strings to datetime64 values.
# pd.to_datetime is vectorised and, unlike a per-row datetime.strptime call,
# does not raise when the cell is re-run on an already-converted column.
# .assign() returns a new frame, so nothing is written into `data`, which at
# this point is a filtered slice; such a write can raise SettingWithCopyWarning.
from datetime import datetime  # retained: later cells may still rely on this name
data = data.assign(date=pd.to_datetime(data['date'], format='%Y-%m-%d'))

##### Data Preparation

In [None]:
# Encoding Categorical Variables
# One-hot encode the two team columns so each team becomes its own indicator column
team_columns = ['bat_team', 'bowl_team']
cat_data = pd.get_dummies(data, columns=team_columns)
cat_data.head()

##### Splitting Dataset

In [None]:
# Chronological split on the 'date' column:
# years up to and including 2016 form the training set, 2017 onwards the test set.
season = cat_data['date'].dt.year
in_train = season <= 2016
in_test = season >= 2017

# Features are every column except the 'total' target
features = cat_data.drop(columns='total')
X_train = features[in_train]
X_test = features[in_test]

# The target is a single column, extracted as a plain NumPy array
y_train = cat_data.loc[in_train, 'total'].values
y_test = cat_data.loc[in_test, 'total'].values

In [None]:
# Drop the 'date' column from both feature sets; it is redundant once the split is done.
# `columns='date'` replaces `axis=True`, which only worked because True == 1.
# Rebinding instead of `inplace=True` avoids mutating frames that were produced
# by boolean indexing (a pattern that can raise SettingWithCopyWarning).
X_train = X_train.drop(columns='date')
X_test = X_test.drop(columns='date')
# Preview a few rows rather than rendering the whole test frame
X_test.head()

##### Model Development

In [None]:
# Model Selection
# Instantiate scikit-learn's LinearRegression with its default settings;
# the model is only created here and is fitted in a separate step.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [None]:
# Training Model
# Fit the regression on the training features (X_train) against the training targets (y_train)
model.fit(X_train , y_train)

##### Model Evaluation

In [None]:
# Predict the target for every row of the held-out test features
pred = model.predict(X_test)

##### Visualizing Results

In [None]:
# Plot the distribution of the residuals (actual minus predicted totals).
# sns.distplot has been deprecated since seaborn 0.11 and is scheduled for removal;
# histplot(kde=True, stat='density') is the documented replacement and draws the
# same histogram with a density curve overlaid.
import seaborn as sns
residuals = y_test - pred
ax = sns.histplot(residuals, kde=True, stat='density')
# Label the figure so it can be read on its own; the trailing ';' hides the repr
ax.set(xlabel='Residual (actual - predicted total)', title='Distribution of prediction errors');

In [None]:
# Evaluation metrics on the test set
from sklearn import metrics
import numpy as np

# Mean absolute error, mean squared error and its square root (RMSE)
mae = metrics.mean_absolute_error(y_test, pred)
mse = metrics.mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)

# Report each metric on its own line
for label, value in (('MAE: ', mae), ('MSE: ', mse), ('RMSE: ', rmse)):
    print(label, value)


In [None]:
# Saving the IPL Score Predictor Model
# Serialise the fitted model to disk with pickle. The `with` block guarantees the
# file is flushed and closed even if pickling raises; passing a bare open(...)
# to pickle.dump leaves the handle open until it is garbage-collected.
import pickle
file_name = '../../static/models/ipl_score_predict_model.pkl'
with open(file_name, 'wb') as model_file:
    pickle.dump(model, model_file)