In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.types import *
%matplotlib inline

In [2]:
# Root directory of the project; every dataset path in this notebook is built from it.
# NOTE(review): absolute, machine-specific path without a trailing slash - adjust when running elsewhere.
path = "E:/Rutgers/Projects/MDSR/IPL-MSDR"

In [3]:
# Load the two raw IPL CSV files into pandas DataFrames
matches = pd.read_csv(f"{path}/dataset/original_ipldata/matches.csv")
deliveries = pd.read_csv(f"{path}/dataset/original_ipldata/deliveries.csv")

In [4]:
# Schema of original data (matches.csv): column names, non-null counts and dtypes
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 18 columns):
id                 756 non-null int64
season             756 non-null int64
city               749 non-null object
date               756 non-null object
team1              756 non-null object
team2              756 non-null object
toss_winner        756 non-null object
toss_decision      756 non-null object
result             756 non-null object
dl_applied         756 non-null int64
winner             752 non-null object
win_by_runs        756 non-null int64
win_by_wickets     756 non-null int64
player_of_match    752 non-null object
venue              756 non-null object
umpire1            754 non-null object
umpire2            754 non-null object
umpire3            119 non-null object
dtypes: int64(5), object(13)
memory usage: 68.0+ KB


In [5]:
# Schema of original data (deliveries.csv): column names, non-null counts and dtypes
deliveries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179078 entries, 0 to 179077
Data columns (total 21 columns):
match_id            179078 non-null int64
inning              179078 non-null int64
batting_team        179078 non-null object
bowling_team        179078 non-null object
over                179078 non-null int64
ball                179078 non-null int64
batsman             179078 non-null object
non_striker         179078 non-null object
bowler              179078 non-null object
is_super_over       179078 non-null int64
wide_runs           179078 non-null int64
bye_runs            179078 non-null int64
legbye_runs         179078 non-null int64
noball_runs         179078 non-null int64
penalty_runs        179078 non-null int64
batsman_runs        179078 non-null int64
extra_runs          179078 non-null int64
total_runs          179078 non-null int64
player_dismissed    8834 non-null object
dismissal_kind      8834 non-null object
fielder             6448 non-null object
dtype

# Data Cleaning

In [6]:
# Remove the umpire and date columns, which are of no use for this analysis
matches = matches.drop(labels=['umpire1', 'umpire2', 'umpire3', 'date'], axis='columns')

In [7]:
# Filling empty values: the text placeholder 'None' in matches, 0 in deliveries
matches = matches.fillna('None')
deliveries = deliveries.fillna(0)

In [8]:
# Schema of cleaned data (matches.csv): confirms the dropped columns are gone and no nulls remain
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 14 columns):
id                 756 non-null int64
season             756 non-null int64
city               756 non-null object
team1              756 non-null object
team2              756 non-null object
toss_winner        756 non-null object
toss_decision      756 non-null object
result             756 non-null object
dl_applied         756 non-null int64
winner             756 non-null object
win_by_runs        756 non-null int64
win_by_wickets     756 non-null int64
player_of_match    756 non-null object
venue              756 non-null object
dtypes: int64(5), object(9)
memory usage: 56.1+ KB


In [9]:
# Schema of cleaned data (deliveries.csv): confirms no nulls remain after the fill
deliveries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179078 entries, 0 to 179077
Data columns (total 21 columns):
match_id            179078 non-null int64
inning              179078 non-null int64
batting_team        179078 non-null object
bowling_team        179078 non-null object
over                179078 non-null int64
ball                179078 non-null int64
batsman             179078 non-null object
non_striker         179078 non-null object
bowler              179078 non-null object
is_super_over       179078 non-null int64
wide_runs           179078 non-null int64
bye_runs            179078 non-null int64
legbye_runs         179078 non-null int64
noball_runs         179078 non-null int64
penalty_runs        179078 non-null int64
batsman_runs        179078 non-null int64
extra_runs          179078 non-null int64
total_runs          179078 non-null int64
player_dismissed    179078 non-null object
dismissal_kind      179078 non-null object
fielder             179078 non-null object

In [10]:
# Saving cleaned data (matches.csv)
# index=False keeps the pandas row index out of the file; otherwise it is written as an
# extra unnamed first column that comes back as a spurious column when the file is re-read.
matches.to_csv(path + '/dataset/clean_data/matches.csv', index=False)

In [11]:
# Saving cleaned data (deliveries.csv)
# index=False keeps the pandas row index out of the file; otherwise it is written as an
# extra unnamed first column that comes back as a spurious column when the file is re-read.
deliveries.to_csv(path + '/dataset/clean_data/deliveries.csv', index=False)

# Basic Analysis

In [12]:
# Teams playing in the league (distinct values of the team1 column)
teams = matches['team1'].unique()
print(f"Total number of teams participated so far: {len(teams)}")
print("Teams participated so far: ")
for team_name in teams:
    print("- " + team_name)

Total number of teams participated so far: 15
Teams participated so far: 
- Sunrisers Hyderabad
- Mumbai Indians
- Gujarat Lions
- Rising Pune Supergiant
- Royal Challengers Bangalore
- Kolkata Knight Riders
- Delhi Daredevils
- Kings XI Punjab
- Chennai Super Kings
- Rajasthan Royals
- Deccan Chargers
- Kochi Tuskers Kerala
- Pune Warriors
- Rising Pune Supergiants
- Delhi Capitals


In [13]:
# Distinct venues that have hosted matches
venues = matches['venue'].unique()
print("Number of venues matches were played: " + str(venues.size))
for venue_name in venues:
    print("- " + venue_name)

Number of venues matches were played: 41
- Rajiv Gandhi International Stadium, Uppal
- Maharashtra Cricket Association Stadium
- Saurashtra Cricket Association Stadium
- Holkar Cricket Stadium
- M Chinnaswamy Stadium
- Wankhede Stadium
- Eden Gardens
- Feroz Shah Kotla
- Punjab Cricket Association IS Bindra Stadium, Mohali
- Green Park
- Punjab Cricket Association Stadium, Mohali
- Sawai Mansingh Stadium
- MA Chidambaram Stadium, Chepauk
- Dr DY Patil Sports Academy
- Newlands
- St George's Park
- Kingsmead
- SuperSport Park
- Buffalo Park
- New Wanderers Stadium
- De Beers Diamond Oval
- OUTsurance Oval
- Brabourne Stadium
- Sardar Patel Stadium, Motera
- Barabati Stadium
- Vidarbha Cricket Association Stadium, Jamtha
- Himachal Pradesh Cricket Association Stadium
- Nehru Stadium
- Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium
- Subrata Roy Sahara Stadium
- Shaheed Veer Narayan Singh International Stadium
- JSCA International Stadium Complex
- Sheikh Zayed Stadium
- Sharjah Cric

In [14]:
# Distinct cities that have hosted matches.
# Missing cities were filled with the text 'None' during cleaning, so that placeholder
# is listed and counted here as well.
cities = matches['city'].unique()
print("Number of cities matches were played: " + str(cities.size))
for city_name in cities:
    print("- " + city_name)

Number of cities matches were played: 33
- Hyderabad
- Pune
- Rajkot
- Indore
- Bangalore
- Mumbai
- Kolkata
- Delhi
- Chandigarh
- Kanpur
- Jaipur
- Chennai
- Cape Town
- Port Elizabeth
- Durban
- Centurion
- East London
- Johannesburg
- Kimberley
- Bloemfontein
- Ahmedabad
- Cuttack
- Nagpur
- Dharamsala
- Kochi
- Visakhapatnam
- Raipur
- Ranchi
- Abu Dhabi
- Sharjah
- None
- Mohali
- Bengaluru


In [15]:
# Count of distinct bowlers across all deliveries
bowler_count = deliveries['bowler'].nunique(dropna=False)
print("Total number of bowlers: " + str(bowler_count))

Total number of bowlers: 405


In [16]:
# Count of distinct batsmen across all deliveries
batsman_count = deliveries['batsman'].nunique(dropna=False)
print("Total number of batsmen: " + str(batsman_count))

Total number of batsmen: 516


In [None]:
# Total number of participating players
# A player is anyone who appears as bowler, batsman or non-striker in any delivery.
# The set is built from each column's distinct values instead of a Python loop over every
# row (about 179k iterations with three label lookups each); this also removes the
# dependence on the DataFrame having a default 0..n-1 index.
players = set()
for role in ('bowler', 'batsman', 'non_striker'):
    players.update(deliveries[role].unique())
print("Total number of player: " + str(len(players)))

# Spark Analysis

### Total number of matches per season

In [None]:
# Reuse the active Spark session, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()
# `matches` is a pandas DataFrame at this point and has no registerTempTable();
# convert it to a Spark DataFrame and expose it to Spark SQL as the view 'seasons'.
spark.createDataFrame(matches).createOrReplaceTempView('seasons')
# Number of matches played in each season
seasons = spark.sql('''Select distinct(season),count(*) as total_matches from seasons group by season ''')
seasons.show()

In [None]:
# Bar chart of the number of matches per season
fig, ax = plt.subplots()
sns.barplot(data=seasons.toPandas(), x="season", y="total_matches", palette='viridis', ax=ax)
ax.set_ylabel('Total Matches')
ax.set_xlabel('Season')
ax.set_title('Number of matches in each season')

### Number of matches played by each team since season 1

In [None]:
# Reuse the active Spark session, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()
# `matches` is a pandas DataFrame here and has no registerTempTable();
# convert it to a Spark DataFrame and expose it to Spark SQL as the view 'team'.
spark.createDataFrame(matches).createOrReplaceTempView('team')
# Stack team1 and team2 so every appearance of a team counts as one match played
team = spark.sql('''Select distinct(team), count(*) as total_matches from (Select team1 as team from team UNION ALL (select team2 as team from team)) group by team ''')
team.show()

In [None]:
# Horizontal bar chart of matches played per team
fig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(data=team.toPandas(), x="total_matches", y="team", palette='viridis', ax=ax)
ax.set_xlabel('Total Matches')
ax.set_ylabel('Team')
ax.set_title('Number of matches played by each team')

### Total seasons in which teams have played

In [None]:
# Reuse the active Spark session, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()
# `matches` is a pandas DataFrame here and has no registerTempTable();
# convert it to a Spark DataFrame and expose it to Spark SQL as the view 'team_season'.
spark.createDataFrame(matches).createOrReplaceTempView('team_season')
# First season, last season and number of distinct seasons per team (based on team1 only)
team_season = spark.sql('''Select team1 as team, min(season) as first_season, max(season) as last_season, count(distinct(season)) as total_seasons from team_season group by team1 order by total_seasons desc''')
team_season.show()

### Total number of matches won by teams

In [None]:
# Reuse the active Spark session, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()
# `matches` is a pandas DataFrame here and has no registerTempTable();
# convert it to a Spark DataFrame and expose it to Spark SQL as the view 'most_win'.
spark.createDataFrame(matches).createOrReplaceTempView('most_win')
# Wins per team; rows whose winner is the 'None' placeholder (filled in during cleaning) are excluded
most_win = spark.sql('''Select distinct(winner) as team, count(*) as total_matches from most_win where winner <>'None' group by winner order by total_matches ''')
most_win.show()

In [None]:
# Horizontal bar chart of matches won per team
fig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(data=most_win.toPandas(), x="total_matches", y="team", palette='viridis', ax=ax)
ax.set_xlabel('Total Matches')
ax.set_ylabel('Team')
ax.set_title('Number of matches won by each team')

### Total matches won by teams in each season

In [None]:
# Reuse the active Spark session, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()
# `matches` is a pandas DataFrame here and has no registerTempTable();
# convert it to a Spark DataFrame and expose it to Spark SQL as the view 'most_win_by_season'.
spark.createDataFrame(matches).createOrReplaceTempView('most_win_by_season')
# Wins per team per season; rows whose winner is the 'None' placeholder are excluded
most_win_by_season = spark.sql('''Select season, winner as team, count(*) as total_matches_won from most_win_by_season where winner <> 'None' group by season, winner order by total_matches_won desc''')
most_win_by_season.show()

### Players with maximum man of the match awards 

In [None]:
# Reuse the active Spark session, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()
# `matches` is a pandas DataFrame here and has no registerTempTable();
# convert it to a Spark DataFrame and expose it to Spark SQL as the view 'man_match'.
spark.createDataFrame(matches).createOrReplaceTempView('man_match')
# Ten players with the most player-of-the-match awards
man_match = spark.sql('''Select distinct(player_of_match), count(*) as total_matches from man_match group by player_of_match order by total_matches desc limit 10 ''')
man_match.show()

In [None]:
# Horizontal bar chart of player-of-the-match awards for the top players
fig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(data=man_match.toPandas(), x="total_matches", y="player_of_match", palette='viridis', ax=ax)
ax.set_xlabel('Total Matches')
ax.set_ylabel('Player')
# The closing parenthesis was missing on this call, which made the whole cell a SyntaxError
ax.set_title('Number of times player won man of the match')

### Number of matches per Venue

In [None]:
# Reuse the active Spark session, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()
# `matches` is a pandas DataFrame here and has no registerTempTable();
# convert it to a Spark DataFrame and expose it to Spark SQL as the view 'venue'.
spark.createDataFrame(matches).createOrReplaceTempView('venue')
# Number of matches hosted by each venue
venue = spark.sql('''Select distinct(venue), count(*) as total_matches from venue group by venue''')
venue.show()

In [None]:
# Tall horizontal bar chart of matches per venue
fig, ax = plt.subplots(figsize=(10, 20))
sns.barplot(data=venue.toPandas(), x="total_matches", y="venue", palette='viridis', ax=ax)
ax.set_xlabel('Total Matches')
ax.set_ylabel('Venue')
ax.set_title('Number of matches at each venue')

### Percentage of toss decisions

In [None]:
# Reuse the active Spark session, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()
# `matches` is a pandas DataFrame here and has no registerTempTable();
# convert it to a Spark DataFrame and expose it to Spark SQL as the view 'toss'.
spark.createDataFrame(matches).createOrReplaceTempView('toss')
# Share of each toss decision as a percentage of all matches
toss = spark.sql('''Select distinct(toss_decision), ((count(toss_decision)*100)/ (select count(*) from toss)) as percentage_count from toss group by toss_decision''')
toss.show()

In [None]:
# Bar chart of the percentage of each toss decision
fig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(data=toss.toPandas(), x="toss_decision", y="percentage_count", palette='viridis', ax=ax)
ax.set_xlabel('Toss Decision')
ax.set_ylabel('Percentage')
ax.set_title('Percentage Plot of toss_decision')

### Percentage of team winning the toss as well as the match

In [None]:
# Reuse the active Spark session, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()
# `matches` is a pandas DataFrame here and has no registerTempTable(); convert it once and
# register it under every view name this cell used, plus 'seasons', which the query reads.
matches_spark = spark.createDataFrame(matches)
for view_name in ('seasons', 'toss_and_won', 'toss_won_data'):
    matches_spark.createOrReplaceTempView(view_name)
# Per season: total matches, matches where the toss winner also won, and that share in percent.
# The left join keeps seasons with no such match (their count and percentage come back null).
toss_won_data = spark.sql('''Select t1.season, t1.total_matches, \
          t2.count_toss_and_won as count_toss_and_won, \
          (t2.count_toss_and_won / t1.total_matches * 100) as percent_toss_and_won from \
          (Select distinct(season),count(*) as total_matches from seasons group by season)t1 \
          left join (Select distinct(season), count(*) as count_toss_and_won from toss_and_won where toss_winner = winner group by season)t2 on t1.season = t2.season order by season''')
toss_won_data.show()

In [None]:
# Bar chart of the per-season percentage of matches won by the toss winner
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=toss_won_data.toPandas(), x="season", y="percent_toss_and_won", palette='viridis', ax=ax)
ax.set_xlabel('Season')
ax.set_ylabel('Percentage')
ax.set_title('Percentage Plot of Season and Toss_and_won')

### Percentage matches won by batting first 

In [None]:
# Reuse the active Spark session, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()
# The query reads the 'seasons' view; register it here from the pandas `matches` frame
# so this cell does not depend on an earlier cell having created it.
spark.createDataFrame(matches).createOrReplaceTempView('seasons')
# Per season: total matches, matches won by runs (win_by_runs > 0), and that share in percent
win_batting_first = spark.sql('''Select t1.season, t1.total_matches, \
          t2.win_batting_first as win_batting_first, \
          (t2.win_batting_first/ t1.total_matches * 100) as percent_win_batting_first from \
          (Select distinct(season),count(*) as total_matches from seasons group by season)t1 \
          left join (Select distinct(season), count(*) as win_batting_first from seasons where win_by_runs > 0  group by season)t2 on t1.season = t2.season order by season ''')
win_batting_first.show()

In [None]:
# Bar chart of the per-season percentage of matches won by the side batting first
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=win_batting_first.toPandas(), x="season", y="percent_win_batting_first", palette='viridis', ax=ax)
ax.set_xlabel('Season')
ax.set_ylabel('Percentage')
ax.set_title('Percentage Plot of Season and won by batting ')

### Percentage matches won by fielding first

In [None]:
# Reuse the active Spark session, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()
# The query reads the 'seasons' view; register it here from the pandas `matches` frame
# so this cell does not depend on an earlier cell having created it.
spark.createDataFrame(matches).createOrReplaceTempView('seasons')
# Per season: total matches, matches won by wickets (win_by_wickets > 0), and that share in percent
win_bowling_first = spark.sql('''Select t1.season, t1.total_matches, \
          t2.win_bowling_first as win_bowling_first, \
          (t2.win_bowling_first/ t1.total_matches * 100) as percent_win_bowling_first from \
          (Select distinct(season),count(*) as total_matches from seasons group by season)t1 \
          left join (Select distinct(season), count(*) as win_bowling_first from seasons where win_by_wickets > 0  group by season)t2 on t1.season = t2.season order by season ''')
win_bowling_first.show()

In [None]:
# Bar chart of the per-season percentage of matches won by the side fielding first
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=win_bowling_first.toPandas(), x="season", y="percent_win_bowling_first", palette='viridis', ax=ax)
ax.set_xlabel('Season')
ax.set_ylabel('Percentage')
ax.set_title('Percentage Plot of Season and won by wickets ')

# Final Analysis

In [None]:
# Reading data
# Reload the cleaned CSVs as Spark DataFrames; from here on `matches` and `deliveries`
# are Spark DataFrames, no longer the pandas frames used above.
spark = SparkSession.builder.getOrCreate()
# `path` has no trailing slash, so the separator must be part of the suffix; without it
# the location does not match the one the cleaned files were written to.
matches = spark.read.csv(path + '/dataset/clean_data/matches.csv', inferSchema=True, header=True)
deliveries = spark.read.csv(path + '/dataset/clean_data/deliveries.csv', inferSchema=True, header=True)

In [None]:
# Creating temporary tables of the data
# createOrReplaceTempView is the current replacement for the deprecated registerTempTable
# and can be re-run safely because it replaces an existing view of the same name.
matches.createOrReplaceTempView('matches_db')
deliveries.createOrReplaceTempView('deliveries_db')

In [None]:
# Merging both the tables
# Run the join through the SparkSession, like the other queries in this notebook;
# `sqlContext` is not defined anywhere in the visible notebook.
spark = SparkSession.builder.getOrCreate()
# One row per delivery, carrying all columns of its match
merged_db = spark.sql('select m.*,d.* from matches_db as m inner join deliveries_db as d on m.id=d.match_id')

## Batting Metrics 

### Hard Hitting Ability 

In [None]:
# Hard Hitting Ability = (4*Fours + 6*Sixes) / Balls Played by Batsman


### Finisher 

In [None]:
# Finisher = Not Out innings / Total Innings played


### Fast Scoring Ability

In [None]:
# Fast Scoring Ability = Total Runs / Balls Played by Batsman


### Consistency

In [None]:
# Consistency = Total Runs/Number of Times Out


###  Running Between Wickets

In [None]:
# Running Between Wickets = (Total Runs – (4*Fours + 6*Sixes))/(Total Balls Played – Boundary Balls)


## Bowling Metrics 

### Economy 

In [None]:
# Economy = Runs Scored / (Number of balls bowled by bowler/6)


### Wicket Taking Ability

In [None]:
# Wicket Taking Ability = Number of balls bowled / Wickets Taken


### Consistency 

In [None]:
# Consistency = Runs Conceded / Wickets Taken


### Crucial Wicket Taking Ability

In [None]:
# Crucial Wicket Taking Ability = Number of times Four or Five Wickets Taken / Number of Innings Played


### Short Performance Index

In [None]:
# Short Performance Index = (Wickets Taken – 4* Number of Times Four Wickets Taken – 5* Number of Times Five Wickets Taken) / (Innings Played – Number of Times Four Wickets or Five Wickets Taken)
