In [29]:
import numpy as np
import pandas as pd

In [39]:
matchDF = pd.read_csv("./Datasets/matches_2008-2024.csv")
deliveryDF = pd.read_csv("./Datasets/deliveries_2008-2024.csv")

In [40]:
deliveryDF.columns = deliveryDF.columns.str.strip()
matchDF.columns = matchDF.columns.str.strip()

print(deliveryDF.columns.to_list())
print()
print(matchDF.columns.to_list())

['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball', 'batter', 'bowler', 'non_striker', 'batsman_runs', 'extra_runs', 'total_runs', 'extras_type', 'is_wicket', 'player_dismissed', 'dismissal_kind', 'fielder']

['id', 'season', 'city', 'date', 'match_type', 'player_of_match', 'venue', 'team1', 'team2', 'toss_winner', 'toss_decision', 'winner', 'result', 'result_margin', 'target_runs', 'target_overs', 'super_over', 'method', 'umpire1', 'umpire2']


In [41]:
# Merging delivery and match dataframes based on match_id
merged_df = pd.merge(
    deliveryDF,
    matchDF[['id', 'date', 'venue']],
    left_on='match_id',
    right_on='id',
    how='left'
)

In [42]:
merged_df.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder,id,date,venue
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,,335982,2008-04-18,M Chinnaswamy Stadium
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,,335982,2008-04-18,M Chinnaswamy Stadium
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,,335982,2008-04-18,M Chinnaswamy Stadium
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,,335982,2008-04-18,M Chinnaswamy Stadium
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,,335982,2008-04-18,M Chinnaswamy Stadium


In [43]:
# Selecting and renaming relevant columns
newdataset = merged_df[['match_id', 'inning', 'date', 'venue', 'batting_team', 'bowling_team',
                        'batter', 'bowler', 'total_runs', 'is_wicket', 'player_dismissed', 'over', 'ball']].copy()


newdataset.rename(columns={
    'match_id': 'mid',
    'batting_team': 'bat_team',
    'bowling_team': 'bowl_team',
    'batter': 'batsman',
    'total_runs': 'runs'
}, inplace=True)


newdataset['mid'] = newdataset['mid'].astype('int64') - 335981

newdataset.head(10)

Unnamed: 0,mid,inning,date,venue,bat_team,bowl_team,batsman,bowler,runs,is_wicket,player_dismissed,over,ball
0,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,,0,1
1,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,0,0,,0,2
2,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,,0,3
3,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,0,0,,0,4
4,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,0,0,,0,5
5,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,0,0,,0,6
6,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,,0,7
7,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,Z Khan,0,0,,1,1
8,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,Z Khan,4,0,,1,2
9,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,Z Khan,4,0,,1,3


In [44]:
# Group by both match and inning for all calculations
grouping_cols = ['mid', 'inning']

# 1. Correctly calculate the cumulative 'wickets' per inning
newdataset['wickets'] = newdataset.groupby(grouping_cols)['is_wicket'].cumsum()

# Calculate 'overs' as a floating point
newdataset['overs'] = newdataset['over'] + (newdataset['ball'] - 1) * 0.1

# 2. Calculate the cumulative 'total' score per inning
newdataset['total'] = newdataset.groupby(grouping_cols)['runs'].cumsum()

# 3. Correctly calculate runs and wickets in the last 5 overs per inning
newdataset['runs_last_5'] = newdataset.groupby(grouping_cols)['runs'].transform(
    lambda x: x.rolling(30, min_periods=1).sum()
)
newdataset['wickets_last_5'] = newdataset.groupby(grouping_cols)['is_wicket'].transform(
    lambda x: x.rolling(30, min_periods=1).sum()
)

# Cumulative runs for striker and non-striker per inning
newdataset['striker'] = newdataset.groupby(grouping_cols)['runs'].cumsum()
newdataset['non_striker'] = newdataset.groupby(
    grouping_cols)['runs'].cumsum().shift(1).fillna(0).astype(int)

# Selecting final columns for the dataset
final_dataset = newdataset[
    ['mid', 'inning', 'date', 'venue', 'bat_team', 'bowl_team', 'batsman', 'bowler', 'runs',
     'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'striker', 'non_striker', 'total']
]

# Displaying the head of the final dataset
final_dataset['final_total'] = final_dataset.groupby(['mid', 'inning'])['total'].transform('max')
final_dataset[110:130]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset['final_total'] = final_dataset.groupby(['mid', 'inning'])['total'].transform('max')


Unnamed: 0,mid,inning,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non_striker,total,final_total
110,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,AA Noffke,4,3,17.4,54.0,1.0,178,174,178,222
111,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,AA Noffke,1,3,17.5,54.0,1.0,179,178,179,222
112,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,JH Kallis,6,3,18.0,60.0,1.0,185,179,185,222
113,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,JH Kallis,0,3,18.1,59.0,1.0,185,185,185,222
114,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,JH Kallis,6,3,18.2,64.0,1.0,191,185,191,222
115,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,JH Kallis,4,3,18.3,67.0,1.0,195,191,195,222
116,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,JH Kallis,1,3,18.4,66.0,1.0,196,195,196,222
117,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,Mohammad Hafeez,JH Kallis,4,3,18.5,66.0,1.0,200,196,200,222
118,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,6,3,19.0,71.0,1.0,206,200,206,222
119,1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,6,3,19.1,71.0,1.0,212,206,212,222


In [45]:
final_dataset['total'].to_list()

[1,
 1,
 2,
 2,
 2,
 2,
 3,
 3,
 7,
 11,
 17,
 21,
 21,
 21,
 21,
 22,
 26,
 27,
 27,
 32,
 38,
 39,
 43,
 43,
 44,
 50,
 54,
 55,
 59,
 59,
 60,
 60,
 61,
 61,
 61,
 61,
 61,
 61,
 62,
 63,
 64,
 66,
 67,
 68,
 68,
 69,
 70,
 71,
 72,
 73,
 73,
 73,
 73,
 74,
 75,
 77,
 78,
 79,
 80,
 80,
 86,
 87,
 88,
 92,
 92,
 98,
 98,
 102,
 102,
 108,
 110,
 111,
 111,
 112,
 112,
 116,
 116,
 118,
 119,
 123,
 124,
 125,
 125,
 126,
 127,
 128,
 130,
 134,
 135,
 141,
 145,
 147,
 148,
 154,
 154,
 155,
 157,
 157,
 158,
 158,
 159,
 161,
 162,
 168,
 170,
 172,
 172,
 173,
 173,
 174,
 178,
 179,
 185,
 185,
 191,
 195,
 196,
 200,
 206,
 212,
 214,
 214,
 216,
 222,
 1,
 2,
 2,
 3,
 4,
 4,
 4,
 4,
 4,
 8,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 11,
 11,
 12,
 14,
 15,
 15,
 16,
 16,
 16,
 17,
 18,
 18,
 24,
 24,
 24,
 24,
 24,
 24,
 25,
 26,
 26,
 26,
 26,
 27,
 28,
 29,
 29,
 29,
 33,
 34,
 35,
 36,
 38,
 38,
 38,
 38,
 38,
 38,
 39,
 40,
 41,
 43,
 43,
 44,
 44,
 44,
 45,
 46,
 50,
 51,
 51,
 51,
 5

In [46]:
final_dataset.to_csv('Datasets/scorePridiction.csv', index=False)