# Predicting playing 11 players for each team

## Setting up dataset 

In [None]:
import numpy as np
import pandas as pd


In [None]:
# Install the Kaggle API token: create ~/.kaggle if needed, move kaggle.json from the
# current working directory into it, and restrict the file to owner read/write.
# NOTE(review): because the token is moved, re-running this cell fails unless
# kaggle.json is placed in the working directory again.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the ODI ball-by-ball dataset archive with the Kaggle CLI.
# --force presumably re-downloads even when the archive is already present — confirm against the CLI help.
!kaggle datasets download -d utkarshtomar736/odi-mens-cricket-match-data-2002-2023 --force

Downloading odi-mens-cricket-match-data-2002-2023.zip to /content
  0% 0.00/6.87M [00:00<?, ?B/s]100% 6.87M/6.87M [00:00<00:00, 71.4MB/s]
100% 6.87M/6.87M [00:00<00:00, 71.1MB/s]


In [None]:
!unzip /content/odi-mens-cricket-match-data-2002-2023.zip

Archive:  /content/odi-mens-cricket-match-data-2002-2023.zip
  inflating: ODI_Match_Data.csv      
  inflating: ODI_Match_info.csv      


In [None]:
# Load the ball-by-ball ODI data.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, so columns get one consistent dtype and the mixed-type DtypeWarning
# that the default chunked parser can emit on large CSVs is avoided.
df = pd.read_csv('/content/ODI_Match_Data.csv', low_memory=False)

  df = pd.read_csv('/content/ODI_Match_Data.csv')


In [None]:
# Preview the first five rows to check the raw column layout.
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,cricsheet_id
0,1389389,2023/24,2023-09-24,"Holkar Cricket Stadium, Indore",1,0.1,India,Australia,RD Gaikwad,Shubman Gill,...,,,,,,,,,,1389389
1,1389389,2023/24,2023-09-24,"Holkar Cricket Stadium, Indore",1,0.2,India,Australia,RD Gaikwad,Shubman Gill,...,,,,,,,,,,1389389
2,1389389,2023/24,2023-09-24,"Holkar Cricket Stadium, Indore",1,0.3,India,Australia,RD Gaikwad,Shubman Gill,...,,,,,,,,,,1389389
3,1389389,2023/24,2023-09-24,"Holkar Cricket Stadium, Indore",1,0.4,India,Australia,RD Gaikwad,Shubman Gill,...,,,,,,,,,,1389389
4,1389389,2023/24,2023-09-24,"Holkar Cricket Stadium, Indore",1,0.5,India,Australia,RD Gaikwad,Shubman Gill,...,,,,,,,,,,1389389


In [None]:
# List every column available in the raw ball-by-ball data.
df.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed', 'cricsheet_id'],
      dtype='object')

In [None]:
# Keep only the columns the player metrics are built from.
# The explicit .copy() makes df an independent frame: later cells add columns to it, and
# assigning onto a bare column-subset slice raises SettingWithCopyWarning.
df = df[["match_id","ball","batting_team","bowling_team","striker","bowler","runs_off_bat","wicket_type","player_dismissed"]].copy()

In [None]:
# Names of the ball-by-ball columns retained for the analysis.
features = [
    "match_id",
    "ball",
    "batting_team",
    "bowling_team",
    "striker",
    "bowler",
    "runs_off_bat",
    "wicket_type",
    "player_dismissed",
]

In [None]:
# (rows, columns) of the reduced frame.
df.shape

(1265103, 9)

## Removing null values and calculating cumulative runs, wickets, strike rate

In [None]:
# Count missing values per column.
df.isna().sum()

match_id                  0
ball                      0
batting_team              0
bowling_team              0
striker                   0
bowler                    0
runs_off_bat              0
wicket_type         1230629
player_dismissed    1230629
dtype: int64

In [None]:
# Flag the deliveries on which the striker is the player who got out:
# a dismissal must be recorded AND the dismissed player must be the striker.
mask = df['player_dismissed'].eq(df['striker']) & df['player_dismissed'].notna()

# 1 on those deliveries, 0 everywhere else
df['wickets'] = 0
df.loc[mask, 'wickets'] = 1

# Running total of those flags per striker, in row order, stored as integers
# (any missing totals would be replaced by 0 before the cast)
dismissals_so_far = df.groupby('striker')['wickets'].cumsum()
df['cumulative_wickets'] = dismissals_so_far.fillna(0).astype(int)

In [None]:
# Inspect the first 30 rows with the new wickets / cumulative_wickets columns.
df.head(30)

Unnamed: 0,match_id,ball,batting_team,bowling_team,striker,bowler,runs_off_bat,wicket_type,player_dismissed,wickets,cumulative_wickets
0,1389389,0.1,India,Australia,RD Gaikwad,SH Johnson,4,,,0,0
1,1389389,0.2,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0
2,1389389,0.3,India,Australia,RD Gaikwad,SH Johnson,4,,,0,0
3,1389389,0.4,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0
4,1389389,0.5,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0
5,1389389,0.6,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0
6,1389389,0.7,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0
7,1389389,1.1,India,Australia,Shubman Gill,JR Hazlewood,0,,,0,0
8,1389389,1.2,India,Australia,Shubman Gill,JR Hazlewood,1,,,0,0
9,1389389,1.3,India,Australia,RD Gaikwad,JR Hazlewood,0,,,0,0


In [None]:
# Intentionally left disabled: the bowler wicket totals computed further down still read
# the 'wickets' column, so these columns must not be dropped here.
# df  = df.drop(["wicket_type","player_dismissed","wickets"],axis=1)

In [None]:
# Running total of runs off the bat for each striker, accumulated in row order.
runs_by_striker = df.groupby('striker')['runs_off_bat']
df['cumulative_runs'] = runs_by_striker.cumsum()

In [None]:
# Inspect the first 30 rows with the new cumulative_runs column.
df.head(30)

Unnamed: 0,match_id,ball,batting_team,bowling_team,striker,bowler,runs_off_bat,wicket_type,player_dismissed,wickets,cumulative_wickets,cumulative_runs
0,1389389,0.1,India,Australia,RD Gaikwad,SH Johnson,4,,,0,0,4
1,1389389,0.2,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,4
2,1389389,0.3,India,Australia,RD Gaikwad,SH Johnson,4,,,0,0,8
3,1389389,0.4,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8
4,1389389,0.5,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8
5,1389389,0.6,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8
6,1389389,0.7,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8
7,1389389,1.1,India,Australia,Shubman Gill,JR Hazlewood,0,,,0,0,0
8,1389389,1.2,India,Australia,Shubman Gill,JR Hazlewood,1,,,0,0,1
9,1389389,1.3,India,Australia,RD Gaikwad,JR Hazlewood,0,,,0,0,8


In [None]:
# 1-based running count of the rows on which each player is the striker.
# Every row counts as one delivery faced.
rows_per_striker = df.groupby('striker')
df['balls_faced'] = rows_per_striker.cumcount() + 1
df.head(30)

Unnamed: 0,match_id,ball,batting_team,bowling_team,striker,bowler,runs_off_bat,wicket_type,player_dismissed,wickets,cumulative_wickets,cumulative_runs,balls_faced,batsman_strike_rate
0,1389389,0.1,India,Australia,RD Gaikwad,SH Johnson,4,,,0,0,4,1,400.0
1,1389389,0.2,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,4,2,200.0
2,1389389,0.3,India,Australia,RD Gaikwad,SH Johnson,4,,,0,0,8,3,266.666667
3,1389389,0.4,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,4,200.0
4,1389389,0.5,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,5,160.0
5,1389389,0.6,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,6,133.333333
6,1389389,0.7,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,7,114.285714
7,1389389,1.1,India,Australia,Shubman Gill,JR Hazlewood,0,,,0,0,0,1,0.0
8,1389389,1.2,India,Australia,Shubman Gill,JR Hazlewood,1,,,0,0,1,2,50.0
9,1389389,1.3,India,Australia,RD Gaikwad,JR Hazlewood,0,,,0,0,8,8,100.0


In [None]:
# Running batting strike rate: runs scored so far per delivery faced so far, times 100.
runs_per_ball = df['cumulative_runs'] / df['balls_faced']
df['batsman_strike_rate'] = runs_per_ball * 100

In [None]:
# Inspect the first 30 rows with the new batsman_strike_rate column.
df.head(30)

Unnamed: 0,match_id,ball,batting_team,bowling_team,striker,bowler,runs_off_bat,wicket_type,player_dismissed,wickets,cumulative_wickets,cumulative_runs,balls_faced,batsman_strike_rate
0,1389389,0.1,India,Australia,RD Gaikwad,SH Johnson,4,,,0,0,4,1,400.0
1,1389389,0.2,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,4,2,200.0
2,1389389,0.3,India,Australia,RD Gaikwad,SH Johnson,4,,,0,0,8,3,266.666667
3,1389389,0.4,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,4,200.0
4,1389389,0.5,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,5,160.0
5,1389389,0.6,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,6,133.333333
6,1389389,0.7,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,7,114.285714
7,1389389,1.1,India,Australia,Shubman Gill,JR Hazlewood,0,,,0,0,0,1,0.0
8,1389389,1.2,India,Australia,Shubman Gill,JR Hazlewood,1,,,0,0,1,2,50.0
9,1389389,1.3,India,Australia,RD Gaikwad,JR Hazlewood,0,,,0,0,8,8,100.0


In [None]:
# Running total of the 'wickets' flag for each bowler, in row order.
# The fillna result is assigned back instead of calling
# df[col].fillna(..., inplace=True): that form is chained assignment on a column
# selection, which warns on pandas 2.x and does not update df under Copy-on-Write.
# NOTE(review): 'wickets' marks every dismissal of the striker regardless of
# wicket_type, so dismissals not credited to the bowler (e.g. run outs) may be
# counted here — confirm whether that is intended.
df['cumulative_wickets_bowler'] = df.groupby('bowler')['wickets'].cumsum().fillna(0)

In [None]:
# Inspect the first 30 rows with the new cumulative_wickets_bowler column.
df.head(30)

Unnamed: 0,match_id,ball,batting_team,bowling_team,striker,bowler,runs_off_bat,wicket_type,player_dismissed,wickets,cumulative_wickets,cumulative_runs,balls_faced,batsman_strike_rate,cumulative_wickets_bowler
0,1389389,0.1,India,Australia,RD Gaikwad,SH Johnson,4,,,0,0,4,1,400.0,0
1,1389389,0.2,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,4,2,200.0,0
2,1389389,0.3,India,Australia,RD Gaikwad,SH Johnson,4,,,0,0,8,3,266.666667,0
3,1389389,0.4,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,4,200.0,0
4,1389389,0.5,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,5,160.0,0
5,1389389,0.6,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,6,133.333333,0
6,1389389,0.7,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,7,114.285714,0
7,1389389,1.1,India,Australia,Shubman Gill,JR Hazlewood,0,,,0,0,0,1,0.0,0
8,1389389,1.2,India,Australia,Shubman Gill,JR Hazlewood,1,,,0,0,1,2,50.0,0
9,1389389,1.3,India,Australia,RD Gaikwad,JR Hazlewood,0,,,0,0,8,8,100.0,0


In [None]:
# 1-based running count of the rows attributed to each bowler.
rows_per_bowler = df.groupby('bowler')
df['balls_bowled'] = rows_per_bowler.cumcount() + 1
df.head(30)

Unnamed: 0,match_id,ball,batting_team,bowling_team,striker,bowler,runs_off_bat,wicket_type,player_dismissed,wickets,cumulative_wickets,cumulative_runs,balls_faced,batsman_strike_rate,cumulative_wickets_bowler,balls_bowled
0,1389389,0.1,India,Australia,RD Gaikwad,SH Johnson,4,,,0,0,4,1,400.0,0,1
1,1389389,0.2,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,4,2,200.0,0,2
2,1389389,0.3,India,Australia,RD Gaikwad,SH Johnson,4,,,0,0,8,3,266.666667,0,3
3,1389389,0.4,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,4,200.0,0,4
4,1389389,0.5,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,5,160.0,0,5
5,1389389,0.6,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,6,133.333333,0,6
6,1389389,0.7,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,7,114.285714,0,7
7,1389389,1.1,India,Australia,Shubman Gill,JR Hazlewood,0,,,0,0,0,1,0.0,0,1
8,1389389,1.2,India,Australia,Shubman Gill,JR Hazlewood,1,,,0,0,1,2,50.0,0,2
9,1389389,1.3,India,Australia,RD Gaikwad,JR Hazlewood,0,,,0,0,8,8,100.0,0,3


In [None]:
# Calculate Bowling Strike Rate
df['bowling_strike_rate'] = df['balls_bowled'] / df['cumulative_wickets_bowler'].where(df['cumulative_wickets_bowler'] != 0, 1)

# If you want to fill NaN values (for cases where a bowler hasn't taken any wickets yet)
df['bowling_strike_rate'].fillna(0, inplace=True)


In [None]:
# Inspect the first 30 rows with the new bowling_strike_rate column.
df.head(30)

Unnamed: 0,match_id,ball,batting_team,bowling_team,striker,bowler,runs_off_bat,wicket_type,player_dismissed,wickets,cumulative_wickets,cumulative_runs,balls_faced,batsman_strike_rate,cumulative_wickets_bowler,balls_bowled,bowling_strike_rate
0,1389389,0.1,India,Australia,RD Gaikwad,SH Johnson,4,,,0,0,4,1,400.0,0,1,1.0
1,1389389,0.2,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,4,2,200.0,0,2,2.0
2,1389389,0.3,India,Australia,RD Gaikwad,SH Johnson,4,,,0,0,8,3,266.666667,0,3,3.0
3,1389389,0.4,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,4,200.0,0,4,4.0
4,1389389,0.5,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,5,160.0,0,5,5.0
5,1389389,0.6,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,6,133.333333,0,6,6.0
6,1389389,0.7,India,Australia,RD Gaikwad,SH Johnson,0,,,0,0,8,7,114.285714,0,7,7.0
7,1389389,1.1,India,Australia,Shubman Gill,JR Hazlewood,0,,,0,0,0,1,0.0,0,1,1.0
8,1389389,1.2,India,Australia,Shubman Gill,JR Hazlewood,1,,,0,0,1,2,50.0,0,2,2.0
9,1389389,1.3,India,Australia,RD Gaikwad,JR Hazlewood,0,,,0,0,8,8,100.0,0,3,3.0


In [None]:
# Independent per-role frames: batting columns for strikers, bowling columns for bowlers.
batting_columns = ['batting_team', 'striker', 'batsman_strike_rate']
bowling_columns = ['bowling_team', 'bowler', 'bowling_strike_rate']
batsmen_df = df.loc[:, batting_columns].copy()
bowlers_df = df.loc[:, bowling_columns].copy()

In [None]:
# All batting rows where Shubman Gill is the striker.
batsmen_df.loc[batsmen_df['striker'].eq('Shubman Gill')]

Unnamed: 0,batting_team,striker,batsman_strike_rate
7,India,Shubman Gill,0.000000
8,India,Shubman Gill,50.000000
13,India,Shubman Gill,33.333333
14,India,Shubman Gill,25.000000
15,India,Shubman Gill,20.000000
...,...,...,...
296655,India,Shubman Gill,100.209096
296656,India,Shubman Gill,100.156740
296657,India,Shubman Gill,100.104439
296658,India,Shubman Gill,100.052192


In [None]:

# Mean of the running batting strike rate per (team, striker), broadcast onto every row
batting_groups = batsmen_df.groupby(['batting_team', 'striker'])
batsmen_df['avg_batting_strike_rate'] = batting_groups['batsman_strike_rate'].transform('mean')

# Mean of the running bowling strike rate per (team, bowler), broadcast onto every row
bowling_groups = bowlers_df.groupby(['bowling_team', 'bowler'])
bowlers_df['avg_bowling_strike_rate'] = bowling_groups['bowling_strike_rate'].transform('mean')



In [None]:
# Collapse to one row per (team, player), keeping the first row seen for each pair.
batsmen_df = batsmen_df.drop_duplicates(subset=['batting_team', 'striker'], keep='first')
bowlers_df = bowlers_df.drop_duplicates(subset=['bowling_team', 'bowler'], keep='first')

In [None]:
# Shubman Gill's row(s) after collapsing to one row per (team, player).
batsmen_df.loc[batsmen_df['striker'].eq('Shubman Gill')]

Unnamed: 0,batting_team,striker,batsman_strike_rate,avg_batting_strike_rate
7,India,Shubman Gill,0.0,99.836175


In [None]:
# Rank players inside their own team:
#   batsmen — highest average batting strike rate gets rank 1
#   bowlers — lowest average bowling strike rate gets rank 1
batting_avg_by_team = batsmen_df.groupby(['batting_team'])['avg_batting_strike_rate']
bowling_avg_by_team = bowlers_df.groupby(['bowling_team'])['avg_bowling_strike_rate']
batsmen_df['batsman_rank'] = batting_avg_by_team.rank(ascending=False)
bowlers_df['bowler_rank'] = bowling_avg_by_team.rank(ascending=True)



In [None]:
# Shubman Gill's row(s), now including the within-team batting rank.
batsmen_df.loc[batsmen_df['striker'].eq('Shubman Gill')]

Unnamed: 0,batting_team,striker,batsman_strike_rate,avg_batting_strike_rate,batsman_rank
7,India,Shubman Gill,0.0,99.836175,14.0


In [None]:
# Print the ranked batsmen of India, then the full bowlers table
india_batsmen = batsmen_df[batsmen_df['batting_team'] == 'India']

print("Batsmen DataFrame:")
print(india_batsmen)

print("\nBowlers DataFrame:")
print(bowlers_df)

Batsmen DataFrame:
        batting_team       striker  batsman_strike_rate  \
0              India    RD Gaikwad                400.0   
7              India  Shubman Gill                  0.0   
23             India       SS Iyer                100.0   
189            India      KL Rahul                  0.0   
213            India  Ishan Kishan                600.0   
...              ...           ...                  ...   
1169861        India   RS Gavaskar                  0.0   
1194915        India     HK Badani                  0.0   
1201747        India     SB Bangar                  0.0   
1208745        India   SV Bahutule                100.0   
1232666        India     J Srinath                  0.0   

         avg_batting_strike_rate  batsman_rank  
0                      82.307363          38.0  
7                      99.836175          14.0  
23                     96.858235          18.0  
189                    88.817308          27.0  
213                    97.6

In [None]:
# First 30 rows of the batsmen table.
batsmen_df.head(30)

Unnamed: 0,batting_team,striker,batsman_strike_rate,avg_batting_strike_rate,batsman_rank
0,India,RD Gaikwad,400.0,82.307363,73.5
1,India,RD Gaikwad,200.0,82.307363,73.5
2,India,RD Gaikwad,266.666667,82.307363,73.5
3,India,RD Gaikwad,200.0,82.307363,73.5
4,India,RD Gaikwad,160.0,82.307363,73.5
5,India,RD Gaikwad,133.333333,82.307363,73.5
6,India,RD Gaikwad,114.285714,82.307363,73.5
7,India,Shubman Gill,0.0,99.836175,959.0
8,India,Shubman Gill,50.0,99.836175,959.0
9,India,RD Gaikwad,100.0,82.307363,73.5


In [None]:
# First 30 rows of the bowlers table.
bowlers_df.head(30)

Unnamed: 0,bowling_team,bowler,bowling_strike_rate,avg_bowling_strike_rate,bowler_rank
0,Australia,SH Johnson,1.0,25.5,25.5
1,Australia,SH Johnson,2.0,25.5,25.5
2,Australia,SH Johnson,3.0,25.5,25.5
3,Australia,SH Johnson,4.0,25.5,25.5
4,Australia,SH Johnson,5.0,25.5,25.5
5,Australia,SH Johnson,6.0,25.5,25.5
6,Australia,SH Johnson,7.0,25.5,25.5
7,Australia,JR Hazlewood,1.0,37.207895,1967.0
8,Australia,JR Hazlewood,2.0,37.207895,1967.0
9,Australia,JR Hazlewood,3.0,37.207895,1967.0


In [None]:
# RG Sharma's row(s) in the batsmen table.
batsmen_df.loc[batsmen_df['striker'].eq('RG Sharma')]

Unnamed: 0,batting_team,striker,batsman_strike_rate,avg_batting_strike_rate,batsman_rank
4478,India,RG Sharma,0.0,94.746554,20.0


## Declaring squad

In [None]:
# The 15-player squad to choose from
squad_players = [
    'RG Sharma', 'HH Pandya', 'Shubman Gill', 'V Kohli',
    'SS Iyer', 'Ishan Kishan', 'KL Rahul', 'SA Yadav',
    'RA Jadeja', 'AR Patel', 'SN Thakur', 'JJ Bumrah',
    'Mohammed Shami', 'Mohammed Siraj', 'Kuldeep Yadav'
]

# Rows of batsmen_df whose striker name appears in the squad (matched on name only)
is_squad_batsman = batsmen_df['striker'].isin(squad_players)
squad_batsmen = batsmen_df.loc[is_squad_batsman]

# Best (lowest) batting rank first; keep the first six
top_squad_batsmen = squad_batsmen.sort_values(by='batsman_rank').iloc[:6]

# Show name, average strike rate and rank for the six selected batsmen
batting_report_columns = ['striker', 'avg_batting_strike_rate', 'batsman_rank']
print("Top 6 Batsmen from the Squad:")
print(top_squad_batsmen[batting_report_columns])

Top 6 Batsmen from the Squad:
           striker  avg_batting_strike_rate  batsman_rank
249       SA Yadav               114.193459           3.0
7     Shubman Gill                99.836175          14.0
213   Ishan Kishan                97.624412          16.0
6563     HH Pandya                97.303743          17.0
23         SS Iyer                96.858235          18.0
6446       V Kohli                96.032862          19.0


In [None]:
# The 15-player squad to choose from
squad_players = [
    'RG Sharma', 'HH Pandya', 'Shubman Gill', 'V Kohli',
    'SS Iyer', 'Ishan Kishan', 'KL Rahul', 'SA Yadav',
    'RA Jadeja', 'AR Patel', 'SN Thakur', 'JJ Bumrah',
    'Mohammed Shami', 'Mohammed Siraj', 'Kuldeep Yadav'
]

# Rows of bowlers_df whose bowler name appears in the squad (matched on name only)
is_squad_bowler = bowlers_df['bowler'].isin(squad_players)
squad_bowlers = bowlers_df.loc[is_squad_bowler]

# Best (lowest) bowling rank first; keep the first five
top_squad_bowlers = squad_bowlers.sort_values(by='bowler_rank').iloc[:5]

# Show name, average strike rate and rank for the five selected bowlers
bowling_report_columns = ['bowler', 'avg_bowling_strike_rate', 'bowler_rank']
print("Top 5 bowlers from the Squad:")
print(top_squad_bowlers[bowling_report_columns])

Top 5 bowlers from the Squad:
               bowler  avg_bowling_strike_rate  bowler_rank
2971   Mohammed Siraj                18.719078         12.0
76324         SS Iyer                20.000000         16.0
310    Mohammed Shami                26.153925         25.0
366         SN Thakur                29.368974         28.0
3051    Kuldeep Yadav                29.369847         29.0


In [None]:
# JJ Bumrah's row(s) in the bowlers table.
bowlers_df.loc[bowlers_df['bowler'].eq('JJ Bumrah')]

Unnamed: 0,bowling_team,bowler,bowling_strike_rate,avg_bowling_strike_rate,bowler_rank
1649,India,JJ Bumrah,1.0,33.482115,41.0


In [None]:
import warnings

# Ignore all warnings
# This installs a process-wide "ignore" filter, so every warning category is silenced
# for all cells executed after this one.
# NOTE(review): this also hides pandas warnings about assigning to slices — consider
# fixing their causes instead of muting them.
warnings.filterwarnings("ignore")

## Creating model and predicting playing 11

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Rank the Indian squad with one linear model per role.
# Requires the ball-by-ball frame `df` to already carry the running
# 'batsman_strike_rate' and 'bowling_strike_rate' columns.

# Separate data for batsmen and bowlers
batsmen_df = df[['batting_team', 'striker', 'batsman_strike_rate']].copy()
bowlers_df = df[['bowling_team', 'bowler', 'bowling_strike_rate']].copy()

# Mean of the running strike rate per (team, player), broadcast onto every row
batsmen_df['avg_batting_strike_rate'] = batsmen_df.groupby(['batting_team', 'striker'])['batsman_strike_rate'].transform('mean')
bowlers_df['avg_bowling_strike_rate'] = bowlers_df.groupby(['bowling_team', 'bowler'])['bowling_strike_rate'].transform('mean')

# One row per (team, player)
batsmen_df = batsmen_df.drop_duplicates(subset=['batting_team', 'striker'])
bowlers_df = bowlers_df.drop_duplicates(subset=['bowling_team', 'bowler'])

# Single input feature for each model
batsmen_features = ['avg_batting_strike_rate']
bowlers_features = ['avg_bowling_strike_rate']

# Squad to choose the playing 11 from
squad_players = [
    'RG Sharma', 'Shubman Gill', 'V Kohli',
    'SS Iyer', 'Ishan Kishan', 'KL Rahul', 'SA Yadav',
    'RA Jadeja', 'SN Thakur', 'JJ Bumrah',
    'Mohammed Shami', 'Mohammed Siraj', 'Kuldeep Yadav','R Ashwin','M Prasidh Krishna'
]

# .copy() gives independent frames, so the prediction and rank columns added below are
# written to them directly instead of to slices of batsmen_df / bowlers_df
# (assigning onto such slices raises SettingWithCopyWarning).
# NOTE(review): players are matched by name only, not by team — confirm no squad name
# also appears under another team.
squad_batsmen = batsmen_df[batsmen_df['striker'].isin(squad_players)].copy()
squad_bowlers = bowlers_df[bowlers_df['bowler'].isin(squad_players)].copy()

# Train separate models for batsmen and bowlers
batsmen_model = LinearRegression()
bowlers_model = LinearRegression()

# Batsmen: target is the within-team rank of the average strike rate (highest = rank 1)
X_batsmen = squad_batsmen[batsmen_features].values.reshape(-1, 1)
y_batsmen = squad_batsmen.groupby(['batting_team'])['avg_batting_strike_rate'].rank(ascending=False)
batsmen_model.fit(X_batsmen, y_batsmen)

# Bowlers: target is the within-team rank of the average strike rate (lowest = rank 1)
X_bowlers = squad_bowlers[bowlers_features].values.reshape(-1, 1)
y_bowlers = squad_bowlers.groupby(['bowling_team'])['avg_bowling_strike_rate'].rank(ascending=True)
bowlers_model.fit(X_bowlers, y_bowlers)

# Predict on the same rows the models were fitted on (the feature matrices are reused
# rather than rebuilt; the values are identical)
squad_batsmen['predicted_batsmen_rank'] = batsmen_model.predict(X_batsmen)
squad_bowlers['predicted_bowlers_rank'] = bowlers_model.predict(X_bowlers)

# Order squad players by predicted rank (smallest prediction = rank 1)
squad_batsmen['batsmen_rank'] = squad_batsmen['predicted_batsmen_rank'].rank()
squad_bowlers['bowlers_rank'] = squad_bowlers['predicted_bowlers_rank'].rank()

# Select top 6 batsmen and top 6 bowlers
top_batsmen = squad_batsmen.sort_values(by='batsmen_rank').head(6)
top_bowlers = squad_bowlers.sort_values(by='bowlers_rank').head(6)

# Display the results
print("Top 6 Batsmen from the Indian Squad:")
print(top_batsmen[['striker', 'predicted_batsmen_rank', 'batsmen_rank']])

print("Top 6 Bowlers from the Indian Squad:")
print(top_bowlers[['bowler', 'predicted_bowlers_rank', 'bowlers_rank']])


Top 6 Batsmen from the Indian Squad:
           striker  predicted_batsmen_rank  batsmen_rank
249       SA Yadav                3.119099           1.0
7     Shubman Gill                5.041043           2.0
213   Ishan Kishan                5.337122           3.0
23         SS Iyer                5.439687           4.0
6446       V Kohli                5.550176           5.0
4478     RG Sharma                5.722368           6.0
Top 6 Bowlers from the Indian Squad:
                  bowler  predicted_bowlers_rank  bowlers_rank
2971      Mohammed Siraj                3.354928           1.0
76324            SS Iyer                3.510340           2.0
310       Mohammed Shami                4.256983           3.0
366            SN Thakur                4.647059           4.0
3051       Kuldeep Yadav                4.647165           5.0
316    M Prasidh Krishna                4.918943           6.0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Rank the South African squad with one linear model per role.
# Requires the ball-by-ball frame `df` to already carry the running
# 'batsman_strike_rate' and 'bowling_strike_rate' columns.

# Separate data for batsmen and bowlers
batsmen_df = df[['batting_team', 'striker', 'batsman_strike_rate']].copy()
bowlers_df = df[['bowling_team', 'bowler', 'bowling_strike_rate']].copy()

# Mean of the running strike rate per (team, player), broadcast onto every row
batsmen_df['avg_batting_strike_rate'] = batsmen_df.groupby(['batting_team', 'striker'])['batsman_strike_rate'].transform('mean')
bowlers_df['avg_bowling_strike_rate'] = bowlers_df.groupby(['bowling_team', 'bowler'])['bowling_strike_rate'].transform('mean')

# One row per (team, player)
batsmen_df = batsmen_df.drop_duplicates(subset=['batting_team', 'striker'])
bowlers_df = bowlers_df.drop_duplicates(subset=['bowling_team', 'bowler'])

# Single input feature for each model
batsmen_features = ['avg_batting_strike_rate']
bowlers_features = ['avg_bowling_strike_rate']

# Squad to choose the playing 11 from
squad_players = [
    'T Bavuma', 'Q de Kock', 'RR Hendricks',
    'H Klaasen', 'AK Markram', 'DA Miller', 'HE van der Dussen',
    'M Jansen', 'AL Phehlukwayo', 'G Coetzee',
    'KA Maharaj', 'L Ngidi', 'K Rabada','T Shamsi','LB Williams'
]

# .copy() gives independent frames, so the prediction and rank columns added below are
# written to them directly instead of to slices of batsmen_df / bowlers_df
# (assigning onto such slices raises SettingWithCopyWarning).
# NOTE(review): players are matched by name only, not by team — confirm no squad name
# also appears under another team.
squad_batsmen = batsmen_df[batsmen_df['striker'].isin(squad_players)].copy()
squad_bowlers = bowlers_df[bowlers_df['bowler'].isin(squad_players)].copy()

# Train separate models for batsmen and bowlers
batsmen_model = LinearRegression()
bowlers_model = LinearRegression()

# Batsmen: target is the within-team rank of the average strike rate (highest = rank 1)
X_batsmen = squad_batsmen[batsmen_features].values.reshape(-1, 1)
y_batsmen = squad_batsmen.groupby(['batting_team'])['avg_batting_strike_rate'].rank(ascending=False)
batsmen_model.fit(X_batsmen, y_batsmen)

# Bowlers: target is the within-team rank of the average strike rate (lowest = rank 1)
X_bowlers = squad_bowlers[bowlers_features].values.reshape(-1, 1)
y_bowlers = squad_bowlers.groupby(['bowling_team'])['avg_bowling_strike_rate'].rank(ascending=True)
bowlers_model.fit(X_bowlers, y_bowlers)

# Predict on the same rows the models were fitted on (the feature matrices are reused
# rather than rebuilt; the values are identical)
squad_batsmen['predicted_batsmen_rank'] = batsmen_model.predict(X_batsmen)
squad_bowlers['predicted_bowlers_rank'] = bowlers_model.predict(X_bowlers)

# Order squad players by predicted rank (smallest prediction = rank 1)
squad_batsmen['batsmen_rank'] = squad_batsmen['predicted_batsmen_rank'].rank()
squad_bowlers['bowlers_rank'] = squad_bowlers['predicted_bowlers_rank'].rank()

# Select top 6 batsmen and top 8 bowlers
top_batsmen = squad_batsmen.sort_values(by='batsmen_rank').head(6)
top_bowlers = squad_bowlers.sort_values(by='bowlers_rank').head(8)

# Display the results
print("Top 6 Batsmen from the South African Squad:")
print(top_batsmen[['striker', 'predicted_batsmen_rank', 'batsmen_rank']])

print("Top 8 Bowlers from the South African Squad:")
print(top_bowlers[['bowler', 'predicted_bowlers_rank', 'bowlers_rank']])


Top 6 Batsmen from the South African Squad:
         striker  predicted_batsmen_rank  batsmen_rank
2699    M Jansen                1.635375           1.0
2560   H Klaasen                1.810755           2.0
2519  AK Markram                4.825255           3.0
2590   DA Miller                4.873963           4.0
2448   Q de Kock                5.997006           5.0
2454    T Bavuma                6.430455           6.0
Top 8 Bowlers from the South African Squad:
                   bowler  predicted_bowlers_rank  bowlers_rank
146349  HE van der Dussen               -0.578298           1.0
182170          H Klaasen                3.030550           2.0
146145           T Bavuma                4.083131           3.0
298951       RR Hendricks                4.834975           4.0
182056        LB Williams                6.639399           5.0
2774             M Jansen                6.864020           6.0
6234             T Shamsi                7.213380           7.0
2756           