In [1]:
import pandas as pd
import numpy as np
import json
import requests

In [2]:
batting_data = {}
bowling_data = {}

## Bowling Data

In [3]:
for year in range(2008,2024):
    bowling_data[year] = pd.read_csv(f'runs_wickets_per_season/{year}_bowlers.csv')

In [4]:
for df in list(bowling_data.values())[:-2]:
    df.drop(columns=['Hattricks'],inplace=True)

In [5]:
for df in list(bowling_data.values())[-2:]:
    df.drop(columns=['ClientPlayerID', 'InningsRuns'],inplace=True)

In [6]:
rows = bowling_data[2021].shape[0]
empty_cols = []
for column in bowling_data[2021].columns:
    if column not in ['RightHandedBat','Nationality']:
        empty_rows = bowling_data[2021][column].isnull().sum()
        if empty_rows == rows:
            empty_cols.append(column)
    else:
        continue

In [7]:
empty_cols

['CompetitionID',
 'TeamID',
 'LegalBallsBowled',
 'DotBallPercent',
 'ScoringBallsBowled',
 'BoundaryPercentage',
 'BoundaryFrequency',
 'Ones',
 'Twos',
 'Threes',
 'Wides',
 'NoBalls',
 'Byes',
 'LegBye',
 'InningsWickets',
 'MatchRuns',
 'MatchWickets',
 'MaidenWickets',
 'TenWickets']

In [8]:
for df in bowling_data.values():
    df.drop(columns=empty_cols,inplace=True)

In [9]:
for df in bowling_data.values():
    df.OversBowled = df.OversBowled.astype(str)

In [10]:
for df in bowling_data.values():
    df.drop(columns=['BowlerID'],inplace=True)

In [11]:
for df in bowling_data.values():
    df.rename(columns={'StrikeRate':'BowlingStrikeRate','Sixes':'SixesConceded','Fours':'FoursConceded'},inplace=True)

In [12]:
for df in list(bowling_data.values())[:-2]:
    df.RightHandedBat = False

In [13]:
for df in list(bowling_data.values())[:-1]:
    df.Nationality = df.Nationality.astype(str)
    df.Nationality = df.Nationality.apply(lambda x:None if x == 'nan' else x)

In [14]:
for df in bowling_data.values():
    df.rename(columns={'BowlerName':'Name'},inplace=True)

In [21]:
bowling_data[2022].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               103 non-null    object 
 1   RightHandedBat     103 non-null    bool   
 2   Nationality        0 non-null      object 
 3   TeamCode           103 non-null    object 
 4   TeamName           103 non-null    object 
 5   Matches            103 non-null    int64  
 6   Innings            103 non-null    int64  
 7   TotalRunsConceded  103 non-null    int64  
 8   DotBallsBowled     103 non-null    int64  
 9   BowlingAverage     103 non-null    float64
 10  BowlingStrikeRate  103 non-null    float64
 11  BowlingSR          103 non-null    float64
 12  EconomyRate        103 non-null    float64
 13  OversBowled        103 non-null    object 
 14  FoursConceded      103 non-null    int64  
 15  SixesConceded      103 non-null    int64  
 16  Wickets            103 non

In [23]:
for df in bowling_data.values():
    df.drop(columns=['BowlingSR'],inplace=True)

In [25]:
def number_of_balls(overs):
    if '.' in overs:
        over,ball = tuple(map(int,overs.split('.')))
    else:
        over,ball = int(overs),0
    return 6*over + ball

In [26]:
for df in bowling_data.values():
    df['BallsBowled'] = df['OversBowled'].apply(number_of_balls)

In [27]:
bowling_data[2022].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               103 non-null    object 
 1   RightHandedBat     103 non-null    bool   
 2   Nationality        0 non-null      object 
 3   TeamCode           103 non-null    object 
 4   TeamName           103 non-null    object 
 5   Matches            103 non-null    int64  
 6   Innings            103 non-null    int64  
 7   TotalRunsConceded  103 non-null    int64  
 8   DotBallsBowled     103 non-null    int64  
 9   BowlingAverage     103 non-null    float64
 10  BowlingStrikeRate  103 non-null    float64
 11  EconomyRate        103 non-null    float64
 12  OversBowled        103 non-null    object 
 13  FoursConceded      103 non-null    int64  
 14  SixesConceded      103 non-null    int64  
 15  Wickets            103 non-null    int64  
 16  BBIW               103 non

## Batting Data

In [28]:
for year in range(2008,2024):
    batting_data[year] = pd.read_csv(f'runs_seasonwise/batsmen_{year}.csv')

In [35]:
# for df in batting_data.values():
#     df.rename(columns={'StrikerName':'Name'},inplace=True)

In [38]:
# for df in batting_data.values():
#     df.drop(columns=['PlayerId'],inplace=True)

In [39]:
for year in [2008,2021,2022,2023]:
    print(year)
    batting_data[year].info()

2008
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            150 non-null    object 
 1   Matches         150 non-null    int64  
 2   PlayerDOB       150 non-null    object 
 3   TeamCode        150 non-null    object 
 4   TeamName        150 non-null    object 
 5   Innings         150 non-null    int64  
 6   TotalRuns       150 non-null    int64  
 7   Balls           150 non-null    int64  
 8   StrikeRate      150 non-null    float64
 9   Fours           150 non-null    int64  
 10  Sixes           150 non-null    int64  
 11  Outs            150 non-null    int64  
 12  NotOuts         150 non-null    int64  
 13  FiftyPlusRuns   150 non-null    int64  
 14  Centuries       150 non-null    int64  
 15  HighestScore    150 non-null    object 
 16  BattingAverage  150 non-null    float64
 17  Nation          150 non-null  

In [105]:
batting = batting_data[2008]
bowling = bowling_data[2008]

In [106]:
first = pd.merge(batting,bowling,on=['Name'],how='left')
first

Unnamed: 0,Name,Matches_x,PlayerDOB,TeamCode_x,TeamName_x,Innings_x,TotalRuns,Balls,StrikeRate,Fours,...,OversBowled,FoursConceded,SixesConceded,Wickets,BBIW,BBMW,Maidens,FourWickets,FiveWickets,BallsBowled
0,Shaun Marsh,11,9-7-1983,PBKS,Punjab Kings,11,616,441,139.68,59,...,,,,,,,,,,
1,Gautam Gambhir,14,14-10-1981,DC,Delhi Capitals,14,534,379,140.89,68,...,,,,,,,,,,
2,Sanath Jayasuriya,14,30-6-1969,MI,Mumbai Indians,14,518,309,167.63,58,...,21,58.0,31.0,4.0,3/14,3/14,1.0,0.0,0.0,126.0
3,Shane Watson,15,17-6-1981,RR,Rajasthan Royals,15,472,311,151.76,47,...,54,47.0,19.0,17.0,4/29,4/29,0.0,0.0,0.0,324.0
4,Graeme Smith,11,1-2-1981,RR,Rajasthan Royals,11,441,362,121.82,54,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,Paidikalva Vijaykumar,9,20-10-1986,DEC,Deccan Chargers,2,1,1,100.00,0,...,25,0.0,0.0,4.0,1/17,1/17,0.0,0.0,0.0,150.0
146,Abdur Razzak,1,15-6-1982,RCB,Royal Challengers Bangalore,1,0,2,0.00,0,...,,,,,,,,,,
147,Abhinav Mukund,2,6-1-1990,CSK,Chennai Super Kings,1,0,1,0.00,0,...,,,,,,,,,,
148,Lakshmipathy Balaji,9,27-9-1981,CSK,Chennai Super Kings,1,0,4,0.00,0,...,33,0.0,0.0,11.0,5/24,5/24,0.0,0.0,1.0,198.0


In [107]:
second = pd.concat([first,bowling],ignore_index=True)
second

  output = repr(obj)
  return method()


Unnamed: 0,Name,Matches_x,PlayerDOB,TeamCode_x,TeamName_x,Innings_x,TotalRuns,Balls,StrikeRate,Fours,...,Maidens,FourWickets,FiveWickets,BallsBowled,RightHandedBat,Nationality,TeamCode,TeamName,Matches,Innings
0,Shaun Marsh,11.0,9-7-1983,PBKS,Punjab Kings,11.0,616.0,441.0,139.68,59.0,...,,,,,,,,,,
1,Gautam Gambhir,14.0,14-10-1981,DC,Delhi Capitals,14.0,534.0,379.0,140.89,68.0,...,,,,,,,,,,
2,Sanath Jayasuriya,14.0,30-6-1969,MI,Mumbai Indians,14.0,518.0,309.0,167.63,58.0,...,1.0,0.0,0.0,126.0,,,,,,
3,Shane Watson,15.0,17-6-1981,RR,Rajasthan Royals,15.0,472.0,311.0,151.76,47.0,...,0.0,0.0,0.0,324.0,,,,,,
4,Graeme Smith,11.0,1-2-1981,RR,Rajasthan Royals,11.0,441.0,362.0,121.82,54.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232,Dinesh Salunkhe,,,,,,,,,,...,0.0,0.0,0.0,48.0,False,,RR,Rajasthan Royals,6.0,5.0
233,Dwaraka Ravi Teja,,,,,,,,,,...,0.0,0.0,0.0,12.0,False,,DEC,Deccan Chargers,7.0,1.0
234,Mohammad Hafeez,,,,,,,,,,...,0.0,0.0,0.0,30.0,False,,KKR,Kolkata Knight Riders,8.0,2.0
235,Abhishek Nayar,,,,,,,,,,...,0.0,0.0,0.0,42.0,False,,MI,Mumbai Indians,14.0,5.0


In [108]:
final = second.drop_duplicates(subset=['Name'])
final

Unnamed: 0,Name,Matches_x,PlayerDOB,TeamCode_x,TeamName_x,Innings_x,TotalRuns,Balls,StrikeRate,Fours,...,Maidens,FourWickets,FiveWickets,BallsBowled,RightHandedBat,Nationality,TeamCode,TeamName,Matches,Innings
0,Shaun Marsh,11.0,9-7-1983,PBKS,Punjab Kings,11.0,616.0,441.0,139.68,59.0,...,,,,,,,,,,
1,Gautam Gambhir,14.0,14-10-1981,DC,Delhi Capitals,14.0,534.0,379.0,140.89,68.0,...,,,,,,,,,,
2,Sanath Jayasuriya,14.0,30-6-1969,MI,Mumbai Indians,14.0,518.0,309.0,167.63,58.0,...,1.0,0.0,0.0,126.0,,,,,,
3,Shane Watson,15.0,17-6-1981,RR,Rajasthan Royals,15.0,472.0,311.0,151.76,47.0,...,0.0,0.0,0.0,324.0,,,,,,
4,Graeme Smith,11.0,1-2-1981,RR,Rajasthan Royals,11.0,441.0,362.0,121.82,54.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,Gajendra Singh,,,,,,,,,,...,0.0,0.0,0.0,84.0,False,,RR,Rajasthan Royals,4.0,4.0
213,Daniel Vettori,,,,,,,,,,...,0.0,0.0,0.0,48.0,False,,DC,Delhi Capitals,2.0,2.0
225,Brett Geeves,,,,,,,,,,...,0.0,0.0,0.0,48.0,False,,DC,Delhi Capitals,2.0,2.0
226,Andre Nel,,,,,,,,,,...,0.0,0.0,0.0,18.0,False,,MI,Mumbai Indians,1.0,1.0


In [65]:
final[['Name','Innings','Innings_x','Innings_y']].tail(20)

Unnamed: 0,Name,Innings,Innings_x,Innings_y
138,Dale Steyn,,2.0,10.0
139,Dhawal Kulkarni,,1.0,10.0
140,Ashok Dinda,,3.0,12.0
141,Pragyan Ojha,,4.0,12.0
142,Iqbal Abdulla,,1.0,
143,Srikkanth Anirudha,,1.0,
144,PM Sarvesh Kumar,,1.0,2.0
145,Paidikalva Vijaykumar,,2.0,9.0
146,Abdur Razzak,,1.0,
147,Abhinav Mukund,,1.0,


In [66]:
sorted(final.columns)

['BBIW',
 'BBMW',
 'Balls',
 'BallsBowled',
 'BattingAverage',
 'BattingStyle',
 'BowlingAverage',
 'BowlingStrikeRate',
 'Centuries',
 'DotBallsBowled',
 'EconomyRate',
 'FiftyPlusRuns',
 'FiveWickets',
 'FourWickets',
 'Fours',
 'FoursConceded',
 'HighestScore',
 'Innings',
 'Innings_x',
 'Innings_y',
 'Maidens',
 'Matches',
 'Matches_x',
 'Matches_y',
 'Name',
 'Nation',
 'Nationality',
 'Nationality_x',
 'Nationality_y',
 'NotOuts',
 'Outs',
 'OversBowled',
 'PlayerDOB',
 'RightHandedBat',
 'RightHandedBat_x',
 'RightHandedBat_y',
 'Sixes',
 'SixesConceded',
 'StrikeRate',
 'TeamCode',
 'TeamCode_x',
 'TeamCode_y',
 'TeamName',
 'TeamName_x',
 'TeamName_y',
 'TotalRuns',
 'TotalRunsConceded',
 'Wickets']

In [67]:
cols = []
for col in final.columns:
    if '_y' in col:
        cols.append(col)

In [68]:
cols

['RightHandedBat_y',
 'Nationality_y',
 'TeamCode_y',
 'TeamName_y',
 'Matches_y',
 'Innings_y']

In [69]:
final.drop(columns=cols,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final.drop(columns=cols,inplace=True)


In [70]:
sorted(final.columns)

['BBIW',
 'BBMW',
 'Balls',
 'BallsBowled',
 'BattingAverage',
 'BattingStyle',
 'BowlingAverage',
 'BowlingStrikeRate',
 'Centuries',
 'DotBallsBowled',
 'EconomyRate',
 'FiftyPlusRuns',
 'FiveWickets',
 'FourWickets',
 'Fours',
 'FoursConceded',
 'HighestScore',
 'Innings',
 'Innings_x',
 'Maidens',
 'Matches',
 'Matches_x',
 'Name',
 'Nation',
 'Nationality',
 'Nationality_x',
 'NotOuts',
 'Outs',
 'OversBowled',
 'PlayerDOB',
 'RightHandedBat',
 'RightHandedBat_x',
 'Sixes',
 'SixesConceded',
 'StrikeRate',
 'TeamCode',
 'TeamCode_x',
 'TeamName',
 'TeamName_x',
 'TotalRuns',
 'TotalRunsConceded',
 'Wickets']

In [80]:
final['Innings_x'].dtype

dtype('float64')

In [81]:
final['Matches_x'].dtype

dtype('float64')

In [82]:
final['Nationality_x'].dtype

dtype('O')

In [83]:
final['RightHandedBat_x'].dtype

dtype('O')

In [84]:
final['TeamCode_x'].dtype

dtype('O')

In [85]:
final['TeamName_x'].dtype

dtype('O')

In [86]:
final[['Innings','Innings_x','Matches','Matches_x','Nationality','Nationality_x','RightHandedBat','RightHandedBat_x','TeamCode','TeamCode_x','TeamName','TeamName_x']].tail(20)

Unnamed: 0,Innings,Innings_x,Matches,Matches_x,Nationality,Nationality_x,RightHandedBat,RightHandedBat_x,TeamCode,TeamCode_x,TeamName,TeamName_x
138,,2.0,,10.0,,Overseas,,True,,RCB,,Royal Challengers Bangalore
139,,1.0,,10.0,,Indian,,True,,MI,,Mumbai Indians
140,,3.0,,13.0,,Indian,,True,,KKR,,Kolkata Knight Riders
141,,4.0,,13.0,,Indian,,False,,DEC,,Deccan Chargers
142,,1.0,,1.0,,Indian,,False,,KKR,,Kolkata Knight Riders
143,,1.0,,1.0,,Indian,,True,,CSK,,Chennai Super Kings
144,,1.0,,2.0,,Indian,,True,,DEC,,Deccan Chargers
145,,2.0,,9.0,,Indian,,True,,DEC,,Deccan Chargers
146,,1.0,,1.0,,Overseas,,False,,RCB,,Royal Challengers Bangalore
147,,1.0,,2.0,,Indian,,False,,CSK,,Chennai Super Kings


In [109]:
final['Nationality'].fillna(' ')

0       
1       
2       
3       
4       
      ..
210     
213     
225     
226     
228     
Name: Nationality, Length: 158, dtype: object

In [87]:
final.drop(columns=['RightHandedBat'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final.drop(columns=['RightHandedBat'],inplace=True)


In [89]:
final.rename(columns={'RightHandedBat_x':'RightHandedBat'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final.rename(columns={'RightHandedBat_x':'RightHandedBat'},inplace=True)


In [90]:
sorted(final.columns)

['BBIW',
 'BBMW',
 'Balls',
 'BallsBowled',
 'BattingAverage',
 'BattingStyle',
 'BowlingAverage',
 'BowlingStrikeRate',
 'Centuries',
 'DotBallsBowled',
 'EconomyRate',
 'FiftyPlusRuns',
 'FiveWickets',
 'FourWickets',
 'Fours',
 'FoursConceded',
 'HighestScore',
 'Innings',
 'Innings_x',
 'Maidens',
 'Matches',
 'Matches_x',
 'Name',
 'Nation',
 'Nationality',
 'Nationality_x',
 'NotOuts',
 'Outs',
 'OversBowled',
 'PlayerDOB',
 'RightHandedBat',
 'Sixes',
 'SixesConceded',
 'StrikeRate',
 'TeamCode',
 'TeamCode_x',
 'TeamName',
 'TeamName_x',
 'TotalRuns',
 'TotalRunsConceded',
 'Wickets']

In [93]:
team_code = (final['TeamCode'].fillna(' ') + final['TeamCode_x'].fillna(' ')).apply(str.strip)

In [95]:
final.drop(columns=['TeamCode','TeamCode_x'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final.drop(columns=['TeamCode','TeamCode_x'],inplace=True)


In [96]:
final['TeamCode'] = team_code

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['TeamCode'] = team_code


In [97]:
final.columns

Index(['Name', 'Matches_x', 'PlayerDOB', 'TeamName_x', 'Innings_x',
       'TotalRuns', 'Balls', 'StrikeRate', 'Fours', 'Sixes', 'Outs', 'NotOuts',
       'FiftyPlusRuns', 'Centuries', 'HighestScore', 'BattingAverage',
       'Nation', 'BattingStyle', 'Nationality_x', 'RightHandedBat',
       'TotalRunsConceded', 'DotBallsBowled', 'BowlingAverage',
       'BowlingStrikeRate', 'EconomyRate', 'OversBowled', 'FoursConceded',
       'SixesConceded', 'Wickets', 'BBIW', 'BBMW', 'Maidens', 'FourWickets',
       'FiveWickets', 'BallsBowled', 'Nationality', 'TeamName', 'Matches',
       'Innings', 'TeamCode'],
      dtype='object')

In [99]:
innings = final['Innings'].fillna(0) + final['Innings_x'].fillna(0)

In [100]:
final.drop(columns=['Innings','Innings_x'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final.drop(columns=['Innings','Innings_x'],inplace=True)


In [101]:
final['Innings'] = innings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['Innings'] = innings


In [102]:
final.columns

Index(['Name', 'Matches_x', 'PlayerDOB', 'TeamName_x', 'TotalRuns', 'Balls',
       'StrikeRate', 'Fours', 'Sixes', 'Outs', 'NotOuts', 'FiftyPlusRuns',
       'Centuries', 'HighestScore', 'BattingAverage', 'Nation', 'BattingStyle',
       'Nationality_x', 'RightHandedBat', 'TotalRunsConceded',
       'DotBallsBowled', 'BowlingAverage', 'BowlingStrikeRate', 'EconomyRate',
       'OversBowled', 'FoursConceded', 'SixesConceded', 'Wickets', 'BBIW',
       'BBMW', 'Maidens', 'FourWickets', 'FiveWickets', 'BallsBowled',
       'Nationality', 'TeamName', 'Matches', 'TeamCode', 'Innings'],
      dtype='object')

### Creating a combined dataframe for each year

In [148]:
combined = {}
for year in range(2008,2024):
    batting = batting_data[year]
    bowling = bowling_data[year]
    
    first = pd.merge(batting,bowling,on=['Name'],how='left')
    
    second = pd.concat([first,bowling],ignore_index=True)
    
    final = second.drop_duplicates(subset=['Name'])
    
    cols = []
    for col in final.columns:
        if '_y' in col:
            cols.append(col)
            
    final.drop(columns=cols,inplace=True)
    
    final.drop(columns=['RightHandedBat'],inplace=True)
    final.rename(columns={'RightHandedBat_x':'RightHandedBat'},inplace=True)
    
    for feature in ['TeamCode','TeamName','Nationality']:
        temp = (final[feature].fillna(' ') + final[f'{feature}_x'].fillna(' ')).apply(str.strip)
        final.drop(columns=[feature,f'{feature}_x'],inplace=True)
        final[feature] = temp
        
    for feature in ['Innings','Matches']:
        temp = final[feature].fillna(0) + final[f'{feature}_x'].fillna(0)
        final.drop(columns=[feature,f'{feature}_x'],inplace=True)
        final[feature] = temp
        
    combined[year] = final

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final.drop(columns=cols,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final.drop(columns=['RightHandedBat'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final.rename(columns={'RightHandedBat_x':'RightHandedBat'},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final.drop(columns=cols,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final.drop(columns=['RightHandedBat'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final.rename(columns={'RightHandedBat_x':'RightHandedBat'},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final

In [149]:
for df in combined.values():
    df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158 entries, 0 to 228
Data columns (total 36 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               158 non-null    object 
 1   PlayerDOB          150 non-null    object 
 2   TotalRuns          150 non-null    float64
 3   Balls              150 non-null    float64
 4   StrikeRate         150 non-null    float64
 5   Fours              150 non-null    float64
 6   Sixes              150 non-null    float64
 7   Outs               150 non-null    float64
 8   NotOuts            150 non-null    float64
 9   FiftyPlusRuns      150 non-null    float64
 10  Centuries          150 non-null    float64
 11  HighestScore       150 non-null    object 
 12  BattingAverage     150 non-null    float64
 13  Nation             150 non-null    object 
 14  BattingStyle       150 non-null    object 
 15  RightHandedBat     150 non-null    object 
 16  TotalRunsConceded  87 non-

### Finding the missing data in combined dataframes

In [150]:
from googlesearch import search
import time

In [151]:
combined[2008][['Name','Nation','BattingStyle','PlayerDOB']].tail(15)

Unnamed: 0,Name,Nation,BattingStyle,PlayerDOB
143,Srikkanth Anirudha,India,rhb,14-4-1987
144,PM Sarvesh Kumar,India,rhb,26-4-1989
145,Paidikalva Vijaykumar,India,rhb,20-10-1986
146,Abdur Razzak,Bangladesh,lhb,15-6-1982
147,Abhinav Mukund,India,lhb,6-1-1990
148,Lakshmipathy Balaji,India,rhb,27-9-1981
149,Munaf Patel,India,rhb,12-7-1983
152,Shanthakumaran Sreesanth,,,
171,Muttiah Muralitharan,,,
186,Palani Amarnath,,,


In [152]:
name = 'Daniel Vettori'
row = combined[2008][combined[2008]['Name']==name].index.values[0]

In [153]:
row

213

In [159]:
combined[2008].loc[row,'Nation'] is np.nan

True

In [136]:
def googler(query):
    for url in search(query,num=10):
        if 'https://www.espncricinfo.com/cricketers' in url:
            return url

In [137]:
def get_player_id(s):
    player_id = ""
    for character in s.split('-')[-1]:
        if character in '1234567890':
            player_id = player_id + character
        else:
            break
    return player_id

In [160]:
for df in combined.values():
    for name in df['Name']:
        
        row = df[df['Name']==name].index.values[0]
        
        if df.loc[row,'Nation'] is np.nan:
            try:
                espn_url = googler(name)
                
                player_id = get_player_id(espn_url)

                url = 'https://hs-consumer-api.espncricinfo.com/v1/pages/player/home?playerId=' + player_id

                response = requests.get(url)

                player_dict = json.loads(response.content.decode())

                dob = player_dict['player']['dateOfBirth']

                df.at[row,'PlayerDOB'] = f"{dob['date']}-{dob['month']}-{dob['year']}"

                df.at[row,'BattingStyle'] = player_dict['player']['battingStyles'][0]

                df.at[row,'Nation'] = player_dict['player']['country']['name']

            except Exception as e:
                print(f'{name} : {e}')

            time.sleep(3)
            
        else:
            continue

In [166]:
combined[2008][['Name','Nation','Nationality','PlayerDOB','BattingStyle','RightHandedBat']].tail(30)

Unnamed: 0,Name,Nation,Nationality,PlayerDOB,BattingStyle,RightHandedBat
128,Younis Khan,Pakistan,Overseas,29-11-1977,rhb,True
129,Doddapaneni Kalyankrishna,India,Indian,16-12-1983,rhb,True
130,Manish Pandey,India,Indian,10-9-1989,rhb,True
131,Mohammad Asif,Pakistan,Overseas,20-12-1982,lhb,False
132,Ashish Nehra,India,Indian,29-4-1979,rhb,True
133,Halhadar Das,India,Indian,10-12-1986,rhb,True
134,Vikrant Yeligati,India,Indian,23-4-1985,rhb,True
135,Shoaib Akhtar,Pakistan,Overseas,13-8-1975,rhb,True
136,Dilhara Fernando,Sri Lanka,Overseas,19-7-1979,rhb,True
137,Pradeep Sangwan,India,Indian,5-11-1990,rhb,True


In [167]:
for df in combined.values():
    df['IsIndian'] = df['Nation'].apply(lambda x: 'Indian' if x == 'India' else 'Overseas')
    df['IsRightHanded'] = df['BattingStyle'].apply(lambda x: True if x == 'rhb' else False)
    df.drop(columns=['Nationality','RightHandedBat'],inplace=True)
    df.rename(columns={'IsIndian':'Nationality','IsRightHanded':'RightHandedBat'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['IsIndian'] = df['Nation'].apply(lambda x: 'Indian' if x == 'India' else 'Overseas')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['IsRightHanded'] = df['BattingStyle'].apply(lambda x: True if x == 'rhb' else False)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Nationality','RightHandedBat'],inplace=True)
A value is trying to be

In [181]:
combined[2019][['Name','Nation','Nationality','PlayerDOB','BattingStyle','RightHandedBat']].tail(30)

Unnamed: 0,Name,Nation,Nationality,PlayerDOB,BattingStyle,RightHandedBat
131,Mohammad Shami,India,Indian,3-9-1990,rhb,True
132,Joe Denly,England,Overseas,16-3-1986,rhb,True
133,Mohit Sharma,India,Indian,18-9-1988,rhb,True
134,Sam Billings,England,Overseas,15-6-1991,rhb,True
135,Harshal Patel,India,Indian,23-11-1990,rhb,True
136,Prithvi Raj Yarra,India,Indian,20-2-1998,lhb,False
137,Mujeeb Ur Rahman,Afghanistan,Overseas,28-3-2001,rhb,True
138,Andrew Tye,Australia,Overseas,12-12-1986,rhb,True
139,Sandeep Lamichhane,Nepal,Overseas,2-8-2000,rhb,True
140,Siddarth Kaul,India,Indian,19-5-1990,rhb,True


## Saving the obtained results

In [169]:
import os
os.makedirs('seasonwise/runs',exist_ok=True)
os.makedirs('seasonwise/wickets',exist_ok=True)
os.makedirs('seasonwise/combined',exist_ok=True)

In [170]:
for year,df in batting_data.items():
    df.to_csv(f'seasonwise/runs/runs_{year}.csv',index=False)

In [171]:
for year,df in bowling_data.items():
    df.to_csv(f'seasonwise/wickets/wickets_{year}.csv',index=False)

In [172]:
for year,df in combined.items():
    df.to_csv(f'seasonwise/combined/combined_{year}.csv',index=False)

In [182]:
for df in combined.values():
    print(df.shape[0])

158
137
168
184
175
187
147
139
139
152
145
161
144
162
186
198


In [183]:
for df in batting_data.values():
    print(df.shape[0])

150
130
153
161
157
158
132
129
129
136
138
144
133
149
162
166


In [184]:
for df in bowling_data.values():
    print(df.shape[0])

87
81
97
97
101
107
89
84
84
86
82
87
78
89
103
113


In [188]:
combined[2023].columns

Index(['Name', 'PlayerDOB', 'TotalRuns', 'Balls', 'StrikeRate', 'Fours',
       'Sixes', 'Outs', 'NotOuts', 'FiftyPlusRuns', 'Centuries',
       'HighestScore', 'BattingAverage', 'Nation', 'BattingStyle',
       'TotalRunsConceded', 'DotBallsBowled', 'BowlingAverage',
       'BowlingStrikeRate', 'EconomyRate', 'OversBowled', 'FoursConceded',
       'SixesConceded', 'Wickets', 'BBIW', 'BBMW', 'Maidens', 'FourWickets',
       'FiveWickets', 'BallsBowled', 'TeamCode', 'TeamName', 'Innings',
       'Matches', 'Nationality', 'RightHandedBat'],
      dtype='object')

In [189]:
combined[2023][combined[2023]['TeamCode'] == 'CSK'][['Name','PlayerDOB','TotalRuns','Balls','Outs','Wickets','BallsBowled','TotalRunsConceded']]

Unnamed: 0,Name,PlayerDOB,TotalRuns,Balls,Outs,Wickets,BallsBowled,TotalRunsConceded
2,Devon Conway,8-7-1991,672.0,481.0,13.0,,,
6,Ruturaj Gaikwad,31-1-1997,590.0,400.0,14.0,,,
12,Shivam Dube,26-6-1993,418.0,264.0,11.0,,,
28,Ajinkya Rahane,6-6-1988,326.0,189.0,10.0,,,
50,Ravindra Jadeja,6-12-1988,190.0,133.0,8.0,20.0,342.0,431.0
56,Ambati Rayudu,23-9-1985,158.0,113.0,10.0,,,
65,Moeen Ali,18-6-1987,124.0,91.0,7.0,9.0,156.0,195.0
72,MS Dhoni,7-7-1981,104.0,57.0,4.0,,,
123,Ben Stokes,4-6-1991,15.0,14.0,2.0,,,
151,Mitchell Santner,5-2-1992,2.0,4.0,0.0,3.0,72.0,81.0
