In [1]:
import pandas as pd
import numpy as np
import pyodbc
import json
import warnings
from tqdm import tqdm_notebook as tqdm
warnings.filterwarnings('ignore')

In [2]:
df_match_batting = pd.read_csv('E:/Google_Drive_Contents/data/Batsmen_scorecard_data.csv')

In [3]:
df_match_batting.head()

Unnamed: 0,match_no,match_city,year,month,day,team_1,team_2,batsmen,wicket_status,R,B,M,fours,sixes,SR,special_role
0,1,Cape Town,2009,April,18,Mumbai Indians,Chennai Super Kings,ST Jayasuriya,c Hayden b Thushara,26,20.0,,5.0,0.0,130.0,normal player
1,1,Cape Town,2009,April,18,Mumbai Indians,Chennai Super Kings,SR Tendulkar (c),not out,59,49.0,,7.0,0.0,120.4,captain
2,1,Cape Town,2009,April,18,Mumbai Indians,Chennai Super Kings,S Dhawan,c Dhoni b Gony,22,21.0,,2.0,0.0,104.76,normal player
3,1,Cape Town,2009,April,18,Mumbai Indians,Chennai Super Kings,JP Duminy,c & b Gony,9,7.0,,1.0,0.0,128.57,normal player
4,1,Cape Town,2009,April,18,Mumbai Indians,Chennai Super Kings,DJ Bravo,c Hayden b Joginder Sharma,5,4.0,,1.0,0.0,125.0,normal player


In [4]:
# Let's add some new columns

# creating dismissal mode

def extract_dismissal(wkt_info):
    """
    Map a scorecard wicket description to a dismissal mode.

    The first space-separated word of the stripped text decides the mode,
    e.g. 'c Hayden b Thushara' -> 'caught', 'run out (Oram)' -> 'run out'.

    parameters :
    wkt_info : wicket description text; may also be None or a non-string
               missing-value marker such as NaN
    return : the dismissal mode label, or wkt_info unchanged when it is not a
             string or when its first word is not a known keyword
             (e.g. 'absent hurt')
    """
    keyword_to_mode = {
        'c': "caught",
        'b': "bowled",
        'st': "stumped",
        'lbw': "lbw",
        'run': "run out",
        'not': "not out",
        'retired': "retired",
        'obstructing': "Obstructing the field",
        'hit': "hit wicket",
    }

    # Missing values loaded by pandas are float NaN rather than None and have
    # no .strip(); hand anything that is not text back untouched.
    if not isinstance(wkt_info, str):
        return wkt_info

    # Split on a single space (not arbitrary whitespace) to keep the original
    # tokenisation; an empty string yields '' and falls through to the default.
    first_word = wkt_info.strip().split(' ')[0]
    return keyword_to_mode.get(first_word, wkt_info)

df_match_batting['dismissal_mode'] = df_match_batting['wicket_status'].apply(extract_dismissal)
df_match_batting['dismissal_mode'].unique()

array(['caught', 'not out', 'run out', 'bowled', 'stumped', 'lbw',
       'absent hurt', 'hit wicket', 'retired', 'Obstructing the field'],
      dtype=object)

In [5]:
df_match_batting.columns

Index(['match_no', 'match_city', 'year', 'month', 'day', 'team_1', 'team_2',
       'batsmen', 'wicket_status', 'R', 'B', 'M', 'fours', 'sixes', 'SR',
       'special_role', 'dismissal_mode'],
      dtype='object')

In [6]:
df_match_batting['month'].unique()

array(['April', 'May', 'June', 'March', 'Apr', 'Jun'], dtype=object)

In [7]:
# some data_cleaning_steps for year 2008

filter_ = df_match_batting['year'] == 2008

def month_map(month):
    """
    Expand an abbreviated month name ('Apr', 'Jun', 'Mar') to its full name.

    Any value that is not one of the known keys is returned unchanged.
    """
    full_names = {'Apr':'April','May':'May','Jun':'June','Mar':'March'}
    # Fall back to the input itself when there is no mapping for it.
    return full_names.get(month, month)
    
    

df_match_batting['month'] = df_match_batting['month'].apply(month_map)


def filter_special_players(player):
    """
    Classify a batsman entry by the marker embedded in the name.

    '(c)' in the name gives 'captain', '†' gives 'wicket keeper', and
    anything else gives 'normal player'. When both markers are present the
    captain marker wins because it is checked first.
    """
    marker_roles = (
        ('(c)', 'captain'),
        ('†', 'wicket keeper'),
    )
    for marker, role in marker_roles:
        if marker in player:
            return role
    return 'normal player'

df_match_batting['special_role'] = df_match_batting['batsmen'].apply(filter_special_players)

In [8]:
# Trim leading/trailing whitespace from every free-text column so that
# otherwise identical values do not differ only by stray spaces.
for text_column in ('match_city', 'team_1', 'team_2', 'batsmen', 'wicket_status'):
    df_match_batting[text_column] = df_match_batting[text_column].apply(lambda value : value.strip())

In [9]:
# Replace each full month name with its calendar month number.
# An unexpected month name raises KeyError instead of being passed through.

month_num = dict(April=4, May=5, June=6, March=3)

df_match_batting['month'] = df_match_batting['month'].map(lambda month_name : month_num[month_name])

In [10]:
df_match_batting.head(2)

Unnamed: 0,match_no,match_city,year,month,day,team_1,team_2,batsmen,wicket_status,R,B,M,fours,sixes,SR,special_role,dismissal_mode
0,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,ST Jayasuriya,c Hayden b Thushara,26,20.0,,5.0,0.0,130.0,normal player,caught
1,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,SR Tendulkar (c),not out,59,49.0,,7.0,0.0,120.4,captain,not out


In [11]:
filter_ =  df_match_batting['year'] == 2013

df_match_batting[filter_]['match_no'].unique()

array(['1', '2nd match (N)', '3rd match (N)', '4th match (D/N)',
       '5th match (N)', '6th match (D/N)', '7th match (N)',
       '8th match (N)', '9th match (D/N)', '10th match (N)',
       '11th match (N)', '12th match (D/N)', '13th match (N)',
       '14th match (N)', '15th match (D/N)', '16th match (N)',
       '17th match (D/N)', '18th match (N)', '19th match (N)',
       '20th match (D/N)', '21', '22nd match (D/N)', '23rd match (N)',
       '24th match (N)', '25th match (N)', '26th match (D/N)',
       '27th match (N)', '28th match (D/N)', '29th match (N)',
       '30th match (N)', '31', '32nd match (N)', '33rd match (N)',
       '34th match (N)', '35th match (N)', '36th match (D/N)',
       '37th match (N)', '38th match (D/N)', '39th match (N)',
       '40th match (D/N)', '41', '42nd match (N)', '43rd match (D/N)',
       '44th match (N)', '45th match (D/N)', '46th match (N)',
       '47th match (N)', '48th match (N)', '49th match (D/N)',
       '50th match (N)', '51', '52nd m

In [12]:
df_match_batting.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11546 entries, 0 to 11545
Data columns (total 17 columns):
match_no          11546 non-null object
match_city        11546 non-null object
year              11546 non-null int64
month             11546 non-null int64
day               11546 non-null object
team_1            11546 non-null object
team_2            11546 non-null object
batsmen           11546 non-null object
wicket_status     11546 non-null object
R                 11546 non-null object
B                 11540 non-null float64
M                 6804 non-null object
fours             11540 non-null float64
sixes             11540 non-null float64
SR                11540 non-null object
special_role      11546 non-null object
dismissal_mode    11546 non-null object
dtypes: float64(3), int64(2), object(12)
memory usage: 1.5+ MB


In [13]:
null_replacment_for_M = 0

# Every result below is assigned back to the frame. Calling fillna/replace
# with inplace=True on a selected column (df[col].fillna(..., inplace=True))
# operates on an intermediate object; with pandas Copy-on-Write that never
# updates the frame, and the notebook's warnings filter hides the warning.

# 1) Missing values become 0 in the numeric-looking columns.
missing_value_fill = {'M': null_replacment_for_M, 'B': 0, 'fours': 0, 'sixes': 0, 'SR': 0}
for column_name, fill_value in missing_value_fill.items():
    df_match_batting[column_name] = df_match_batting[column_name].fillna(fill_value)

# 2) The scorecard uses '-' as a "no value" placeholder; treat it as 0.
for column_name in ('R', 'M', 'B', 'SR'):
    df_match_batting[column_name] = df_match_batting[column_name].replace(to_replace = '-',value = 0)

# 3) With the placeholders gone these object columns can be cast to floats.
for column_name in ('SR', 'M', 'R'):
    df_match_batting[column_name] = df_match_batting[column_name].astype('float64')

In [14]:
df_match_batting.head()

Unnamed: 0,match_no,match_city,year,month,day,team_1,team_2,batsmen,wicket_status,R,B,M,fours,sixes,SR,special_role,dismissal_mode
0,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,ST Jayasuriya,c Hayden b Thushara,26.0,20.0,0.0,5.0,0.0,130.0,normal player,caught
1,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,SR Tendulkar (c),not out,59.0,49.0,0.0,7.0,0.0,120.4,captain,not out
2,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,S Dhawan,c Dhoni b Gony,22.0,21.0,0.0,2.0,0.0,104.76,normal player,caught
3,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,JP Duminy,c & b Gony,9.0,7.0,0.0,1.0,0.0,128.57,normal player,caught
4,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,DJ Bravo,c Hayden b Joginder Sharma,5.0,4.0,0.0,1.0,0.0,125.0,normal player,caught


In [17]:
# saving the final file in the folder
df_match_batting.to_csv('E:/Google_Drive_Contents/data/Final_data_files/ipl_batting.csv',index = False)

In [15]:
# let's load the database cred

data_base_cred = None
with open('E:/Azure_database_config.json','r') as F:
    data_base_cred = json.load(F)

In [16]:
def get_azure_database_connection(server,database,username,password):
    """
    Open a pyodbc connection to a SQL Server database through the
    '{SQL Server}' ODBC driver.

    parameters :
    server : server address placed in the SERVER field
    database : database name placed in the DATABASE field
    username : login placed in the UID field
    password : password placed in the PWD field
    return : the pyodbc connection object, or None when connecting failed
             (the error is printed, not raised)
    """
    try:
        # Assemble the ODBC connection string field by field.
        connection_string = 'DRIVER={SQL Server};SERVER=' + server
        connection_string += ';DATABASE=' + database
        connection_string += ';UID=' + username
        connection_string += ';PWD=' + password
        return pyodbc.connect(connection_string)
    except Exception as error:
        # Best effort: report the problem and let the caller see None.
        print(error)
    return None

In [17]:
def insert_data_into_database(db_connection,df):
    """
    Insert the rows of df into the ipl_batting table, one row at a time.

    Values are sent as query parameters ('?' markers) instead of being
    formatted into the SQL text, so text containing quotes (for example a
    name such as O'Brien) can neither break nor alter the statement.

    parameters :
    db_connection : database connection object exposing cursor() and commit()
                    (the notebook passes a pyodbc connection)
    df : pandas DataFrame which has data; its first 17 columns are inserted
         in positional order
    return : None. Any error is printed and stops the loop; rows committed
             before the error stay in the table.
    """
    column_count = 17
    # Positions the statement has always sent as quoted text; the remaining
    # positions are sent as numbers.
    text_positions = {0, 1, 4, 5, 6, 7, 8, 15, 16}
    insert_statement = "Insert into ipl_batting values(" + ",".join("?" * column_count) + ")"

    def to_sql_param(position, value):
        """Convert one cell into a value the database driver can bind."""
        if position in text_positions:
            return str(value)
        if pd.isna(value):
            # A missing number becomes SQL NULL rather than the text 'nan'.
            return None
        # numpy scalars expose .item(), which returns the plain Python value.
        return value.item() if hasattr(value, "item") else value

    cur = None
    try:
        # get the cursor
        cur = db_connection.cursor()
        for _, row in tqdm(df.iterrows(),total = df.shape[0]):
            row_list = list(row) # for easy access
            params = [to_sql_param(position, row_list[position]) for position in range(column_count)]
            # execute the query
            cur.execute(insert_statement, params)
            # commit after every row so earlier rows survive a later failure
            db_connection.commit()

    except Exception as e:
        print(e)

    finally:
        # release the cursor whether or not the loop finished
        if cur is not None:
            cur.close()

In [18]:
SERVER = data_base_cred['server']
DATABASE = data_base_cred['database']
USERNAME = data_base_cred['username']
PASSWORD = data_base_cred['password']
db_connection = get_azure_database_connection(SERVER,DATABASE,USERNAME,PASSWORD)

In [20]:
insert_data_into_database(db_connection,df_match_batting.head(1000))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [21]:
insert_data_into_database(db_connection,df_match_batting.iloc[1000:2000,])

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [22]:
insert_data_into_database(db_connection,df_match_batting.iloc[2000:3000,])

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [23]:
insert_data_into_database(db_connection,df_match_batting.iloc[3000:4000,])

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [24]:
insert_data_into_database(db_connection,df_match_batting.iloc[4000:5000,])

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [25]:
insert_data_into_database(db_connection,df_match_batting.iloc[5000:6000,])

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [26]:
insert_data_into_database(db_connection,df_match_batting.iloc[6000:7000,])

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [27]:
insert_data_into_database(db_connection,df_match_batting.iloc[7000:8000,])

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [28]:
insert_data_into_database(db_connection,df_match_batting.iloc[8000:9000,])

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [29]:
insert_data_into_database(db_connection,df_match_batting.iloc[9000:10000,])

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [30]:
insert_data_into_database(db_connection,df_match_batting.iloc[10000:11000,])

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [31]:
insert_data_into_database(db_connection,df_match_batting.iloc[11000:,])

HBox(children=(IntProgress(value=0, max=546), HTML(value='')))




In [182]:
db_connection.close()

In [34]:
pd.read_sql('Select * from ipl_batting',con = db_connection)

Unnamed: 0,innings_id,match_no,match_city,year,month,day,team_1,team_2,batsman,wicket_status,runs,balls,M,fours,sixes,strike_rate,special_role,dismissal_mode
0,1,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,ST Jayasuriya,c Hayden b Thushara,26,20,0,5,0.0,130.00,normal player,caught
1,2,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,SR Tendulkar (c),not out,59,49,0,7,0.0,120.40,captain,not out
2,3,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,S Dhawan,c Dhoni b Gony,22,21,0,2,0.0,104.76,normal player,caught
3,4,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,JP Duminy,c & b Gony,9,7,0,1,0.0,128.57,normal player,caught
4,5,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,DJ Bravo,c Hayden b Joginder Sharma,5,4,0,1,0.0,125.00,normal player,caught
5,6,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,AM Nayar,c Thushara b Oram,35,14,0,2,3.0,250.00,normal player,caught
6,7,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,Harbhajan Singh,run out (Oram),4,2,0,1,0.0,200.00,normal player,run out
7,8,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,Z Khan,c Ashwin b Flintoff,2,3,0,0,0.0,66.66,normal player,caught
8,9,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,PR Shah †,not out,0,0,0,0,0.0,0.00,wicket keeper,not out
9,10,1,Cape Town,2009,4,18,Mumbai Indians,Chennai Super Kings,PA Patel †,c Tendulkar b Malinga,0,2,0,0,0.0,0.00,wicket keeper,caught
