In [86]:
# !pip install tqdm # if you face issue related to tqdm
import os

import numpy as np
import pandas as pd
import pyodbc
from tqdm import tqdm_notebook as tqdm

# Data-lake folder for the final data files (the same location the cleaned
# CSV is saved to later in this notebook).
DATA_LAKE_FOLDER_PATH = "E:/Google_Drive_Contents/data/Final_data_files/"

# Database credentials are read from environment variables so that real
# secrets never have to be typed into (and saved with) the notebook.
# The '*' fallback is the original placeholder: export the variables below
# before starting Jupyter, or replace the placeholder locally.
SERVER = os.environ.get('IPL_DB_SERVER', '*')
DATABASE = os.environ.get('IPL_DB_NAME', '*')
USERNAME = os.environ.get('IPL_DB_USERNAME', '*')
PASSWORD = os.environ.get('IPL_DB_PASSWORD', '*')

In [39]:
def get_azure_database_connection(server,database,username,password):
    """
    Open a pyodbc connection to an Azure SQL Server database.

    parameters :
    server : server host name
    database : database name
    username : login user
    password : login password
    return : the pyodbc connection object, or None when the connection
             could not be opened (the error is printed, not raised)
    """
    try:
        # Assemble the ODBC connection string piece by piece
        connection_parts = (
            'DRIVER={SQL Server}',
            'SERVER=' + server,
            'DATABASE=' + database,
            'UID=' + username,
            'PWD=' + password,
        )
        return pyodbc.connect(';'.join(connection_parts))
    except Exception as e:
        print(e)
        return None

In [40]:
player_data = pd.read_json('E:/Google_Drive_Contents/data/Players_information.json')

In [41]:
player_data.head()

Unnamed: 0,Also known as,Batting style,Born,Bowling style,Current age,Education,Fielding position,Full name,Height,In a nutshell,Major teams,Nickname,Other,Playing role,Relation
0,,Left-hand bat,"October 14, 1981, Delhi",Legbreak,38 years 132 days,,,Gautam Gambhir,,"India,",,Top-order batsman,,,
1,,Right-hand bat,"October 16, 1975, Pinelands, Cape Town, Cape P...",Right-arm fast-medium,44 years 130 days,,,Jacques Henry Kallis,,"South Africa,",,Allrounder,,,
2,,Right-hand bat,"September 10, 1989, Nainital, Uttaranchal",Right-arm medium,30 years 166 days,,,Manish Krishnanand Pandey,,"India,",,Top-order batsman,,,
3,,Right-hand bat,"November 11, 1985, Coorg, Karnataka",Right-arm medium,34 years 104 days,,Occasional wicketkeeper,Robin Venu Uthappa,,"India,",,Batsman,,,
4,,Right-hand bat,"November 17, 1982, Baroda, Gujarat",Right-arm offbreak,37 years 98 days,,,Yusuf Khan Pathan,,"India,",,Allrounder,Half-brother - IK Pathan,,


In [42]:
player_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 15 columns):
Also known as        86 non-null object
Batting style        1007 non-null object
Born                 1007 non-null object
Bowling style        921 non-null object
Current age          1007 non-null object
Education            83 non-null object
Fielding position    126 non-null object
Full name            1007 non-null object
Height               170 non-null object
In a nutshell        327 non-null object
Major teams          716 non-null object
Nickname             430 non-null object
Other                56 non-null object
Playing role         599 non-null object
Relation             108 non-null object
dtypes: object(15)
memory usage: 118.1+ KB


In [43]:
# let's check how much data is missing in each column
player_data.isna().sum()

Also known as        921
Batting style          0
Born                   0
Bowling style         86
Current age            0
Education            924
Fielding position    881
Full name              0
Height               837
In a nutshell        680
Major teams          291
Nickname             577
Other                951
Playing role         408
Relation             899
dtype: int64

In [44]:
# removing extra spaces from all the columns

# Turn every cell into a whitespace-trimmed string, one column at a time.
# Note that str() also converts missing values into text (e.g. 'None').
for column_name in player_data.columns:
    trimmed_column = player_data[column_name].map(lambda value: str(value).strip())
    player_data[column_name] = trimmed_column

In [45]:
player_data.shape

(1007, 15)

In [46]:
# let's replace the literal text 'None' (produced when the cells were
# converted to strings) with a real missing value.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.

player_data = player_data.replace('None',np.nan)

In [47]:
player_data.isna().sum()

Also known as        921
Batting style          0
Born                   0
Bowling style         86
Current age            0
Education            924
Fielding position    881
Full name              0
Height               837
In a nutshell        680
Major teams          291
Nickname             577
Other                951
Playing role         408
Relation             899
dtype: int64

In [48]:
# we have to fill NaN values so that we don't get any error when updating values in our Database

null_replacement = ""

player_data.fillna(value=null_replacement,inplace=True)

In [49]:
# Now let's enhance our data a little bit

from dateparser.search import search_dates

# what dateparser does : it extracts date from a given string

# we will apply dateparser's search_dates to each value of the 'Born' column

def create_birth_day(born):
    """
    Extract the date portion from a player's 'Born' description.

    parameters :
    born : text such as "October 14, 1981, Delhi"; may be an empty string
    return : the first date text found by search_dates, exactly as it appears
             in `born`; an empty string when `born` is empty or contains no
             recognisable date
    """
    if len(born) == 0:
        return born

    # search_dates returns None (not an empty list) when nothing is found,
    # so indexing the result directly would raise TypeError
    matches = search_dates(born)
    if not matches:
        return ""

    # each match is a (matched_text, datetime) pair; keep the first matched text
    return matches[0][0]
    
player_data['Birthday'] = player_data['Born'].apply(create_birth_day)

In [50]:
player_data['Birthday'].head(2)

0    October 14, 1981
1    October 16, 1975
Name: Birthday, dtype: object

In [51]:
# now let's create age column
from datetime import date

# Calendar year at the moment this cell runs
current_year = date.today().year

def create_player_age(birthday, today=None):
    """
    Compute a player's age in completed years from the birthday text.

    parameters :
    birthday : date text such as "October 14, 1981"; may be an empty string
    today : optional datetime.date to measure the age against
            (defaults to the date on which the function is called)
    return : age in whole years as an int, or None when `birthday` is empty
             or no year can be found in it
    """
    # local imports keep the function self-contained
    import re
    from datetime import date, datetime

    birthday = birthday.strip()
    if len(birthday) == 0:
        return None

    if today is None:
        today = date.today()

    try:
        # "%B" matches the full English month name under the default locale
        born = datetime.strptime(birthday, "%B %d, %Y").date()
    except ValueError:
        # Unknown layout: fall back to a plain year difference using the
        # first four-digit number in the text
        year_match = re.search(r"\b(\d{4})\b", birthday)
        if year_match is None:
            return None
        return today.year - int(year_match.group(1))

    # A plain year difference is one too high until this year's birthday
    # has been reached, so subtract 1 in that case
    birthday_still_ahead = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - int(birthday_still_ahead)
    
player_data['age'] = player_data['Birthday'].apply(create_player_age)
    

In [52]:
player_data.head(2)

Unnamed: 0,Also known as,Batting style,Born,Bowling style,Current age,Education,Fielding position,Full name,Height,In a nutshell,Major teams,Nickname,Other,Playing role,Relation,Birthday,age
0,,Left-hand bat,"October 14, 1981, Delhi",Legbreak,38 years 132 days,,,Gautam Gambhir,,"India,",,Top-order batsman,,,,"October 14, 1981",39
1,,Right-hand bat,"October 16, 1975, Pinelands, Cape Town, Cape P...",Right-arm fast-medium,44 years 130 days,,,Jacques Henry Kallis,,"South Africa,",,Allrounder,,,,"October 16, 1975",45


In [53]:
player_data.drop('Current age',axis = 1,inplace = True)

In [54]:
def extract_birth_place(row):
    """
    Extract the birth place from a row by removing the date from 'Born'.

    parameters :
    row : mapping / pandas row with the keys 'Born' (e.g.
          "October 14, 1981, Delhi") and 'Birthday' (the date text that
          appears inside 'Born')
    return : the text that follows the birthday inside 'Born', with commas
             removed and surrounding whitespace trimmed; an empty string when
             the birthday is missing or does not occur in 'Born'
    """
    born = row['Born'].strip()
    birthday = row['Birthday']

    # str.split('') raises ValueError, so an empty/missing birthday is handled first
    if not isinstance(birthday, str) or not birthday:
        return ''

    pieces = born.split(birthday)
    # a single piece means the birthday text never occurred in 'Born'
    if len(pieces) < 2:
        return ''

    return pieces[1].strip().replace(',','').strip()
    

player_data['born_place'] = player_data.apply(extract_birth_place,axis = 1)

In [55]:
player_data.head()

Unnamed: 0,Also known as,Batting style,Born,Bowling style,Education,Fielding position,Full name,Height,In a nutshell,Major teams,Nickname,Other,Playing role,Relation,Birthday,age,born_place
0,,Left-hand bat,"October 14, 1981, Delhi",Legbreak,,,Gautam Gambhir,,"India,",,Top-order batsman,,,,"October 14, 1981",39,Delhi
1,,Right-hand bat,"October 16, 1975, Pinelands, Cape Town, Cape P...",Right-arm fast-medium,,,Jacques Henry Kallis,,"South Africa,",,Allrounder,,,,"October 16, 1975",45,Pinelands Cape Town Cape Province
2,,Right-hand bat,"September 10, 1989, Nainital, Uttaranchal",Right-arm medium,,,Manish Krishnanand Pandey,,"India,",,Top-order batsman,,,,"September 10, 1989",31,Nainital Uttaranchal
3,,Right-hand bat,"November 11, 1985, Coorg, Karnataka",Right-arm medium,,Occasional wicketkeeper,Robin Venu Uthappa,,"India,",,Batsman,,,,"November 11, 1985",35,Coorg Karnataka
4,,Right-hand bat,"November 17, 1982, Baroda, Gujarat",Right-arm offbreak,,,Yusuf Khan Pathan,,"India,",,Allrounder,Half-brother - IK Pathan,,,"November 17, 1982",38,Baroda Gujarat


In [56]:
# let's drop some unnecessary columns

player_data.drop(['Also known as','Education','Height','In a nutshell','Major teams','Relation','Other'],axis = 1,inplace = True)

In [57]:
filter_  = player_data['Full name'] == 'Gautam Gambhir'
player_data[filter_]


Unnamed: 0,Batting style,Born,Bowling style,Fielding position,Full name,Nickname,Playing role,Birthday,age,born_place
0,Left-hand bat,"October 14, 1981, Delhi",Legbreak,,Gautam Gambhir,Top-order batsman,,"October 14, 1981",39,Delhi
376,Left-hand bat,"October 14, 1981, Delhi",Legbreak,,Gautam Gambhir,,Top-order batsman,"October 14, 1981",39,Delhi
718,Left-hand bat,"October 14, 1981, Delhi",Legbreak,,Gautam Gambhir,,Top-order batsman,"October 14, 1981",39,Delhi
881,Left-hand bat,"October 14, 1981, Delhi",Legbreak,,Gautam Gambhir,,Top-order batsman,"October 14, 1981",39,Delhi


In [58]:
player_data.drop(['Nickname','Born'],axis = 1,inplace=True)

In [59]:
player_data['Full name'].unique().shape

(561,)

In [60]:
player_data.replace('',np.nan,inplace=True)

In [61]:
player_data = player_data.drop_duplicates() # removing duplicate rows (whole rows are compared, not columns)

In [62]:
# Names that appear on more than one row of player_data.
# duplicated(keep=False) flags every row whose name occurs more than once in a
# single pass, instead of re-filtering the whole frame once per unique name.
# unique() keeps the names in order of first appearance, like the original loop.
full_names = player_data['Full name'].dropna()
player_name_with_2_col = full_names[full_names.duplicated(keep=False)].unique().tolist()

In [63]:
player_data = player_data[~pd.isna(player_data['Playing role'])]

In [64]:
player_data.shape

(338, 8)

In [65]:
player_data.drop(['Fielding position'],axis = 1,inplace=True)

In [66]:
player_data.drop(['Birthday'],axis = 1,inplace = True)

In [67]:
player_data.head()

Unnamed: 0,Batting style,Bowling style,Full name,Playing role,age,born_place
321,Left-hand bat,Slow left-arm orthodox,Sanath Teran Jayasuriya,Allrounder,51,Matara
322,Right-hand bat,"Right-arm offbreak, Legbreak googly",Sachin Ramesh Tendulkar,Top-order batsman,47,Bombay (now Mumbai) Maharashtra
323,Left-hand bat,Right-arm offbreak,Shikhar Dhawan,Opening batsman,35,Delhi
324,Left-hand bat,Right-arm offbreak,Jean-Paul Duminy,Batting allrounder,36,Strandfontein Cape Town Cape Province
325,Right-hand bat,Right-arm medium,Dwayne John Bravo,Allrounder,37,Santa Cruz Trinidad


In [68]:
player_data = player_data[['Full name','age','Playing role','Batting style','Bowling style','born_place']]

In [69]:
player_data.head()

Unnamed: 0,Full name,age,Playing role,Batting style,Bowling style,born_place
321,Sanath Teran Jayasuriya,51,Allrounder,Left-hand bat,Slow left-arm orthodox,Matara
322,Sachin Ramesh Tendulkar,47,Top-order batsman,Right-hand bat,"Right-arm offbreak, Legbreak googly",Bombay (now Mumbai) Maharashtra
323,Shikhar Dhawan,35,Opening batsman,Left-hand bat,Right-arm offbreak,Delhi
324,Jean-Paul Duminy,36,Batting allrounder,Left-hand bat,Right-arm offbreak,Strandfontein Cape Town Cape Province
325,Dwayne John Bravo,37,Allrounder,Right-hand bat,Right-arm medium,Santa Cruz Trinidad


In [70]:
# Let's save the cleaned data into the datalake folder

# Reuse the folder constant defined with the configuration at the top of the
# notebook instead of repeating the same literal path here.
path_to_final_data_files = DATA_LAKE_FOLDER_PATH

player_data.to_csv(path_to_final_data_files+'PlayersInformation.csv',index=False)

In [81]:
def _to_sql_param(value):
    """Convert one DataFrame cell into a value the database driver can bind."""
    # Missing values (None / NaN) are sent as SQL NULL instead of the text 'nan'
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    # NumPy scalars are unwrapped into plain Python values
    if isinstance(value, np.generic):
        return value.item()
    return value


def insert_data_into_database(db_connection,df):
    """
    This function will insert data into the ipl_players table row by row.

    The first six values of every row are bound, in order, to the columns
    full_name, age, playing_role, batting_style, bowling_style, born_place.
    Values are passed as query parameters ('?' placeholders), so text that
    contains quotes (e.g. "O'Brien") is inserted correctly and cannot alter
    the SQL statement.

    parameters :
    db_connection : database connection object
    df : pandas DataFrame which has data
    return : None (errors are printed, not raised; rows inserted before an
             error stay committed because each row is committed on its own)
    """
    insert_statement = (
        "INSERT INTO ipl_players "
        "(full_name, age, playing_role, batting_style, bowling_style, born_place) "
        "VALUES (?, ?, ?, ?, ?, ?)"
    )

    cur = None
    try:
        # get the cursor
        cur = db_connection.cursor()
        for _, row in tqdm(df.iterrows(),total = df.shape[0]):
            # positional access, same column order as the insert statement
            params = tuple(_to_sql_param(value) for value in list(row)[:6])

            # execute the query with bound parameters
            cur.execute(insert_statement, params)
            # commit the connection
            db_connection.commit()

    except Exception as e:
        print(e)

    finally:
        # release the cursor; a failure while closing is reported like any other error
        if cur is not None:
            try:
                cur.close()
            except Exception as e:
                print(e)

In [79]:
db_connection = get_azure_database_connection(SERVER,DATABASE,USERNAME,PASSWORD)

In [82]:
insert_data_into_database(db_connection,player_data)

HBox(children=(IntProgress(value=0, max=338), HTML(value='')))




In [84]:
# pd.read_sql('Select * from ipl_players',db_connection)

In [85]:
db_connection.close()