# BatFast Customer Dashboard Project 

In [2164]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob as glob
from math import pi
import plotly.graph_objects as go

## Importing and Merging Delivery Datasets

In [2165]:
deliveries_filenames = [i for i in glob.glob("./UCL_DATA/deliveries_data/*.csv")]

In [2166]:
len(deliveries_filenames)

3111

In [2167]:
deliveries_df = [pd.read_csv(file, sep = ",",) 
      for file in deliveries_filenames]

In [2168]:
deliveries_df

[     sim_number                  timestamp  batfast_id client_name  \
 0            28  2019-08-16 08:11:20+00:00  bfs2802182     natwest   
 1            28  2019-08-16 08:11:35+00:00  bfs2802182     natwest   
 2            28  2019-08-16 08:11:51+00:00  bfs2802182     natwest   
 3            28  2019-08-16 08:12:07+00:00  bfs2802182     natwest   
 4            28  2019-08-16 08:13:37+00:00  bfs2802326     natwest   
 ..          ...                        ...         ...         ...   
 995          28  2019-08-16 15:23:01+00:00  bfs2802385     natwest   
 996          28  2019-08-16 15:23:18+00:00  bfs2802385     natwest   
 997          28  2019-08-16 15:23:33+00:00  bfs2802385     natwest   
 998          28  2019-08-16 15:24:56+00:00  bfs2802369     natwest   
 999          28  2019-08-16 15:25:10+00:00  bfs2802369     natwest   
 
                     event_name game_mode score  speed  pitch  swing  pan  \
 0    natwest-ashes-lords-day-3  Training     3    3.8   -5.2    0.0 

In [2169]:
all_deliveries_data = pd.concat(deliveries_df)
all_deliveries_data = all_deliveries_data.reset_index(drop=True)

In [2170]:
all_deliveries_data['score'].isnull().sum()/len(all_deliveries_data)

0.649701568916879

In [2171]:
round(all_deliveries_data.isnull().sum()/len(all_deliveries_data),4)

sim_number     0.0000
timestamp      0.0000
batfast_id     0.0000
client_name    0.0000
event_name     0.0000
game_mode      0.0005
score          0.6497
speed          0.1285
pitch          0.1285
swing          0.1285
pan            0.1613
turn           0.4724
r              0.9307
theta          0.9337
z              0.9980
power          0.9920
machine        0.0000
scoring        0.0001
length         0.9728
dtype: float64

In [2172]:
# all_deliveries_data.to_csv("./UCL_DATA/deliveries_data.csv")

## Importing and Merging User Datasets

In [2173]:
user_filenames = [i for i in glob.glob("./UCL_DATA/user_data/*.csv")]

In [2174]:
len(user_filenames)

67

In [2175]:
user_df = [pd.read_csv(file, sep = ",",) 
      for file in user_filenames]

In [2176]:
user_df

[    email    name  batfast_id gender age_group hand  skill  \
 0     NaN     NaN  bfs1002102    NaN       NaN  NaN    NaN   
 1     NaN     NaN  bfs1002103    NaN       NaN  NaN    NaN   
 2     NaN  TAYLOR  bfs1002104    NaN       NaN  NaN    NaN   
 3     NaN     ZAN  bfs1002105    NaN       NaN  NaN    NaN   
 4     NaN    KATE  bfs1002106    NaN       NaN  NaN    NaN   
 ..    ...     ...         ...    ...       ...  ...    ...   
 995   NaN     NaN  bfs1003012    NaN       NaN  NaN    NaN   
 996   NaN     NaN  bfs1003013    NaN       NaN  NaN    NaN   
 997   NaN     NaN  bfs1003014    NaN       NaN  NaN    NaN   
 998   NaN     NaN  bfs1003015    NaN       NaN  NaN    NaN   
 999   NaN     NaN  bfs1003016    NaN       NaN  NaN    NaN   
 
                             timestamp                           updated  
 0    2021-01-13 09:18:29.255713+00:00  2021-01-13 09:18:29.255734+00:00  
 1    2021-01-13 09:18:30.646599+00:00  2021-01-13 09:18:30.646619+00:00  
 2    2021-01-13 

In [2177]:
all_user_data = pd.concat(user_df)
all_user_data = all_user_data.reset_index(drop=True)

In [2178]:
all_user_data.head()

Unnamed: 0,email,name,batfast_id,gender,age_group,hand,skill,timestamp,updated
0,,,bfs1002102,,,,,2021-01-13 09:18:29.255713+00:00,2021-01-13 09:18:29.255734+00:00
1,,,bfs1002103,,,,,2021-01-13 09:18:30.646599+00:00,2021-01-13 09:18:30.646619+00:00
2,,TAYLOR,bfs1002104,,,,,2021-01-13 09:18:28.575842+00:00,2021-01-13 09:18:28.575862+00:00
3,,ZAN,bfs1002105,,,,,2021-01-13 09:18:35.520035+00:00,2021-01-13 09:18:35.520054+00:00
4,,KATE,bfs1002106,,,,,2021-01-13 09:18:28.447465+00:00,2021-01-13 09:18:28.447486+00:00


In [2179]:
all_user_data.isnull().sum()/len(all_user_data)

email         0.786523
name          0.432855
batfast_id    0.000000
gender        0.628552
age_group     0.629060
hand          0.622606
skill         0.998446
timestamp     0.000000
updated       0.000000
dtype: float64

In [2180]:
# all_user_data.to_csv("./UCL_DATA/user_data.csv")

## Importing and Merging Session Datasets

In [2181]:
session_filenames = [i for i in glob.glob("./UCL_DATA/session_data/*.csv")]

In [2182]:
len(session_filenames)

113

In [2183]:
session_df = [pd.read_csv(file, sep = ",",) 
      for file in session_filenames]

In [2184]:
session_df

[     sim_number                  timestamp         action detail
 0             7  2019-07-20 13:40:43+00:00  Session Ended      1
 1             7  2019-07-20 13:41:43+00:00      Activated     20
 2             7  2019-07-20 14:00:04+00:00    Deactivated     64
 3             7  2019-07-20 14:00:21+00:00  Session Ended      1
 4             7  2019-07-20 14:01:17+00:00      Activated     40
 ..          ...                        ...            ...    ...
 995           7  2019-08-28 19:39:37+00:00      Activated     20
 996           7  2019-08-28 19:59:48+00:00  Session Ended     72
 997           7  2019-08-28 20:17:46+00:00         Booted    NaN
 998           7  2019-08-28 20:20:36+00:00      Activated     20
 999           7  2019-08-28 20:40:49+00:00  Session Ended     68
 
 [1000 rows x 4 columns],
      sim_number                  timestamp         action   detail
 0            24  2019-12-02 05:17:15+00:00         Booted      NaN
 1            24  2019-12-02 07:53:11+00:00 

In [2185]:
all_session_data = pd.concat(session_df)
all_session_data = all_session_data.reset_index(drop=True)

In [2186]:
all_session_data.head()

Unnamed: 0,sim_number,timestamp,action,detail
0,7,2019-07-20 13:40:43+00:00,Session Ended,1
1,7,2019-07-20 13:41:43+00:00,Activated,20
2,7,2019-07-20 14:00:04+00:00,Deactivated,64
3,7,2019-07-20 14:00:21+00:00,Session Ended,1
4,7,2019-07-20 14:01:17+00:00,Activated,40


In [2187]:
all_session_data.isnull().sum()/len(all_session_data)

sim_number    0.000000
timestamp     0.000000
action        0.000000
detail        0.063166
dtype: float64

In [2188]:
# all_session_data.to_csv("./UCL_DATA/session_data.csv")

In [2189]:
session_data = pd.read_csv("./UCL_DATA/session_data.csv")
user_data = pd.read_csv("./UCL_DATA/user_data.csv")
deliveries_data = pd.read_csv("./UCL_DATA/deliveries_data.csv")


Columns (7) have mixed types.Specify dtype option on import or set low_memory=False.


Columns (7,13,19) have mixed types.Specify dtype option on import or set low_memory=False.



In [2190]:
session_data.head()
user_data.head()
deliveries_data.head()

Unnamed: 0.1,Unnamed: 0,sim_number,timestamp,action,detail
0,0,7,2019-07-20 13:40:43+00:00,Session Ended,1
1,1,7,2019-07-20 13:41:43+00:00,Activated,20
2,2,7,2019-07-20 14:00:04+00:00,Deactivated,64
3,3,7,2019-07-20 14:00:21+00:00,Session Ended,1
4,4,7,2019-07-20 14:01:17+00:00,Activated,40


Unnamed: 0.1,Unnamed: 0,email,name,batfast_id,gender,age_group,hand,skill,timestamp,updated
0,0,,,bfs1002102,,,,,2021-01-13 09:18:29.255713+00:00,2021-01-13 09:18:29.255734+00:00
1,1,,,bfs1002103,,,,,2021-01-13 09:18:30.646599+00:00,2021-01-13 09:18:30.646619+00:00
2,2,,TAYLOR,bfs1002104,,,,,2021-01-13 09:18:28.575842+00:00,2021-01-13 09:18:28.575862+00:00
3,3,,ZAN,bfs1002105,,,,,2021-01-13 09:18:35.520035+00:00,2021-01-13 09:18:35.520054+00:00
4,4,,KATE,bfs1002106,,,,,2021-01-13 09:18:28.447465+00:00,2021-01-13 09:18:28.447486+00:00


Unnamed: 0.1,Unnamed: 0,sim_number,timestamp,batfast_id,client_name,event_name,game_mode,score,speed,pitch,swing,pan,turn,r,theta,z,power,machine,scoring,length
0,0,28,2019-08-16 08:11:20+00:00,bfs2802182,natwest,natwest-ashes-lords-day-3,Training,3,3.8,-5.2,0.0,0.0,0.0,46.0,5.78,,,B2S,manual_scoring,
1,1,28,2019-08-16 08:11:35+00:00,bfs2802182,natwest,natwest-ashes-lords-day-3,Training,2,3.7,-4.8,0.0,0.0,0.0,23.0,1.09,,,B2S,manual_scoring,
2,2,28,2019-08-16 08:11:51+00:00,bfs2802182,natwest,natwest-ashes-lords-day-3,Training,4,3.9,-5.6,0.0,0.0,0.0,60.0,6.13,,,B2S,manual_scoring,
3,3,28,2019-08-16 08:12:07+00:00,bfs2802182,natwest,natwest-ashes-lords-day-3,Training,4,3.6,-4.4,0.0,0.0,0.0,59.0,0.41,,,B2S,manual_scoring,
4,4,28,2019-08-16 08:13:37+00:00,bfs2802326,natwest,natwest-ashes-lords-day-3,Training,0,3.8,-5.2,0.0,0.0,-3.0,,,,,B2S,manual_scoring,


In [2191]:
deliveries_data.drop(columns=['Unnamed: 0'], inplace=True)
user_data.drop(columns=['Unnamed: 0'], inplace=True)
session_data.drop(columns=['Unnamed: 0'], inplace=True)

## Merging Deliveries and User Data

In [2192]:
combined_df_1 = deliveries_data.merge(user_data, how='inner', left_on='batfast_id', right_on='batfast_id')
combined_df_1.head()

Unnamed: 0,sim_number,timestamp_x,batfast_id,client_name,event_name,game_mode,score,speed,pitch,swing,...,scoring,length,email,name,gender,age_group,hand,skill,timestamp_y,updated
0,28,2019-08-16 08:11:20+00:00,bfs2802182,natwest,natwest-ashes-lords-day-3,Training,3,3.8,-5.2,0.0,...,manual_scoring,,cram1991@hotmail.com,CHRIS MARCHANT,male,16Plus,RHS,,2020-12-22 12:31:24.823613+00:00,2020-12-22 12:31:24.823634+00:00
1,28,2019-08-16 08:11:35+00:00,bfs2802182,natwest,natwest-ashes-lords-day-3,Training,2,3.7,-4.8,0.0,...,manual_scoring,,cram1991@hotmail.com,CHRIS MARCHANT,male,16Plus,RHS,,2020-12-22 12:31:24.823613+00:00,2020-12-22 12:31:24.823634+00:00
2,28,2019-08-16 08:11:51+00:00,bfs2802182,natwest,natwest-ashes-lords-day-3,Training,4,3.9,-5.6,0.0,...,manual_scoring,,cram1991@hotmail.com,CHRIS MARCHANT,male,16Plus,RHS,,2020-12-22 12:31:24.823613+00:00,2020-12-22 12:31:24.823634+00:00
3,28,2019-08-16 08:12:07+00:00,bfs2802182,natwest,natwest-ashes-lords-day-3,Training,4,3.6,-4.4,0.0,...,manual_scoring,,cram1991@hotmail.com,CHRIS MARCHANT,male,16Plus,RHS,,2020-12-22 12:31:24.823613+00:00,2020-12-22 12:31:24.823634+00:00
4,28,2019-08-15 08:14:08+00:00,bfs2802182,natwest,natwest-ashes-lords-day-2,Training,4,3.6,-4.4,0.0,...,manual_scoring,,cram1991@hotmail.com,CHRIS MARCHANT,male,16Plus,RHS,,2020-12-22 12:31:24.823613+00:00,2020-12-22 12:31:24.823634+00:00


In [2193]:
combined_df_1 = combined_df_1.sort_values(['email','batfast_id'])
combined_df_1.reset_index(inplace=True)
combined_df_1.drop(columns=['index'], inplace=True)

In [2194]:
combined_df = combined_df_1.copy()
anomaly_df = combined_df_1.copy()

# 5. Data Preparation

# 5.1 Variable Adjustments

Given the information provided by BatFast on their initial introductory document, a number of variables will undergo a brief adjustment before any data analysis begins. This will be done in order to increase their usability and/or ease of interpretability.

## Speed

In [2195]:
#Example of speed data being recorded as decimal (e.g. 6.0 = 60mph)
combined_df['speed'].head()

0    6.0
1    6.0
2    6.0
3    6.0
4    6.0
Name: speed, dtype: float64

In [2196]:
# Adjusting from 6.0 -> 60

def speed_adjustment(dataset):
    dataset['speed_adj'] = dataset['speed']*10
    return dataset['speed_adj'].head()

In [2197]:
speed_adjustment(combined_df)

0    60.0
1    60.0
2    60.0
3    60.0
4    60.0
Name: speed_adj, dtype: float64

## Swing

In [2198]:
# Creating a dataframe of swing and hand for explanation purposes
swing_adjustment_df = combined_df[['hand','swing']]
swing_adjustment_df.head(20)

Unnamed: 0,hand,swing
0,RHS,0.0
1,RHS,0.0
2,RHS,0.0
3,RHS,0.0
4,RHS,0.0
5,RHS,0.0
6,RHS,0.0
7,RHS,0.0
8,RHS,0.0
9,RHS,0.0


In [2199]:
#As can be seen, the positive swing value below indicates this has swung into the left hander yet the same
#value for a right hander would indicate away swing. Thus to ensure the swing values represent the same type of
#swing regardless of hand, all left handed batters will have their swing values changes to the opposite sign
lh_example = swing_adjustment_df.loc[swing_adjustment_df['hand']=='LHS'].head()
lh_example

Unnamed: 0,hand,swing
861,LHS,3.0
862,LHS,2.0
863,LHS,2.0
864,LHS,0.0
865,LHS,3.0


In [2200]:
#Swing adjustment

#Note that the resultant swing_adj column has NaN for each RHS given no value was specified for these rows. 
#Thus all NaN values for RHS are set as the values in the original swing column given no adjustment was
#made to these rows

def swing_adjustment(dataset):
    dataset.loc[dataset.hand=="LHS", 'swing_adj'] = (dataset['swing']/-1)
    dataset['swing_adj'].fillna(value=dataset['swing'], inplace=True)
    return dataset[['hand','swing','swing_adj']].head()

In [2201]:
swing_adjustment(combined_df)

Unnamed: 0,hand,swing,swing_adj
0,RHS,0.0,0.0
1,RHS,0.0,0.0
2,RHS,0.0,0.0
3,RHS,0.0,0.0
4,RHS,0.0,0.0


In [2202]:
#Given the above only shows RHS, the below code checks that the function has worked for the LHS. As can be seen,
#the swing values for LH batsmen have reversed signs.
swing_adj_test = combined_df[['hand','swing','swing_adj']]
swing_adj_test.loc[swing_adj_test['hand']=='LHS'].head()

Unnamed: 0,hand,swing,swing_adj
861,LHS,3.0,-3.0
862,LHS,2.0,-2.0
863,LHS,2.0,-2.0
864,LHS,0.0,-0.0
865,LHS,3.0,-3.0


## Theta

In [2203]:
#Example of theta values being recorded in radians
combined_df['theta'].value_counts().head()

0.0    18753
1.5     4396
1.4     3762
1.6     3432
0.2     3399
Name: theta, dtype: int64

In [2204]:
#Adjusting from radians to degrees

def theta_adjustment(dataset):
    dataset['theta_adj'] = dataset['theta']*(180/pi)
    return dataset['theta_adj'].value_counts().head()

In [2205]:
theta_adjustment(combined_df)

0.000000     18753
85.943669     4396
80.214091     3762
91.673247     3432
11.459156     3399
Name: theta_adj, dtype: int64

## Aggregating into 1 Function

In [2206]:
def variable_adjustments(dataset):
    speed_adjustment(dataset)
    swing_adjustment(dataset)
    theta_adjustment(dataset)
    dataset.drop(columns=['speed','swing','theta'], inplace=True)
    dataset.info()

In [2207]:
# variable_adjustments(combined_df)

The above list shows that the original variables have been successfully removed and the adjusted versions are present at the bottom.

# 5.2 Identification and Resolving of Outliers/ Anomalies

As specified by the project requirements, the dataset has to be interrogated to ascertain which variables possess anomalous data readings. Once these anomalies are found, a decision will be made as to what the best approach is to dealing with these anomalies. 

Note that where outliers have been found, BatFast have requested further analysis take place to establish what data/ time these outliers were recorded, as well as on which simulators. 

## R and Power 

There was a period of time when the machines were erroneously recording the power values within the r values and so it is important to ensure these values are used within the power column and not the r columns.

In [2208]:
r_values = list(combined_df['r'].unique())

In [2209]:
power_values = list(combined_df['power'].unique())

In [2210]:
#To show the issue here, a dataframe has been created to highlight how some power values were
#erroneously recorded in the 'r' column (see row 5,7,8 and 9 for example as no one is capable
#of hitting a ball these distances).
r_power_df = pd.DataFrame({'r':r_values[20:30], 'power':power_values[20:30]})
r_power_df

Unnamed: 0,r,power
0,30.0,73.0
1,16.0,140.0
2,61.0,55.0
3,62.0,45.0
4,17.0,249.0
5,420.0,227.0
6,73.0,74.0
7,253.0,472.0
8,669.0,308.0
9,214.0,488.0


#### ( R ANOMALY TIMEDATE AND LOCATION ANALYSIS)

In [2211]:
#Note for the following analysis the values within the power and r columns must be coerced into numerical form
combined_df['r'] = pd.to_numeric(combined_df['r'], errors='coerce')
combined_df['power'] = pd.to_numeric(combined_df['power'], errors='coerce')

In [2212]:
#Note this borrows a function from later analysis to allow for the identification of the specific date in the anomaly 
#dataset
def generate_time_date(dataset):
    df_1 = pd.DataFrame({"time": dataset.timestamp_x.str.split(' '),
                        "date": dataset.timestamp_x.str.split(' ')}, index=dataset.index)
    df_1['time'] = df_1.time.map(lambda x: x[1])
    df_1['date'] = df_1.date.map(lambda x: x[0])
    
    dataset['time'] = df_1['time']
    dataset['date'] = df_1['date']
    
    return dataset.info()

In [2213]:
#Obtaining all rows where there are anomalous values for r
r_anomaly_df = combined_df.loc[combined_df['r']>150]
#Implementing above timedate splitting function
generate_time_date(r_anomaly_df)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7976 entries, 237 to 2368420
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sim_number   7976 non-null   int64  
 1   timestamp_x  7976 non-null   object 
 2   batfast_id   7976 non-null   object 
 3   client_name  7976 non-null   object 
 4   event_name   7976 non-null   object 
 5   game_mode    7976 non-null   object 
 6   score        7975 non-null   object 
 7   speed        7976 non-null   float64
 8   pitch        7976 non-null   float64
 9   swing        7976 non-null   float64
 10  pan          7976 non-null   float64
 11  turn         7831 non-null   float64
 12  r            7976 non-null   float64
 13  theta        247 non-null    float64
 14  z            0 non-null      float64
 15  power        0 non-null      float64
 16  machine      7976 non-null   object 
 17  scoring      7976 non-null   object 
 18  length       2788 non-null   object 
 19  e



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [2214]:

# r_anomaly_dates = r_anomaly_df['date'].value_counts().to_frame()
# r_anomaly_dates.reset_index(inplace=True)
# r_anomaly_dates.columns=(['date','count'])
# r_anomaly_dates.sort_values(by='date',inplace=True)
# r_anomaly_dates




In [2215]:
#Obtaining a dataframe showing where these anomalous values occured (i.e. the client) and at which date they occured
r_anomaly_analysis = r_anomaly_df.pivot_table(index='date', columns='client_name', values='timestamp_x', aggfunc='count')
r_anomaly_analysis.fillna(value=0, inplace=True)

#Creating a total anomalous values column
r_anomaly_analysis['TOTAL'] = r_anomaly_analysis.sum(axis=1)
r_anomaly_analysis

#Showing the top 50 anomalous recordings per daty (predominantly at Sixes around Decemeber of last year)
r_anomaly_analysis_sorted = r_anomaly_analysis.sort_values(by='TOTAL', ascending=False)
r_anomaly_analysis_sorted.head(50)

client_name,batfast,cricket-simulator-canada,sixes,tenpin,the-game-adelaide,TOTAL
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-02-04,0.0,1.0,0.0,0.0,0.0,1.0
2018-04-16,0.0,3.0,0.0,0.0,0.0,3.0
2018-04-20,0.0,5.0,0.0,0.0,0.0,5.0
2018-04-21,0.0,8.0,0.0,0.0,0.0,8.0
2018-04-28,0.0,2.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...
2021-03-12,2.0,0.0,0.0,0.0,0.0,2.0
2021-03-20,0.0,0.0,4.0,0.0,0.0,4.0
2021-03-21,0.0,0.0,17.0,0.0,0.0,17.0
2021-03-23,7.0,0.0,11.0,0.0,0.0,18.0


client_name,batfast,cricket-simulator-canada,sixes,tenpin,the-game-adelaide,TOTAL
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-11,0.0,0.0,259.0,1.0,36.0,296.0
2020-12-09,0.0,0.0,262.0,5.0,0.0,267.0
2020-12-08,8.0,0.0,175.0,26.0,0.0,209.0
2020-12-12,0.0,0.0,153.0,17.0,8.0,178.0
2021-01-22,0.0,0.0,84.0,0.0,85.0,169.0
2020-12-03,0.0,0.0,125.0,35.0,9.0,169.0
2021-03-04,0.0,0.0,167.0,0.0,0.0,167.0
2020-09-12,57.0,0.0,0.0,88.0,18.0,163.0
2020-09-20,40.0,0.0,0.0,115.0,0.0,155.0
2021-02-26,0.0,0.0,127.0,0.0,9.0,136.0


In [2216]:
#NOTE this below code tests the method suggested by BatFast (Alex) that the anomalies exist where the power value
#is missing but the r value is filled. 
r_test = combined_df.loc[(combined_df['power'].isna()) & (combined_df['r']>=0)]
r_test

Unnamed: 0,sim_number,timestamp_x,batfast_id,client_name,event_name,game_mode,score,speed,pitch,swing,...,name,gender,age_group,hand,skill,timestamp_y,updated,speed_adj,swing_adj,theta_adj
11,12,2019-06-16 12:06:28+00:00,bfs1203632,lancashire-ccc,lancashire-foundation,Continuous,2,5.83,8.38,0.0,...,HANY,male,10To16,RHS,,2021-02-12 12:05:14.276508+00:00,2021-02-12 12:05:14.276551+00:00,58.3,0.0,296.792138
12,12,2019-06-16 12:06:39+00:00,bfs1203632,lancashire-ccc,lancashire-foundation,Continuous,2,5.83,8.38,0.0,...,HANY,male,10To16,RHS,,2021-02-12 12:05:14.276508+00:00,2021-02-12 12:05:14.276551+00:00,58.3,0.0,52.139159
13,12,2019-06-16 12:06:50+00:00,bfs1203632,lancashire-ccc,lancashire-foundation,Continuous,1,5.83,8.38,0.0,...,HANY,male,10To16,RHS,,2021-02-12 12:05:14.276508+00:00,2021-02-12 12:05:14.276551+00:00,58.3,0.0,49.847328
14,12,2019-06-16 12:07:02+00:00,bfs1203632,lancashire-ccc,lancashire-foundation,Continuous,3,5.83,8.38,0.0,...,HANY,male,10To16,RHS,,2021-02-12 12:05:14.276508+00:00,2021-02-12 12:05:14.276551+00:00,58.3,0.0,328.877774
15,12,2019-06-16 12:07:15+00:00,bfs1203632,lancashire-ccc,lancashire-foundation,Continuous,3,5.83,8.38,0.0,...,HANY,male,10To16,RHS,,2021-02-12 12:05:14.276508+00:00,2021-02-12 12:05:14.276551+00:00,58.3,0.0,351.796086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3108784,9,2020-03-02 17:45:44+00:00,bfs901808,batfast,batfast-other,Continuous_RR_BB,2,5.40,-3.60,0.0,...,SAM,,,,,2021-01-13 15:37:04.737581+00:00,2021-01-13 16:19:26.573684+00:00,54.0,0.0,27.501974
3108785,9,2020-03-02 17:46:00+00:00,bfs901808,batfast,batfast-other,Continuous_RR_BB,2,5.40,-3.60,0.0,...,SAM,,,,,2021-01-13 15:37:04.737581+00:00,2021-01-13 16:19:26.573684+00:00,54.0,0.0,34.950426
3108786,9,2020-03-02 17:46:17+00:00,bfs901808,batfast,batfast-other,Continuous_RR_BB,0,5.40,-3.60,0.0,...,SAM,,,,,2021-01-13 15:37:04.737581+00:00,2021-01-13 16:19:26.573684+00:00,54.0,0.0,82.505922
3108787,9,2020-03-02 17:46:34+00:00,bfs901808,batfast,batfast-other,Continuous_RR_BB,0,5.40,-3.60,0.0,...,SAM,,,,,2021-01-13 15:37:04.737581+00:00,2021-01-13 16:19:26.573684+00:00,54.0,0.0,303.667631


The issue with this method is that there are a huge number of r values which could legitimately be 'r' values given that there are over 185,000 r-values below 150m which correspond with missing power values. Furthermore, the fact that there are only 10,000 missing theta values again suggests that lots of the r-values are legitimate given that these two variables are almost paired thanks to their combined role in the wagon wheel. Therefore, assuming that any r-value is incorrect whenever it has a missing power value would result in a large number of correct r-values being overwritten. Thus, the method below will be used (i.e. cutting off the r-value at 150m).

In [2217]:
print(f"Potential legitimate R values: {(r_test['r']<150).sum()}")
print(f"Values which are definitely not leigitmate: {(r_test['r']>150).sum()}")
print(f"Number of missing theta values: {r_test['theta'].isnull().sum()}")

Potential legitimate R values: 185487
Values which are definitely not leigitmate: 7976
Number of missing theta values: 10065


In [2218]:
#Therefore, all rows with 'r' values over 150 will have their power value equalled to the 'r' value and the 
#'r' column will have their value set to 0

def r_power_anomaly(dataset):
    dataset.loc[dataset.r>150,'power'] = dataset['r']
    dataset.loc[dataset.r>150,'r'] = np.NaN
    print(dataset['r'].max())
    return dataset[['r','power']].loc[dataset['r']>120].head(10)

In [2219]:
#As the below shows, the maximum value of the 'r' column is now 150, indicating the above function has worked. 
r_power_anomaly(combined_df)

150.0


Unnamed: 0,r,power
263,123.0,
2092,150.0,
2150,145.0,
2346,149.0,
3673,134.1,
4022,129.0,
5588,136.0,
5596,144.0,
5671,126.0,
6861,146.0,


## Gender

In [2220]:
#Checking the values
combined_df['gender'].unique()

array(['male', 'female', nan, 'na'], dtype=object)

#### (Gender Anomaly TimeDate Location Analysis)

In [2221]:
#Obtaining all anomalous rows
gender_anomaly_df = combined_df.loc[combined_df['gender']=='na']
generate_time_date(gender_anomaly_df)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 180 entries, 161012 to 352008
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sim_number   180 non-null    int64  
 1   timestamp_x  180 non-null    object 
 2   batfast_id   180 non-null    object 
 3   client_name  180 non-null    object 
 4   event_name   180 non-null    object 
 5   game_mode    180 non-null    object 
 6   score        180 non-null    object 
 7   speed        180 non-null    float64
 8   pitch        180 non-null    float64
 9   swing        180 non-null    float64
 10  pan          180 non-null    float64
 11  turn         180 non-null    float64
 12  r            66 non-null     float64
 13  theta        66 non-null     float64
 14  z            33 non-null     float64
 15  power        78 non-null     float64
 16  machine      180 non-null    object 
 17  scoring      180 non-null    object 
 18  length       180 non-null    object 
 19  



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [2222]:
#Obtaining a dataframe showing where these anomalous values occured (i.e. the client) and at which date they occured
gender_anomaly_analysis = gender_anomaly_df.pivot_table(index='date', columns='client_name', values='timestamp_x', aggfunc='count')
gender_anomaly_analysis.fillna(value=0, inplace=True)

#Creating a total anomalous values column
gender_anomaly_analysis['TOTAL'] = gender_anomaly_analysis.sum(axis=1)
gender_anomaly_analysis

#This shows all 'na' inputs are from Sixes cricket club

client_name,sixes,TOTAL
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-12-13,18,18
2021-05-01,42,42
2021-05-08,120,120


In [2223]:
#Changing the above, assumingly manually entered 'na' into a missing value nan.

def gender_anomaly(dataset):
    dataset.loc[dataset.gender=='na', 'gender'] = np.NaN
    return dataset['gender'].unique()

In [2224]:
#As can be seen, the 'na' gender has been removed and replaced by an actual missing value nan.
gender_anomaly(combined_df)

array(['male', 'female', nan], dtype=object)

## Age Group

In [2225]:
combined_df['age_group'].unique()

array(['16Plus', '10To16', '4To10', nan, '11To16'], dtype=object)

As can be seen, there is an overlapping age group: 10To16 and 11To16. Given we cannot tell
the specific age of the users and therefore cannot customise these ranges, the 11To16 users will be absorbed into the 10To16 age group given we can guarentee that all these users fit into the latter category. 

#### (Age Group Anomaly Analysis)

In [2226]:
#Obtaining all anomalous rows
age_anomaly_df = combined_df.loc[combined_df['age_group']=='11To16']
generate_time_date(age_anomaly_df)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6580 entries, 6420 to 1977873
Data columns (total 32 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sim_number   6580 non-null   int64  
 1   timestamp_x  6580 non-null   object 
 2   batfast_id   6580 non-null   object 
 3   client_name  6580 non-null   object 
 4   event_name   6580 non-null   object 
 5   game_mode    6580 non-null   object 
 6   score        2101 non-null   object 
 7   speed        6580 non-null   float64
 8   pitch        6580 non-null   float64
 9   swing        6580 non-null   float64
 10  pan          6580 non-null   float64
 11  turn         6580 non-null   float64
 12  r            33 non-null     float64
 13  theta        5 non-null      float64
 14  z            0 non-null      float64
 15  power        363 non-null    float64
 16  machine      6580 non-null   object 
 17  scoring      6580 non-null   object 
 18  length       0 non-null      object 
 19  



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [2227]:
#Obtaining a dataframe showing where these anomalous values occured (i.e. the client) and at which date they occured
age_anomaly_analysis = age_anomaly_df.pivot_table(index='date', columns='client_name', values='timestamp_x', aggfunc='count')
age_anomaly_analysis.fillna(value=0, inplace=True)

#Creating a total anomalous values column
age_anomaly_analysis['TOTAL'] = age_anomaly_analysis.sum(axis=1)
age_anomaly_analysis

#All anomalous values occured at BatFast beginning August 2020 and finished early November 2020

client_name,batfast,TOTAL
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-08-29,215,215
2020-08-30,258,258
2020-09-04,76,76
2020-09-05,570,570
2020-09-08,30,30
2020-09-11,36,36
2020-09-20,301,301
2020-09-21,169,169
2020-09-25,202,202
2020-09-27,183,183


In [2228]:
def age_group_adjustment(dataset):
    dataset.loc[dataset.age_group=='11To16', 'age_group'] = '10To16'
    return dataset['age_group'].unique()

In [2229]:
age_group_adjustment(combined_df)

array(['16Plus', '10To16', '4To10', nan], dtype=object)

## Hand

In [2230]:
#No anomlaous values appear
combined_df['hand'].unique()

array(['RHS', nan, 'LHS'], dtype=object)

## Skill

In [2231]:
combined_df['skill'].unique()

array([nan, '50 MPH'], dtype=object)

In [2232]:
#This shows that almost all of these values are missing (99.8%)
combined_df['skill'].isnull().sum()/len(combined_df)

0.9982111493132614

These are odd values. The BatFast introductory document states that skill shows the speed that the user has selected but the only values which appear are missing (nan) or 50mph. Therefore, given that the speed column already includes the speed of the delivery, alongside the fact that the proportion of missing values for this variable is so high this column will be dropped.

In [2233]:
def skill_anomaly(dataset):
    dataset.drop(columns='skill', inplace=True)

In [2234]:
skill_anomaly(combined_df)

## Timestamp

In [2235]:
# These are simply automatic time-stamps generated by the machines
combined_df['timestamp_y'].unique()

array(['2021-02-12 12:05:02.672479+00:00',
       '2021-02-12 12:05:14.276508+00:00',
       '2021-02-12 12:05:15.696636+00:00', ...,
       '2021-01-13 16:19:26.864118+00:00',
       '2021-01-13 16:19:26.883619+00:00',
       '2021-01-13 16:19:26.887705+00:00'], dtype=object)

To increase accessibility and usability, we will add an hour, minute and second column for each delivery. We will also split the time_stamp column in two. One column will have the time of delivery, the other will have the date. 

In [2236]:
def time_split(dataset):
    dataset['hour'] = pd.to_datetime(dataset['timestamp_x']).dt.hour
    dataset['minute'] = pd.to_datetime(dataset['timestamp_x']).dt.minute
    dataset['secs'] = pd.to_datetime(dataset['timestamp_x']).dt.second

    return dataset.info()

In [2237]:
time_split(combined_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3110936 entries, 0 to 3110935
Data columns (total 32 columns):
 #   Column       Dtype  
---  ------       -----  
 0   sim_number   int64  
 1   timestamp_x  object 
 2   batfast_id   object 
 3   client_name  object 
 4   event_name   object 
 5   game_mode    object 
 6   score        object 
 7   speed        float64
 8   pitch        float64
 9   swing        float64
 10  pan          float64
 11  turn         float64
 12  r            float64
 13  theta        float64
 14  z            float64
 15  power        float64
 16  machine      object 
 17  scoring      object 
 18  length       object 
 19  email        object 
 20  name         object 
 21  gender       object 
 22  age_group    object 
 23  hand         object 
 24  timestamp_y  object 
 25  updated      object 
 26  speed_adj    float64
 27  swing_adj    float64
 28  theta_adj    float64
 29  hour         int64  
 30  minute       int64  
 31  secs         int64  
dty

In [2238]:
def generate_time_date(dataset):
    df_1 = pd.DataFrame({"time": dataset.timestamp_x.str.split(' '),
                        "date": dataset.timestamp_x.str.split(' ')}, index=dataset.index)
    df_1['time'] = df_1.time.map(lambda x: x[1])
    df_1['date'] = df_1.date.map(lambda x: x[0])
    
    dataset['time'] = df_1['time']
    dataset['date'] = df_1['date']
    
    return dataset.info()

generate_time_date(combined_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3110936 entries, 0 to 3110935
Data columns (total 34 columns):
 #   Column       Dtype  
---  ------       -----  
 0   sim_number   int64  
 1   timestamp_x  object 
 2   batfast_id   object 
 3   client_name  object 
 4   event_name   object 
 5   game_mode    object 
 6   score        object 
 7   speed        float64
 8   pitch        float64
 9   swing        float64
 10  pan          float64
 11  turn         float64
 12  r            float64
 13  theta        float64
 14  z            float64
 15  power        float64
 16  machine      object 
 17  scoring      object 
 18  length       object 
 19  email        object 
 20  name         object 
 21  gender       object 
 22  age_group    object 
 23  hand         object 
 24  timestamp_y  object 
 25  updated      object 
 26  speed_adj    float64
 27  swing_adj    float64
 28  theta_adj    float64
 29  hour         int64  
 30  minute       int64  
 31  secs         int64  
 32

## Updated

In [2239]:
#No anomalous values appear
combined_df['updated'].unique()

array(['2021-02-12 12:05:02.672498+00:00',
       '2021-02-12 12:05:14.276551+00:00',
       '2021-02-12 12:05:15.696655+00:00', ...,
       '2021-01-13 16:19:26.864138+00:00',
       '2021-01-13 16:19:26.883639+00:00',
       '2021-01-13 16:19:26.887725+00:00'], dtype=object)

## Sim Number 

In [2240]:
#No anomalous values appear
combined_df['sim_number'].unique()

array([12, 28, 11, 47,  3,  4,  1, 27, 30, 17, 40, 26, 46, 34, 19, 24, 35,
       41, 48, 18, 21, 37,  5,  9, 16, 33, 36, 25, 39, 29, 13, 31, 22, 55,
       38, 32, 23,  7,  8, 10, 44, 15])

## Client Name and Event Name

These could be whatever the client decided thus there is no way to establish what the anomalies are. 

## Game Mode

In [2241]:
combined_df['game_mode'].value_counts()

Continuous              1309915
Baseball Pitching       1023221
Training                 292230
Whack                     98812
Continuous_RR             82198
Continuous_BB             57967
Continuous_RR_BB          46178
over_challenge_RR         46063
Baseball Pitching_RR      43109
Finish                    40771
Training_BB               28841
Survive                   14481
ball_challenge             7496
Test Ball                  7127
over_challenge             4497
Whack_RR                   2661
Innovate                    808
Continuous_TN               735
Whack_BB                    619
over_challenge_RR_BB        520
Training_TN                 486
over_challenge_TN           215
Continuous_RR_TN            139
over_challenge_BB           113
ball_challenge_BB           110
None                         51
_RR                          24
_BB                          20
ball_challenge_TN            11
_RR_BB                        3
Whacks                        1
Name: ga

Given the above list of game modes, the adjustments which will be made are:

Assign all '_RR' games to the 'Continuous_RR' game mode.
Assign all '_BB' to 'Continuous_BB'. 
Assign all '_RR_BB' to 'Continuous_RR_BB'.
Note there is also a single incidence of the game mode "Whacks" which we will assume was meant to be "Whack", thus this row will be reassigned as a "Whack' delivery.

Also, whilst thereare 51 instances of nan being used as a game mode, these rows will not be removed given that the information they possess about the deliveries could be included in headline statistics (e.g. the total number of deliveries faced by a user).

####  (Game Mode Anomaly Analysis)

In [2242]:
game_mode_anomaly_df = combined_df.loc[(combined_df['game_mode']=='_RR')|(combined_df['game_mode']=='_BB')|
                                             (combined_df['game_mode']=='_RR_BB')]

generate_time_date(game_mode_anomaly_df)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47 entries, 44958 to 2282298
Data columns (total 34 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sim_number   47 non-null     int64  
 1   timestamp_x  47 non-null     object 
 2   batfast_id   47 non-null     object 
 3   client_name  47 non-null     object 
 4   event_name   47 non-null     object 
 5   game_mode    47 non-null     object 
 6   score        45 non-null     object 
 7   speed        47 non-null     float64
 8   pitch        47 non-null     float64
 9   swing        47 non-null     float64
 10  pan          47 non-null     float64
 11  turn         37 non-null     float64
 12  r            7 non-null      float64
 13  theta        7 non-null      float64
 14  z            0 non-null      float64
 15  power        1 non-null      float64
 16  machine      47 non-null     object 
 17  scoring      47 non-null     object 
 18  length       5 non-null      object 
 19  e



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [2243]:
#Obtaining a dataframe showing where these anomalous values occured (i.e. the client) and at which date they occured
game_mode_anomaly_analysis = game_mode_anomaly_df.pivot_table(index='date', columns='client_name', values='timestamp_x', aggfunc='count')
game_mode_anomaly_analysis.fillna(value=0, inplace=True)

#Creating a total anomalous values column
game_mode_anomaly_analysis['TOTAL'] = game_mode_anomaly_analysis.sum(axis=1)
game_mode_anomaly_analysis

#The below shows anomalous values appeared pretty much all over the place at different dates as well

client_name,batfast,cricket-simulator-canada,funholics-pakistan,players-entrance,sixes,southern-cricket-perth,tenpin,the-game-adelaide,TOTAL
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-06-25,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2018-07-01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2018-11-25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2018-12-29,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2019-03-03,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2019-03-12,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2019-04-26,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2019-04-29,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2019-06-07,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2019-08-11,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [2244]:
def game_mode_adjustment(dataset):
    dataset.loc[dataset.game_mode=='_RR', 'game_mode'] = 'Continuous_RR'
    dataset.loc[dataset.game_mode=='_BB', 'game_mode'] = 'Continuous_BB'
    dataset.loc[dataset.game_mode=='_RR_BB', 'game_mode'] = 'Continuous_RR_BB'
    dataset.loc[dataset.game_mode=='Whacks', 'game_mode'] = 'Whack'
    return dataset['game_mode'].value_counts()

In [2245]:
game_mode_adjustment(combined_df)

Continuous              1309915
Baseball Pitching       1023221
Training                 292230
Whack                     98813
Continuous_RR             82222
Continuous_BB             57987
Continuous_RR_BB          46181
over_challenge_RR         46063
Baseball Pitching_RR      43109
Finish                    40771
Training_BB               28841
Survive                   14481
ball_challenge             7496
Test Ball                  7127
over_challenge             4497
Whack_RR                   2661
Innovate                    808
Continuous_TN               735
Whack_BB                    619
over_challenge_RR_BB        520
Training_TN                 486
over_challenge_TN           215
Continuous_RR_TN            139
over_challenge_BB           113
ball_challenge_BB           110
None                         51
ball_challenge_TN            11
Name: game_mode, dtype: int64

As the above shows, the reassigned values have been absorbed into the other variables. For example, the 24 values of '_RR' have been absorbed into the 'Continuous_RR', hence the 'Continuous_RR' variable now has 24 more instances in the game mode columns, standing at 82,222 (up from 82,198).

## Score

In [2246]:
#The below shows that the words 'Yes' and 'No' have been included in the score, as well wicket balls. The negative
# values refer to wickets/ strikes in points based game modes. 
combined_df['score'].unique()

array(['Dead Ball', '0', '6', '4', 'Bowled', '2', '1', '3', nan,
       'Caught Behind', 'No Ball', '0.0', 'Wide', '-10.0', '10.0', '5',
       '2.0', '1.0', '4.0', '3.0', '10', '50', '6.0', '-10', '5.0',
       '50.0', 'Yes', 'No', '100.0', '100', 0.0, -10.0, 100.0, 10.0, '25',
       '15', 50.0, 6.0, 2.0, 4.0, 1.0, 3.0], dtype=object)

Given the above, the score column is going to be split into three.

The first column will be the number of cricket runs actually scored on the simulators and the second will be the number of points scored in the various game modes. Following consultation with BatFast, it was established that the values 1, 2, 3, 4, 5 and 6 were all cricket runs and that no gamne mode uses a points scoring system with these values in. Thus, all values in the scoring column which are the size 6 or below are runs. Any other values are points scored within specific game modes. 

Finally, the score column will also be adjusted in that any wicket ball, dead ball, missing value (nan), wide, no ball, 'no' value and 'yes' value will be changed to a score of 0. The reason wides and no balls will be assigned a score of 0 is because whilst in a game this would count as runs for the batting team, they don't actually count as runs for the batsmen and thus they will not be included in the individuals score. The reason the 'yes' value is going to be changed to 0 is that there are only 250 occurences of a 'yes' score and without any further information it is impossible to ascertain what number of runs/ score was achieved off of that ball. 

In [2247]:
def score_adjustment(dataset):
    
    #Showing the the deliveries where wickets occur as 0 runs or points
    dataset['score_adj'] = np.where((dataset['score']=='Dead Ball')|(dataset['score']=='Bowled')|
                                     (dataset['score']=='Caught Behind')|(dataset['score']=='No Ball')|
                                     (dataset['score']=='Wide')|(dataset['score']=='Yes')|(dataset['score']=='No'), 
                                    0, dataset['score'])
    
    #Coercing values in the score column to numeric data type for handling ease
    dataset['score_adj'] = pd.to_numeric(dataset['score_adj'], errors='coerce')
    
    #Creating the runs columns
    dataset['runs'] = np.where((dataset['score_adj']==1.0)|(dataset['score_adj']==2.0)|
                               (dataset['score_adj']==3.0)|(dataset['score_adj']==4.0)|
                               (dataset['score_adj']==5.0)|(dataset['score_adj']==6.0),
                               dataset['score_adj'], 0)
    
    #Creating the points columns
    dataset['points'] = np.where((dataset['score_adj']==1.0)|(dataset['score_adj']==2.0)|
                               (dataset['score_adj']==3.0)|(dataset['score_adj']==4.0)|
                               (dataset['score_adj']==5.0)|(dataset['score_adj']==6.0), 
                                 0, dataset['score_adj'])
    
    #Creating the wickets columns
    dataset['wicket'] = np.where((dataset['score']=='Bowled')|(dataset['score']=='Caught Behind'),dataset['score'],np.NaN)
    
    return dataset.info()

In [2248]:
score_adjustment(combined_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3110936 entries, 0 to 3110935
Data columns (total 38 columns):
 #   Column       Dtype  
---  ------       -----  
 0   sim_number   int64  
 1   timestamp_x  object 
 2   batfast_id   object 
 3   client_name  object 
 4   event_name   object 
 5   game_mode    object 
 6   score        object 
 7   speed        float64
 8   pitch        float64
 9   swing        float64
 10  pan          float64
 11  turn         float64
 12  r            float64
 13  theta        float64
 14  z            float64
 15  power        float64
 16  machine      object 
 17  scoring      object 
 18  length       object 
 19  email        object 
 20  name         object 
 21  gender       object 
 22  age_group    object 
 23  hand         object 
 24  timestamp_y  object 
 25  updated      object 
 26  speed_adj    float64
 27  swing_adj    float64
 28  theta_adj    float64
 29  hour         int64  
 30  minute       int64  
 31  secs         int64  
 32

## Machine

In [2249]:
#No anomalous values appear
combined_df['machine'].unique()

array(['B1', 'B2S', 'B2', 'B2R', nan], dtype=object)

## Length

In [2250]:
#No anomalous values appear
combined_df['length'].unique()

array([nan, 'Yorker', 'Extra Short', 'Short', 'Full', 'Good',
       'Mid Strike Zone', 'Net Line', 'Service Line'], dtype=object)

## Speed

In [2251]:
# Analysing speed_adj to account for the above adjustment
sorted(list(combined_df['speed_adj'].unique()))

[0.0,
 10.0,
 20.0,
 22.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 28.799999999999997,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 40.599999999999994,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 58.3,
 59.0,
 60.0,
 62.0,
 64.0,
 65.0,
 66.7,
 70.0,
 72.0,
 75.0,
 76.0,
 80.0,
 83.3,
 90.0,
 100.0,
 nan,
 2.0,
 6.0,
 8.0,
 12.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.6,
 21.0,
 21.200000000000003,
 21.9,
 22.5,
 23.0,
 23.1,
 23.799999999999997,
 23.900000000000002,
 24.4,
 25.6,
 26.200000000000003,
 26.9,
 27.5,
 28.1,
 29.4,
 30.6,
 31.200000000000003,
 31.9,
 32.5,
 33.1,
 33.8,
 34.4,
 35.6,
 36.2,
 36.9,
 37.5,
 38.1,
 38.8,
 39.4,
 40.5,
 41.2,
 41.900000000000006,
 42.5,
 43.099999999999994,
 43.8,
 44.400000000000006,
 45.599999999999994,
 46.2,
 46.900000000000006,
 47.5,
 48.099999999999994,
 48.8,
 49.400000000000006,
 50.300000000000004,
 50.5999999999999

The above shows that there are some exceptionally high speed values. However, given the previous, manual adjustment made of multiplying speed by 10 to get the values into the desired format, it is assumed that these figures were previously manually entered as (e.g.) 30mph instead of 3.0. Thus, for all the speed values over 100, they will be divided by 10 to return them back to their original number. I.e., 300mph will revert back to 30mph. Values will also be rounded to 1 d.p. for ease. 

#### (Speed Anomaly TimeDate Analysis)

In [2252]:
speed_anomaly_df = combined_df.loc[(combined_df['speed_adj']>100)]
generate_time_date(speed_anomaly_df)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41 entries, 71715 to 385923
Data columns (total 38 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sim_number   41 non-null     int64  
 1   timestamp_x  41 non-null     object 
 2   batfast_id   41 non-null     object 
 3   client_name  41 non-null     object 
 4   event_name   41 non-null     object 
 5   game_mode    41 non-null     object 
 6   score        41 non-null     object 
 7   speed        41 non-null     float64
 8   pitch        41 non-null     float64
 9   swing        41 non-null     float64
 10  pan          41 non-null     float64
 11  turn         41 non-null     float64
 12  r            2 non-null      float64
 13  theta        2 non-null      float64
 14  z            0 non-null      float64
 15  power        23 non-null     float64
 16  machine      41 non-null     object 
 17  scoring      41 non-null     object 
 18  length       7 non-null      object 
 19  em



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [2253]:
#Obtaining a dataframe showing where these anomalous values occured (i.e. the client) and at which date they occured
speed_anomaly_analysis = speed_anomaly_df.pivot_table(index='date', columns='client_name', values='timestamp_x', aggfunc='count')
speed_anomaly_analysis.fillna(value=0, inplace=True)

#Creating a total anomalous values column
speed_anomaly_analysis['TOTAL'] = speed_anomaly_analysis.sum(axis=1)
speed_anomaly_analysis

#The below shows anomalous values appeared only at the BatFast location

client_name,batfast,TOTAL
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-10-31,34,34
2020-12-20,2,2
2021-04-23,5,5


In [2254]:
def speed_adj_anomaly(dataset):
    dataset.loc[dataset.speed_adj>100, 'speed_adj'] = dataset['speed_adj']/10
    dataset['speed_adj'].round(1)
    return sorted(list(combined_df['speed_adj'].unique()))

In [2255]:
speed_adj_anomaly(combined_df)

[0.0,
 10.0,
 20.0,
 22.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 28.799999999999997,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 40.599999999999994,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 57.0,
 58.0,
 58.3,
 59.0,
 60.0,
 62.0,
 64.0,
 65.0,
 66.7,
 70.0,
 75.0,
 80.0,
 83.3,
 90.0,
 100.0,
 nan,
 2.0,
 6.0,
 8.0,
 12.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.6,
 21.0,
 21.200000000000003,
 21.9,
 22.5,
 23.0,
 23.1,
 23.799999999999997,
 23.900000000000002,
 24.4,
 25.6,
 26.200000000000003,
 26.9,
 27.5,
 28.1,
 29.4,
 30.6,
 31.200000000000003,
 31.9,
 32.5,
 33.1,
 33.8,
 34.4,
 35.6,
 36.2,
 36.9,
 37.5,
 38.1,
 38.8,
 39.4,
 40.5,
 41.2,
 41.900000000000006,
 42.5,
 43.099999999999994,
 43.8,
 44.400000000000006,
 45.599999999999994,
 46.2,
 46.900000000000006,
 47.5,
 48.099999999999994,
 48.8,
 49.400000000000006,
 50.300000000000004,
 50.599999999999994,
 51.2,
 51.7,
 51

As can be seen, the exceptionally large speed values are no longer present as they have been transformed into their original values and all values have been rounded to 1dp.

## Pitch

In [2256]:
#No anomalous values appear
sorted(list(combined_df['pitch'].unique()))

[-22.0,
 -16.8,
 -12.0,
 -11.66,
 -6.8,
 -6.0,
 -5.2,
 -4.8,
 -4.4,
 -4.0,
 -3.7,
 -3.4,
 -3.1,
 -3.0,
 -2.72,
 -2.5,
 -2.3,
 -2.16,
 -2.0,
 -1.8,
 -1.6,
 -1.4,
 -1.36,
 -1.2,
 -0.9,
 -0.5,
 -0.28,
 0.0,
 1.0,
 1.5,
 1.8,
 2.0,
 2.5,
 2.9,
 3.0,
 3.5,
 4.0,
 5.0,
 6.0,
 6.5,
 7.0,
 8.0,
 8.38,
 9.0,
 9.8,
 10.0,
 10.5,
 11.0,
 12.0,
 14.0,
 16.0,
 18.0,
 20.0,
 nan,
 -19.8,
 -19.1,
 -19.0,
 -18.8,
 -18.5,
 -18.4,
 -18.3,
 -18.0,
 -17.4,
 -17.0,
 -16.0,
 -15.8,
 -15.5,
 -15.4,
 -15.0,
 -14.8,
 -14.6,
 -14.5,
 -14.4,
 -14.1,
 -14.0,
 -13.9,
 -13.8,
 -13.6,
 -13.58,
 -13.56,
 -13.52,
 -13.5,
 -13.4,
 -13.3,
 -13.1,
 -13.0,
 -12.8,
 -12.42,
 -12.4,
 -12.3,
 -12.2,
 -12.1,
 -11.94,
 -11.9,
 -11.88,
 -11.81,
 -11.8,
 -11.75,
 -11.7,
 -11.69,
 -11.62,
 -11.6,
 -11.56,
 -11.52,
 -11.5,
 -11.44,
 -11.4,
 -11.38,
 -11.31,
 -11.3,
 -11.26,
 -11.25,
 -11.22,
 -11.2,
 -11.15,
 -11.1,
 -11.0,
 -10.97,
 -10.9,
 -10.89,
 -10.8,
 -10.75,
 -10.72,
 -10.7,
 -10.67,
 -10.62,
 -10.6,
 -10.57,
 -10.56,
 -10

## Swing

In [2257]:
#No anomalous values appear
sorted(list(combined_df['swing'].unique()))

[-45.0,
 -40.0,
 -35.0,
 -30.0,
 -25.0,
 -24.0,
 -23.0,
 -22.0,
 -21.0,
 -20.0,
 -19.0,
 -18.0,
 -17.0,
 -16.0,
 -15.0,
 -14.0,
 -13.0,
 -12.0,
 -11.0,
 -10.0,
 -9.0,
 -8.0,
 -7.0,
 -6.0,
 -5.0,
 -4.0,
 -3.0,
 -2.0,
 -1.0,
 0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 10.0,
 nan,
 6.0,
 7.0,
 8.0,
 9.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 29.0,
 30.0,
 35.0,
 40.0,
 45.0]

## Pan

In [2258]:
#No anomalous values appear
sorted(list(combined_df['pan'].unique()))

[0.0,
 nan,
 -19.0,
 -17.0,
 -14.0,
 -12.0,
 -11.5,
 -11.0,
 -10.5,
 -10.0,
 -9.5,
 -9.0,
 -8.5,
 -8.0,
 -7.5,
 -7.0,
 -6.5,
 -6.0,
 -5.5,
 -5.0,
 -4.5,
 -4.0,
 -3.5,
 -3.0,
 -2.5,
 -2.0,
 -1.5,
 -1.0,
 -0.5,
 0.5,
 1.0,
 1.5,
 2.0,
 2.5,
 3.0,
 3.3,
 3.4,
 3.5,
 4.0,
 4.5,
 5.0,
 5.5,
 6.0,
 6.5,
 7.0,
 7.5,
 8.0,
 8.5,
 9.0,
 9.5,
 10.0,
 10.5,
 11.0,
 11.5,
 12.0,
 14.0,
 25.0]

## Turn

In [2259]:
#No anomalous values appear
sorted(list(combined_df['turn'].unique()))

[nan, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]

## R

In [2260]:
#WHAT ARE NEGATIVE R VALUES - MAY BE POWER VALUES
sorted(list(combined_df['r'].unique()))

[nan,
 -2863.8,
 -1615.5,
 -185.2,
 -170.8,
 -161.2,
 -132.9,
 -108.6,
 -106.9,
 -91.5,
 -90.8,
 -82.4,
 -81.0,
 -80.9,
 -78.1,
 -73.8,
 -71.9,
 -70.9,
 -70.6,
 -69.0,
 -67.5,
 -67.4,
 -66.0,
 -65.8,
 -65.5,
 -65.3,
 -61.4,
 -60.6,
 -60.1,
 -59.9,
 -59.5,
 -58.9,
 -58.5,
 -57.8,
 -57.5,
 -55.8,
 -54.8,
 -54.5,
 -54.1,
 -53.4,
 -53.2,
 -52.5,
 -52.4,
 -51.3,
 -51.0,
 -50.7,
 -50.1,
 -49.3,
 -48.8,
 -48.5,
 -47.6,
 -47.3,
 -47.2,
 -47.1,
 -47.0,
 -46.9,
 -46.8,
 -46.5,
 -45.8,
 -45.4,
 -45.3,
 -45.2,
 -45.0,
 -44.9,
 -44.8,
 -44.6,
 -44.3,
 -44.2,
 -44.0,
 -43.6,
 -43.5,
 -43.4,
 -43.1,
 -43.0,
 -42.6,
 -42.5,
 -42.4,
 -42.2,
 -42.0,
 -41.9,
 -41.7,
 -41.5,
 -41.3,
 -41.0,
 -40.9,
 -40.8,
 -40.4,
 -40.2,
 -39.5,
 -39.4,
 -39.3,
 -39.1,
 -39.0,
 -38.9,
 -38.8,
 -38.7,
 -38.6,
 -38.4,
 -38.3,
 -38.2,
 -38.0,
 -37.9,
 -37.8,
 -37.7,
 -37.5,
 -37.4,
 -37.3,
 -37.2,
 -37.1,
 -37.0,
 -36.8,
 -36.7,
 -36.4,
 -36.3,
 -36.2,
 -35.9,
 -35.8,
 -35.7,
 -35.5,
 -35.2,
 -35.0,
 -34.9,
 -34.8,
 -34.7,


## Theta

In [2261]:
#No anomalous values appear
sorted(list(combined_df['turn'].unique()))

[nan, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]

## Z

In [2262]:
#No anomalous values appear
sorted(list(combined_df['z'].unique()))

[nan, 1.0, 2.0]

## Power

BatFast have indicated that the maximum value of the power variable is 1000. Thus there should not be values of pwoer greater than 1000.

In [2263]:
#This shows there are anomalous values within the dataframe for power
combined_df['power'].max()

32016.1

####  (Power Anomaly TimeDate Analysis)

In [2264]:
power_anomaly_df = combined_df.loc[combined_df['power']>1000]
generate_time_date(power_anomaly_df)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7611 entries, 3925 to 2311442
Data columns (total 38 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sim_number   7611 non-null   int64  
 1   timestamp_x  7611 non-null   object 
 2   batfast_id   7611 non-null   object 
 3   client_name  7611 non-null   object 
 4   event_name   7611 non-null   object 
 5   game_mode    7611 non-null   object 
 6   score        7611 non-null   object 
 7   speed        7611 non-null   float64
 8   pitch        7611 non-null   float64
 9   swing        7611 non-null   float64
 10  pan          7611 non-null   float64
 11  turn         7611 non-null   float64
 12  r            0 non-null      float64
 13  theta        12 non-null     float64
 14  z            0 non-null      float64
 15  power        7611 non-null   float64
 16  machine      7611 non-null   object 
 17  scoring      7611 non-null   object 
 18  length       71 non-null     object 
 19  



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [2265]:
#Obtaining a dataframe showing where these anomalous values occured (i.e. the client) and at which date they occured
power_anomaly_analysis = power_anomaly_df.pivot_table(index='date', columns='client_name', values='timestamp_x', aggfunc='count')
power_anomaly_analysis.fillna(value=0, inplace=True)

#Creating a total anomalous values column
power_anomaly_analysis['TOTAL'] = power_anomaly_analysis.sum(axis=1)
power_anomaly_analysis

#The below shows anomalous values appeared at a number of locations although a large number came from Adelaide 
#on the 27th Feb 2021.

client_name,batfast,cricket-simulator-canada,sixes,tenpin,the-game-adelaide,TOTAL
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-13,2.0,0.0,0.0,0.0,0.0,2.0
2019-01-15,3.0,0.0,0.0,0.0,0.0,3.0
2019-02-06,1.0,0.0,0.0,0.0,0.0,1.0
2019-02-08,2.0,0.0,0.0,0.0,0.0,2.0
2019-02-25,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...
2021-03-19,0.0,0.0,1.0,0.0,0.0,1.0
2021-03-23,0.0,0.0,1.0,0.0,0.0,1.0
2021-03-26,0.0,0.0,2.0,0.0,0.0,2.0
2021-03-31,0.0,0.0,1.0,0.0,0.0,1.0


In [2372]:
def power_anomaly(dataset):
    dataset.loc[dataset.power>1000, 'power'] = 1000
    return dataset['power'].max()

power_anomaly(combined_df)

1000.0

## 6. Analysis and Classification of Users

## 6.1 Classification of Users

In [2267]:
combined_df['game_mode'].value_counts()

Continuous              1309915
Baseball Pitching       1023221
Training                 292230
Whack                     98813
Continuous_RR             82222
Continuous_BB             57987
Continuous_RR_BB          46181
over_challenge_RR         46063
Baseball Pitching_RR      43109
Finish                    40771
Training_BB               28841
Survive                   14481
ball_challenge             7496
Test Ball                  7127
over_challenge             4497
Whack_RR                   2661
Innovate                    808
Continuous_TN               735
Whack_BB                    619
over_challenge_RR_BB        520
Training_TN                 486
over_challenge_TN           215
Continuous_RR_TN            139
over_challenge_BB           113
ball_challenge_BB           110
None                         51
ball_challenge_TN            11
Name: game_mode, dtype: int64

All game modes other than ‘training’ will be considered as having entertainment purposes and thus, all deliveries faced during these game modes will be analysed for the entertainment customer dashboard. Likewise, all deliveries faced during the ‘training’ mode will be analysed for the training customer dashboard.

In [2268]:
def user_classification(dataset):
    
    #Identifying which deliveries were delivered in training scenarios
    dataset['training'] = np.where((dataset['game_mode'] == 'Training')|
                                    (dataset['game_mode'] == 'Training_BB')|
                                    (dataset['game_mode'] == 'Training_TN')
                                    ,1,0)
    
    #Identifying which deliveries were delivered in entertainement scenarios (i.e. all bar the 3 training modes)
    dataset['entertainment'] = np.where((dataset['game_mode'] == 'Training')|
                                    (dataset['game_mode'] == 'Training_BB')|
                                    (dataset['game_mode'] == 'Training_TN')
                                    ,0,1)
    return dataset.info()

In [2269]:
user_classification(combined_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3110936 entries, 0 to 3110935
Data columns (total 40 columns):
 #   Column         Dtype  
---  ------         -----  
 0   sim_number     int64  
 1   timestamp_x    object 
 2   batfast_id     object 
 3   client_name    object 
 4   event_name     object 
 5   game_mode      object 
 6   score          object 
 7   speed          float64
 8   pitch          float64
 9   swing          float64
 10  pan            float64
 11  turn           float64
 12  r              float64
 13  theta          float64
 14  z              float64
 15  power          float64
 16  machine        object 
 17  scoring        object 
 18  length         object 
 19  email          object 
 20  name           object 
 21  gender         object 
 22  age_group      object 
 23  hand           object 
 24  timestamp_y    object 
 25  updated        object 
 26  speed_adj      float64
 27  swing_adj      float64
 28  theta_adj      float64
 29  hour          

## 6.2 Brief Analysis of Users

### Distribution of Users by Type of Use

The following will show a brief analysis of the users, giving an idea as to the distribution of users across Batfast's simulators. It will help to show whether the simulators are more commonly used by users for entertainment purposes or training purposes.

In [2270]:
training_percent = (combined_df['training'].sum()/len(combined_df))*100
entertainment_percent = (combined_df['entertainment'].sum()/len(combined_df))*100

print(f"Training purposes: {round(training_percent,1)}%")
print(f"Entertainment purposes: {round(entertainment_percent,1)}%")

Training purposes: 10.3%
Entertainment purposes: 89.7%


### Distribution of Users by Sport Type

In order to work out the distribution of sports, three separate indicator columns will be created for each sport to identify when each sport was being played. 

In [2271]:
combined_df['game_mode'].value_counts()

Continuous              1309915
Baseball Pitching       1023221
Training                 292230
Whack                     98813
Continuous_RR             82222
Continuous_BB             57987
Continuous_RR_BB          46181
over_challenge_RR         46063
Baseball Pitching_RR      43109
Finish                    40771
Training_BB               28841
Survive                   14481
ball_challenge             7496
Test Ball                  7127
over_challenge             4497
Whack_RR                   2661
Innovate                    808
Continuous_TN               735
Whack_BB                    619
over_challenge_RR_BB        520
Training_TN                 486
over_challenge_TN           215
Continuous_RR_TN            139
over_challenge_BB           113
ball_challenge_BB           110
None                         51
ball_challenge_TN            11
Name: game_mode, dtype: int64

In [2272]:
# Note any game mode without BB or TN in is cricket.

def sport_function(dataset):
    
    #Identfying cricket deliveries
    dataset['cricket'] = np.where((dataset['game_mode'] == 'Continuous')|(dataset['game_mode'] == 'Training')|
                                  (dataset['game_mode'] == 'Whack')|(dataset['game_mode'] == 'Continuous_RR')|
                                  (dataset['game_mode'] == 'Finish')|(dataset['game_mode'] == 'over_challenge_RR')|
                                  (dataset['game_mode'] == 'Survive')|(dataset['game_mode'] == 'ball_challenge')|
                                  (dataset['game_mode'] == 'Test Ball')|(dataset['game_mode'] == 'over_challenge')|
                                  (dataset['game_mode'] == 'Whack_RR')|(dataset['game_mode'] == 'Innovate')
                                  ,1,0)
    
    #Identfying baseball deliveries
    dataset['baseball'] = np.where((dataset['game_mode'] == 'Baseball Pitching')|(dataset['game_mode'] == 'Continuous_BB')|
                                   (dataset['game_mode'] == 'Continuous_RR_BB')|(dataset['game_mode'] == 'Baseball Pitching_RR')|
                                   (dataset['game_mode'] == 'Training_BB')|(dataset['game_mode'] == 'Whack_BB')|
                                   (dataset['game_mode'] == 'over_challenge_RR_BB')|(dataset['game_mode'] == 'over_challenge_BB')|
                                   (dataset['game_mode'] == 'ball_challenge_BB')
                                   ,1,0)
    
    #Identfying tennis deliveries
    dataset['tennis'] = np.where((dataset['game_mode'] == 'Continuous_TN')|(dataset['game_mode'] == 'Training_TN')|
                                 (dataset['game_mode'] == 'over_challenge_TN')|(dataset['game_mode'] == 'Continuous_RR_TN')|
                                 (dataset['game_mode'] == 'ball_challenge_TN')
                                 ,1,0)
    
    return dataset.info()

In [2273]:
sport_function(combined_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3110936 entries, 0 to 3110935
Data columns (total 43 columns):
 #   Column         Dtype  
---  ------         -----  
 0   sim_number     int64  
 1   timestamp_x    object 
 2   batfast_id     object 
 3   client_name    object 
 4   event_name     object 
 5   game_mode      object 
 6   score          object 
 7   speed          float64
 8   pitch          float64
 9   swing          float64
 10  pan            float64
 11  turn           float64
 12  r              float64
 13  theta          float64
 14  z              float64
 15  power          float64
 16  machine        object 
 17  scoring        object 
 18  length         object 
 19  email          object 
 20  name           object 
 21  gender         object 
 22  age_group      object 
 23  hand           object 
 24  timestamp_y    object 
 25  updated        object 
 26  speed_adj      float64
 27  swing_adj      float64
 28  theta_adj      float64
 29  hour          

In [2274]:
#Checking all rows have been included in the sports columns (plus those rows with nan and missing values for game modes)
len(combined_df) == ((combined_df['cricket'].sum()) + (combined_df['tennis'].sum()) + (combined_df['baseball'].sum())) + 51 + combined_df['game_mode'].isnull().sum()

True

In [2275]:
#Proportion of deliveries in the dataset for each sport

cricket_percent = (combined_df['cricket'].sum()/len(combined_df))*100
baseball_percent = (combined_df['baseball'].sum()/len(combined_df))*100
tennis_percent = (combined_df['tennis'].sum()/len(combined_df))*100
no_game_mode = (((len(combined_df)) - ((combined_df['cricket'].sum())+(combined_df['baseball'].sum()))+(combined_df['tennis'].sum()))/len(combined_df))*100

print(f"Cricket Deliveries: {round(cricket_percent,1)}%")
print(f"Baseball Deliveries: {round(baseball_percent,1)}%")
print(f"Tennis Deliveries: {round(tennis_percent,1)}%")
print(f"No Game Mode Deliveries: {round(no_game_mode,1)}%")

Cricket Deliveries: 61.3%
Baseball Deliveries: 38.6%
Tennis Deliveries: 0.1%
No Game Mode Deliveries: 0.2%


### Distribution of Users by Gender

In [2276]:
#Seeing the distribution of gender across users (ignoring missing values)
user_data['gender'].value_counts()

#Seeing how many users did NOT provide gender information
user_data['gender'].isnull().sum()/len(user_data)

male      21984
female     2870
na            5
Male          1
Female        1
Name: gender, dtype: int64

0.6285522187359929

In [2277]:
#Note that total male users is calculated as the total number of users who have identified themselves as male -
#given the large number of missing values for gender this won't be an exact match but should be relatively representative of 
#batfast's user base
male_total_users = len(user_data[(user_data['gender']=='male')|(user_data['gender']=='Male')])
female_total_users = len(user_data[(user_data['gender']=='female')|(user_data['gender']=='Female')])
male_total_users
female_total_users

21985

2871

In [2278]:
#Gender Percent
male_percent = (male_total_users/(male_total_users + female_total_users))*100
female_percent = (female_total_users/(male_total_users + female_total_users))*100
print(f"Male users: {round(male_percent,1)}%")
print(f"Female users: {round(female_percent,1)}%")

Male users: 88.4%
Female users: 11.6%


Of those who have identified their gender with BatFast, 88.4% are male and just 11.6% are female. This suggests users of BatFast are overwhelmingly male. 

### Distribution of Users by Age

In [2279]:
#Using the age group function created above to amend the duplicated age groups for the 'user_data' dataset
age_group_adjustment(user_data)

array([nan, '16Plus', '10To16', '4To10'], dtype=object)

In [2280]:
#Seeing the distribution of age across users (ignoring missing values)
user_data['age_group'].value_counts()

#Seeing how many users did NOT provide age information
user_data['age_group'].isnull().sum()/len(user_data)

16Plus    14034
10To16     6062
4To10      4731
Name: age_group, dtype: int64

0.6290602121619603

In [2281]:
#Note that total number of users by age group are similarly calculated just from users who provided their ages. 
#Interestingly, there is almost an indentical proportion of users who provided their age information as those who
#provided gender information. This suggests there may be something systematic when it comes to providing this type 
#of information for users (i.e. certain events, users or clients influence the likelihood of this information 
#being provided)

four_to_ten = len(user_data[(user_data['age_group']=='4To10')])
ten_to_sixteen = len(user_data[(user_data['age_group']=='10To16')])
sixteen_plus = len(user_data[(user_data['age_group']=='16Plus')])

four_to_ten
ten_to_sixteen
sixteen_plus

4731

6062

14034

In [2282]:
four_to_ten_percent = (four_to_ten/(four_to_ten + ten_to_sixteen + sixteen_plus))*100
ten_to_sixteen_percent = (ten_to_sixteen/(four_to_ten + ten_to_sixteen + sixteen_plus))*100
sixteen_plus_percent = (sixteen_plus/(four_to_ten + ten_to_sixteen + sixteen_plus))*100
print(f"4 to 10: {round(four_to_ten_percent,1)}%")
print(f"10 to 16: {round(ten_to_sixteen_percent,1)}%")
print(f"16+ users: {round(sixteen_plus_percent,1)}%")

4 to 10: 19.1%
10 to 16: 24.4%
16+ users: 56.5%


# 7. Headline Statistics

The next objective of the project is to obtain headline statistics for the customer dashboard. These will be statistics which provide a general overview of a user's experience on the BatFast simulators. They will not be sport specific in order to ensure that the 'headline' dashboard is relevant and engaging to all users, regardless of which sport they had played on the simulator and regardless of whether they used it for training or entertainment.

In [2283]:
combined_df_1 = combined_df.copy()

In [2284]:
combined_df_1.groupby(['email','name']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,sim_number,timestamp_x,batfast_id,client_name,event_name,game_mode,score,speed,pitch,swing,...,date,score_adj,runs,points,wicket,training,entertainment,cricket,baseball,tennis
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0070@stores.gane.co.uk,GAME STORE,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,0,10,10,10,10,10
012483@stretfordhigh.com,HANY,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,1,6,6,6,6,6
05tsmith@brightoncollege.net,TOMAS SMITH,17,17,17,17,17,17,0,17,17,17,...,17,0,17,0,0,17,17,17,17,17
0845100989.gf@gmail.com,GERRIE,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,2,6,6,6,6,6
08darsh@gmail.com,AVNEET,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,0,6,6,6,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,6,6,6,6,6,6,6,0,0,0,...,6,6,6,6,1,6,6,6,6,6
zwebb05@icloud.com,ZAK,32,32,32,32,32,32,32,32,32,32,...,32,32,32,32,4,32,32,32,32,32
zygarde1234@gmail.com,SABBIR AHMED,24,24,24,24,24,24,24,24,24,24,...,24,24,24,24,3,24,24,24,24,24
zyounus@gmail.com,ZEESHAN,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,2,6,6,6,6,6


## 7.1 Number of BatFast Deliveries Faced 

In [2285]:
#The following code counts the number of rows (i.e. delivieries) and is then GROUPED by the method outlined above
#(i.e. by email and then name)
user_deliveries_faced = combined_df_1.groupby(['email','name']).count()
user_deliveries_faced = user_deliveries_faced['timestamp_x'].to_frame()
user_deliveries_faced.rename(columns={"timestamp_x":'deliveries_faced'},inplace=True)

In [2286]:
user_deliveries_faced

Unnamed: 0_level_0,Unnamed: 1_level_0,deliveries_faced
email,name,Unnamed: 2_level_1
0070@stores.gane.co.uk,GAME STORE,10
012483@stretfordhigh.com,HANY,6
05tsmith@brightoncollege.net,TOMAS SMITH,17
0845100989.gf@gmail.com,GERRIE,6
08darsh@gmail.com,AVNEET,6
...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,6
zwebb05@icloud.com,ZAK,32
zygarde1234@gmail.com,SABBIR AHMED,24
zyounus@gmail.com,ZEESHAN,6


## 7.2 Number of BatFast Sessions

This figure will require a relatively strong assumption, although it is not necessarily unrealistic. To calculate the number of sessions a user has had, it will be assumed that each user will have a maximum number of one session per day. This is done given that it is very difficult to know what length of time to set between sessions to quantify the later one as being a separate session but on the same day. Thus, for ease, it will be assumed that each user will have a maximum of one session a day; in other words, the number of sessions will equate to the number of individual days a user has used a simulator. 

In [2287]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3110936 entries, 0 to 3110935
Data columns (total 43 columns):
 #   Column         Dtype  
---  ------         -----  
 0   sim_number     int64  
 1   timestamp_x    object 
 2   batfast_id     object 
 3   client_name    object 
 4   event_name     object 
 5   game_mode      object 
 6   score          object 
 7   speed          float64
 8   pitch          float64
 9   swing          float64
 10  pan            float64
 11  turn           float64
 12  r              float64
 13  theta          float64
 14  z              float64
 15  power          float64
 16  machine        object 
 17  scoring        object 
 18  length         object 
 19  email          object 
 20  name           object 
 21  gender         object 
 22  age_group      object 
 23  hand           object 
 24  timestamp_y    object 
 25  updated        object 
 26  speed_adj      float64
 27  swing_adj      float64
 28  theta_adj      float64
 29  hour          

In [2288]:
#Counting the nunber of unique values in each column for each individually grouped user. 
user_sessions = combined_df_1.groupby(['email','name']).nunique()
#For this statistic the date column is the unique value of interest
user_sessions = user_sessions['date'].to_frame()
user_sessions.rename(columns = {"date":"number_of_sessions"}, inplace=True)

In [2289]:
user_sessions

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_sessions
email,name,Unnamed: 2_level_1
0070@stores.gane.co.uk,GAME STORE,1
012483@stretfordhigh.com,HANY,1
05tsmith@brightoncollege.net,TOMAS SMITH,1
0845100989.gf@gmail.com,GERRIE,1
08darsh@gmail.com,AVNEET,1
...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,1
zwebb05@icloud.com,ZAK,1
zygarde1234@gmail.com,SABBIR AHMED,3
zyounus@gmail.com,ZEESHAN,1


In [2290]:
# THIS FORMULA gives the balls faced each day by the user
# date_time_practice.groupby(['date']).count()

### Returning Users Analysis

NOTE: BatFast have also indicated they would like to see the % of users which are returning users (i.e. which users have used BatFast simulators on at least two separate dates. Given the way that sessions have been calculated, this requires a simple filter for all users who have had more than 1 session as defined above. 

In [2291]:
returning_users_df = user_sessions.loc[user_sessions['number_of_sessions']>=2]
number_returning_users = len(returning_users_df)
percentage_returning_users = (len(returning_users_df)/len(user_sessions))*100

#The below calculations use data from only users who have provided their email and name
print("With user email and name:")
print(f"The number of returning users is: {number_returning_users}")
print(f"The percentage of returning users is: {round(percentage_returning_users,2)}%")

#The below calculations use all of BatFast's data and group based on BatFast ID
user_sessions_1 = combined_df_1.groupby(['batfast_id']).nunique()
user_sessions_1 = user_sessions_1['date'].to_frame()
user_sessions_1.rename(columns = {"date":"number_of_sessions"}, inplace=True)

returning_users_df_1 = user_sessions_1.loc[user_sessions_1['number_of_sessions']>=2]
number_returning_users_1 = len(returning_users_df_1)
percentage_returning_users_1 = (len(returning_users_df_1)/len(user_sessions_1))*100
print("With batfast_id:")
print(f"The number of returning users is: {number_returning_users_1}")
print(f"The percentage of returning users is: {round(percentage_returning_users_1,2)}%")


With user email and name:
The number of returning users is: 978
The percentage of returning users is: 7.33%
With batfast_id:
The number of returning users is: 1708
The percentage of returning users is: 2.91%


## 7.3 Fastest Delivery 

In [2292]:
#Finding the maximum value of each column for each individual group
fastest_delivery = combined_df_1[['email','name','speed_adj']].groupby(['email','name']).max()
#For this statistic the maximum value in the speed_adj column is of interest
fastest_delivery = fastest_delivery['speed_adj'].to_frame()
fastest_delivery.rename(columns={"speed_adj":'max_delivery_speed'},inplace=True)

In [2293]:
fastest_delivery

Unnamed: 0_level_0,Unnamed: 1_level_0,max_delivery_speed
email,name,Unnamed: 2_level_1
0070@stores.gane.co.uk,GAME STORE,70.0
012483@stretfordhigh.com,HANY,58.3
05tsmith@brightoncollege.net,TOMAS SMITH,100.0
0845100989.gf@gmail.com,GERRIE,100.0
08darsh@gmail.com,AVNEET,20.0
...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,
zwebb05@icloud.com,ZAK,50.0
zygarde1234@gmail.com,SABBIR AHMED,50.0
zyounus@gmail.com,ZEESHAN,60.0


## 7.4 Slowest Delivery

In [2294]:
#Finding the minimum value of each column for each individual group
slowest_delivery = combined_df_1[['email','name','speed_adj']].groupby(['email','name']).min()
#For this statistic the minimum value in the speed_adj column is of interest
slowest_delivery = slowest_delivery['speed_adj'].to_frame()
slowest_delivery.rename(columns={"speed_adj":'min_delivery_speed'},inplace=True)

In [2295]:
slowest_delivery


Output cache limit (currently 1000 entries) hit.
Flushing oldest 200 entries.



Unnamed: 0_level_0,Unnamed: 1_level_0,min_delivery_speed
email,name,Unnamed: 2_level_1
0070@stores.gane.co.uk,GAME STORE,60.0
012483@stretfordhigh.com,HANY,58.3
05tsmith@brightoncollege.net,TOMAS SMITH,100.0
0845100989.gf@gmail.com,GERRIE,70.0
08darsh@gmail.com,AVNEET,20.0
...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,
zwebb05@icloud.com,ZAK,20.0
zygarde1234@gmail.com,SABBIR AHMED,40.0
zyounus@gmail.com,ZEESHAN,60.0


## 7.5 Total Points/ Runs Scored

In [2296]:
#Summing the values in each column for each individual group
total_score = combined_df_1[['email','name','score_adj']].groupby(['email','name']).sum()
#We want the summed value of each individual group so the 'score_adj' column is of interest
total_score = total_score['score_adj'].to_frame()
total_score.rename(columns={"score_adj":'total_score/runs'},inplace=True)

In [2297]:
total_score

Unnamed: 0_level_0,Unnamed: 1_level_0,total_score/runs
email,name,Unnamed: 2_level_1
0070@stores.gane.co.uk,GAME STORE,18.0
012483@stretfordhigh.com,HANY,11.0
05tsmith@brightoncollege.net,TOMAS SMITH,0.0
0845100989.gf@gmail.com,GERRIE,8.0
08darsh@gmail.com,AVNEET,12.0
...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,16.0
zwebb05@icloud.com,ZAK,59.0
zygarde1234@gmail.com,SABBIR AHMED,48.0
zyounus@gmail.com,ZEESHAN,7.0


## 7.6 Sport Breakdown

In [2298]:
#Summing the values in each sports indicator column for each individual group to show the number of deliveries they
#have faced for each sport. 
sport_breakdown = combined_df_1[['email','name','cricket','baseball','tennis']].groupby(['email','name']).sum()
sport_breakdown.rename(columns={"cricket":"cricket_deliveries","baseball":"baseball_deliveries","tennis":"tennis_deliveries"}, inplace=True)

In [2299]:
sport_breakdown

Unnamed: 0_level_0,Unnamed: 1_level_0,cricket_deliveries,baseball_deliveries,tennis_deliveries
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0070@stores.gane.co.uk,GAME STORE,10,0,0
012483@stretfordhigh.com,HANY,6,0,0
05tsmith@brightoncollege.net,TOMAS SMITH,17,0,0
0845100989.gf@gmail.com,GERRIE,6,0,0
08darsh@gmail.com,AVNEET,6,0,0
...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,6,0,0
zwebb05@icloud.com,ZAK,32,0,0
zygarde1234@gmail.com,SABBIR AHMED,24,0,0
zyounus@gmail.com,ZEESHAN,6,0,0


## 7.7 Average Delivery Speed 

In [2300]:
#Summing the total speed of deliveries
average_speed = combined_df_1[['email','name','speed_adj']].groupby(['email','name']).sum()
average_speed.rename(columns = {'speed_adj':'total_speed'}, inplace=True)
#Merging with dataframe which shows the number of deliveries faced
average_speed = average_speed.merge(user_deliveries_faced,
                                    left_index=True, right_index=True, how='outer')

#Dividing in order to find the average speed
average_speed['average_speed'] = round(average_speed['total_speed']/average_speed['deliveries_faced'],1)
average_speed.head(5)
#Dropping unecessary columns
average_speed.drop(columns=['total_speed','deliveries_faced'], inplace=True)
average_speed.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_speed,deliveries_faced,average_speed
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0070@stores.gane.co.uk,GAME STORE,630.0,10,63.0
012483@stretfordhigh.com,HANY,349.8,6,58.3
05tsmith@brightoncollege.net,TOMAS SMITH,1700.0,17,100.0
0845100989.gf@gmail.com,GERRIE,480.0,6,80.0
08darsh@gmail.com,AVNEET,120.0,6,20.0


Unnamed: 0_level_0,Unnamed: 1_level_0,average_speed
email,name,Unnamed: 2_level_1
0070@stores.gane.co.uk,GAME STORE,63.0
012483@stretfordhigh.com,HANY,58.3
05tsmith@brightoncollege.net,TOMAS SMITH,100.0
0845100989.gf@gmail.com,GERRIE,80.0
08darsh@gmail.com,AVNEET,20.0


## 7.8 Merging Statistics 

In [2301]:
#MERGING THE ABOVE DATAFRAMES INTO ONE DATAFRAME
headline_statistics_1 = user_deliveries_faced.merge(user_sessions,
                                                  left_index=True, right_index=True, how='outer')

In [2302]:
headline_statistics_2 = headline_statistics_1.merge(fastest_delivery,
                                                  left_index=True, right_index=True, how='outer')

In [2303]:
headline_statistics_3 = headline_statistics_2.merge(slowest_delivery,
                                                  left_index=True, right_index=True, how='outer')

In [2304]:
headline_statistics_4 = headline_statistics_3.merge(total_score,
                                                  left_index=True, right_index=True, how='outer')

In [2305]:
headline_statistics_5 = headline_statistics_4.merge(sport_breakdown[['cricket_deliveries','baseball_deliveries','tennis_deliveries']],
                                                  left_index=True, right_index=True, how='outer')

In [2306]:
headline_statistics_6 = headline_statistics_5.merge(average_speed,
                                                  left_index=True, right_index=True, how='outer')

In [2307]:
headline_statistics = headline_statistics_6.copy()

In [2308]:
#The final dataframe
headline_statistics

Unnamed: 0_level_0,Unnamed: 1_level_0,deliveries_faced,number_of_sessions,max_delivery_speed,min_delivery_speed,total_score/runs,cricket_deliveries,baseball_deliveries,tennis_deliveries,average_speed
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0070@stores.gane.co.uk,GAME STORE,10,1,70.0,60.0,18.0,10,0,0,63.0
012483@stretfordhigh.com,HANY,6,1,58.3,58.3,11.0,6,0,0,58.3
05tsmith@brightoncollege.net,TOMAS SMITH,17,1,100.0,100.0,0.0,17,0,0,100.0
0845100989.gf@gmail.com,GERRIE,6,1,100.0,70.0,8.0,6,0,0,80.0
08darsh@gmail.com,AVNEET,6,1,20.0,20.0,12.0,6,0,0,20.0
...,...,...,...,...,...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,6,1,,,16.0,6,0,0,0.0
zwebb05@icloud.com,ZAK,32,1,50.0,20.0,59.0,32,0,0,37.8
zygarde1234@gmail.com,SABBIR AHMED,24,3,50.0,40.0,48.0,24,0,0,47.9
zyounus@gmail.com,ZEESHAN,6,1,60.0,60.0,7.0,6,0,0,60.0


## 7.8 Generating Functions for Automatic Calculation of Headline Statistics

The below simply involves manipulating the above code into functions so that all the headline statistics can be generated from a base dataset which is in the same format as 'combined_df'.

In [2309]:
def user_deliveries_function(dataset):
    user_deliveries_faced_df = dataset.groupby(['email','name']).count()
    user_deliveries_faced_df = user_deliveries_faced_df['timestamp_x'].to_frame()
    user_deliveries_faced_df.rename(columns={"timestamp_x":'deliveries_faced'},inplace=True)
    return user_deliveries_faced_df

In [2310]:
def user_sessions_function(dataset):
    user_sessions_df = dataset.groupby(['email','name']).nunique()
    user_sessions_df = user_sessions_df['date'].to_frame()
    user_sessions_df.rename(columns = {"date":"number_of_sessions"}, inplace=True)
    return user_sessions_df

In [2311]:
def fastest_delivery_function(dataset):
    fastest_delivery_df = dataset[['email','name','speed_adj']].groupby(['email','name']).max()
    fastest_delivery_df = fastest_delivery_df['speed_adj'].to_frame()
    fastest_delivery_df.rename(columns={"speed_adj":'max_delivery_speed'},inplace=True)
    return fastest_delivery_df

In [2312]:
def slowest_delivery_function(dataset):
    slowest_delivery_df = dataset[['email','name','speed_adj']].groupby(['email','name']).min()
    slowest_delivery_df = slowest_delivery_df['speed_adj'].to_frame()
    slowest_delivery_df.rename(columns={"speed_adj":'min_delivery_speed'},inplace=True)
    return slowest_delivery_df

In [2313]:
def total_score_function(dataset):
    total_score_df = dataset[['email','name','score_adj']].groupby(['email','name']).sum()
    total_score_df = total_score_df['score_adj'].to_frame()
    total_score_df.rename(columns={"score_adj":'total_score'},inplace=True)
    return total_score_df

In [2314]:
def sport_breakdown_function(dataset):
    sport_breakdown_df = dataset[['email','name','cricket','baseball','tennis']].groupby(['email','name']).sum()
    sport_breakdown_df.rename(columns={"cricket":"cricket_deliveries","baseball":"baseball_deliveries","tennis":"tennis_deliveries"}, inplace=True)
    return sport_breakdown_df

In [2315]:
def average_speed_function(dataset):
    
    average_speed_df = dataset[['email','name','speed_adj']].groupby(['email','name']).sum()
    average_speed_df.rename(columns = {'speed_adj':'total_speed'}, inplace=True)
    average_speed_df = average_speed_df.merge(user_deliveries_faced,
                                    left_index=True, right_index=True, how='outer')

    average_speed_df['average_speed'] = round(average_speed_df['total_speed']/average_speed_df['deliveries_faced'],1)
    average_speed_df.drop(columns=['total_speed','deliveries_faced'], inplace=True)
    return average_speed_df

In [2316]:
def generate_headline_statistics_function(dataset):
    
    #Using above functions to generate relevant statistics and dataframes
    user_deliveries_faced_1 = user_deliveries_function(dataset)
    user_sessions_1 = user_sessions_function(dataset)
    fastest_delivery_1 = fastest_delivery_function(dataset)
    slowest_delivery_1 = slowest_delivery_function(dataset)
    total_score_1 = total_score_function(dataset)
    sport_breakdown_1 = sport_breakdown_function(dataset)
    average_speed_1 = average_speed_function(dataset)
    
    #Merging the above dataframes into 1 with all headline statistics
    headline_statistics_1 = user_deliveries_faced.merge(user_sessions_1,
                                                  left_index=True, right_index=True, how='outer')
    headline_statistics_2 = headline_statistics_1.merge(fastest_delivery_1,
                                                  left_index=True, right_index=True, how='outer')
    headline_statistics_3 = headline_statistics_2.merge(slowest_delivery_1,
                                                  left_index=True, right_index=True, how='outer')
    headline_statistics_4 = headline_statistics_3.merge(total_score_1,
                                                  left_index=True, right_index=True, how='outer')
    headline_statistics_5 = headline_statistics_4.merge(sport_breakdown_1,
                                                  left_index=True, right_index=True, how='outer')
    headline_statistics_6 = headline_statistics_5.merge(average_speed,
                                                  left_index=True, right_index=True, how='outer')
    headline_statistics = headline_statistics_6.copy()
    
    headline_statistics.reset_index(inplace=True)
    
    #Returning the final merged dataframe
    return headline_statistics

In [2317]:
#Thus, the manually calculated headline statistics can now be automatically calculated
#provided the dataset used is of the exact same format as 'combined_df'
headline_statistics = generate_headline_statistics_function(combined_df)
headline_statistics

Unnamed: 0,email,name,deliveries_faced,number_of_sessions,max_delivery_speed,min_delivery_speed,total_score,cricket_deliveries,baseball_deliveries,tennis_deliveries,average_speed
0,0070@stores.gane.co.uk,GAME STORE,10,1,70.0,60.0,18.0,10,0,0,63.0
1,012483@stretfordhigh.com,HANY,6,1,58.3,58.3,11.0,6,0,0,58.3
2,05tsmith@brightoncollege.net,TOMAS SMITH,17,1,100.0,100.0,0.0,17,0,0,100.0
3,0845100989.gf@gmail.com,GERRIE,6,1,100.0,70.0,8.0,6,0,0,80.0
4,08darsh@gmail.com,AVNEET,6,1,20.0,20.0,12.0,6,0,0,20.0
...,...,...,...,...,...,...,...,...,...,...,...
13330,zureenali@hotmail.co.uk,ZUREEN ALI,6,1,,,16.0,6,0,0,0.0
13331,zwebb05@icloud.com,ZAK,32,1,50.0,20.0,59.0,32,0,0,37.8
13332,zygarde1234@gmail.com,SABBIR AHMED,24,3,50.0,40.0,48.0,24,0,0,47.9
13333,zyounus@gmail.com,ZEESHAN,6,1,60.0,60.0,7.0,6,0,0,60.0


## 7.9 Headline Statistics Presentation

The below shows how the headline statistics can be presented on BatFast's website. It will be a simple 'dashboard' with numbers presenting an overview of their performance, as discussed above.

In [2318]:
#Taking a sample user from the headline statistics dataframe to use as practice 
headline_df = headline_statistics.loc[(headline_statistics['email']=='0070@stores.gane.co.uk') & (headline_statistics['name']=='GAME STORE')]
headline_df

Unnamed: 0,email,name,deliveries_faced,number_of_sessions,max_delivery_speed,min_delivery_speed,total_score,cricket_deliveries,baseball_deliveries,tennis_deliveries,average_speed
0,0070@stores.gane.co.uk,GAME STORE,10,1,70.0,60.0,18.0,10,0,0,63.0


In [2319]:
#PLOTLY VISUALISATION ADMIN
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"

In [2320]:
#The below code consists of the 9 headline statistics calculated which are then presented with plotly's go.Indicator
#method

fig = go.Figure()

fig.add_trace(go.Indicator(
    mode = "number",
    value = headline_df.iloc[0]['deliveries_faced'],
    domain = {'row':0, 'column':0},
    title = "Total Deliveries Faced"))
    
fig.add_trace(go.Indicator(
    mode = "number",
    value = headline_df.iloc[0]['total_score'],
    domain = {'row':1, 'column':0},
    title = "Total Score"))

fig.add_trace(go.Indicator(
    mode = "number",
    value = headline_df.iloc[0]['number_of_sessions'],
    domain = {'row':2, 'column':0},
    title = "Number of Sessions"))
    
fig.add_trace(go.Indicator(
    mode = "number",
    value = headline_df.iloc[0]['max_delivery_speed'],
    domain = {'row':0, 'column':1},
    title = "Maximimum Speed (mph)"))

fig.add_trace(go.Indicator(
    mode = "number",
    value = headline_df.iloc[0]['min_delivery_speed'],
    domain = {'row':1, 'column':1},
    title = "Minimum Speed (mph)"))

fig.add_trace(go.Indicator(
    mode = "number",
    value = headline_df.iloc[0]['average_speed'],
    domain = {'row':2, 'column':1},
    title = "Average Speed (mph)"))

fig.add_trace(go.Indicator(
    mode = "number",
    value = headline_df.iloc[0]['cricket_deliveries'],
    domain = {'row':0, 'column':2},
    title = "Cricket Balls Faced"))

fig.add_trace(go.Indicator(
    mode = "number",
    value = headline_df.iloc[0]['baseball_deliveries'],
    domain = {'row':1, 'column':2},
    title = "Tennis Shots Returned"))

fig.add_trace(go.Indicator(
    mode = "number",
    value = headline_df.iloc[0]['tennis_deliveries'],
    domain = {'row':2, 'column':2},
    title = "Baseball Pitches Hit"))

fig.update_layout(
        grid = {'rows':3, 'columns':3, 'pattern': "independent"},
        )
    
fig.show()

In [2321]:
#The below code puts the above visualisations into a function so that any individual group (i.e. any unique combination
#of user emails and names) can have their headline statistics presented to them. 

def headline_statistics_visual(user_email, user_name):
    
    df = headline_statistics.loc[(headline_statistics['email']==user_email) & (headline_statistics['name']==user_name)]
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['deliveries_faced'],
    domain = {'row':0, 'column':0},
    title = "Total Deliveries Faced"))
    
    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['total_score'],
    domain = {'row':1, 'column':0},
    title = "Total Score"))

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['number_of_sessions'],
    domain = {'row':2, 'column':0},
    title = "Number of Sessions"))
    
    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['max_delivery_speed'],
    domain = {'row':0, 'column':1},
    title = "Maximimum Speed (mph)"))

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['min_delivery_speed'],
    domain = {'row':1, 'column':1},
    title = "Minimum Speed (mph)"))

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['average_speed'],
    domain = {'row':2, 'column':1},
    title = "Average Speed (mph)"))

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['cricket_deliveries'],
    domain = {'row':0, 'column':2},
    title = "Cricket Balls Faced"))

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['baseball_deliveries'],
    domain = {'row':1, 'column':2},
    title = "Tennis Shots Returned"))

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['tennis_deliveries'],
    domain = {'row':2, 'column':2},
    title = "Baseball Pitches Hit"))

    fig.update_layout(
        grid = {'rows':3, 'columns':3, 'pattern': "independent"},
        )
    
    fig.show()    

In [2322]:
headline_statistics_visual('0070@stores.gane.co.uk', 'GAME STORE')

# 8. Cricket Statistics and Dashboard 

In [2323]:
#Identifying all cricket deliveries within the combined dataframe
cricket_df = combined_df.loc[combined_df['cricket']==1]
cricket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1907084 entries, 0 to 3110935
Data columns (total 43 columns):
 #   Column         Dtype  
---  ------         -----  
 0   sim_number     int64  
 1   timestamp_x    object 
 2   batfast_id     object 
 3   client_name    object 
 4   event_name     object 
 5   game_mode      object 
 6   score          object 
 7   speed          float64
 8   pitch          float64
 9   swing          float64
 10  pan            float64
 11  turn           float64
 12  r              float64
 13  theta          float64
 14  z              float64
 15  power          float64
 16  machine        object 
 17  scoring        object 
 18  length         object 
 19  email          object 
 20  name           object 
 21  gender         object 
 22  age_group      object 
 23  hand           object 
 24  timestamp_y    object 
 25  updated        object 
 26  speed_adj      float64
 27  swing_adj      float64
 28  theta_adj      float64
 29  hour          

## 8.1 Cricket Simulator Uses

In [2324]:
#Calculating the percentage of entertainment vs. training deliveries in the cricket dataframe
cricket_training_percent = (cricket_df['training'].sum()/len(cricket_df))*100
cricket_entertainment_percent = (cricket_df['entertainment'].sum()/len(cricket_df))*100

print(f"% of cricket simulators used for training: {round(cricket_training_percent,1)}")
print(f"% of cricket simulators used for entertainment: {round(cricket_entertainment_percent,1)}")

% of cricket simulators used for training: 15.3
% of cricket simulators used for entertainment: 84.7


The above shows that cricket simulators are overwhelmingly used for entertainment purposes although there is still a sizeable minority used for training purposes.

In [2325]:
#Sorting BatFast's cricket clients
cricket_clients = cricket_df.groupby(['client_name']).count() 
cricket_clients.sort_values('timestamp_x', ascending=False, inplace=True)
cricket_clients

Unnamed: 0_level_0,sim_number,timestamp_x,batfast_id,event_name,game_mode,score,speed,pitch,swing,pan,...,date,score_adj,runs,points,wicket,training,entertainment,cricket,baseball,tennis
client_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
funholics-pakistan,854619,854619,854619,854619,854619,6898,824506,824506,824506,824506,...,854619,6898,854619,6898,530,854619,854619,854619,854619,854619
cricket-simulator-canada,230874,230874,230874,230874,230874,206501,230874,230874,230874,216457,...,230874,206501,230874,206501,6,230874,230874,230874,230874,230874
tenpin,118051,118051,118051,118051,118051,115636,118051,118051,118051,118051,...,118051,115636,118051,115636,24,118051,118051,118051,118051,118051
batfast,113421,113421,113421,113421,113421,53816,111191,111191,111191,110560,...,113421,53816,113421,53816,1244,113421,113421,113421,113421,113421
the-game-adelaide,86384,86384,86384,86384,86384,73393,86384,86384,86384,86384,...,86384,73393,86384,73393,917,86384,86384,86384,86384,86384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
bnp-paribas,194,194,194,194,194,165,194,194,194,194,...,194,165,194,165,34,194,194,194,194,194
mlb-series-london,175,175,175,175,175,0,175,175,175,175,...,175,0,175,0,0,175,175,175,175,175
cerebrum-matters,161,161,161,161,161,18,161,161,161,161,...,161,18,161,18,0,161,161,161,161,161
dallas-polo,80,80,80,80,80,20,80,80,80,0,...,80,20,80,20,1,80,80,80,80,80


In [2326]:
#Finding BatFast's top 5 cricket clients
top5_clients = list(cricket_clients.index)[:5]
print(f"BatFast's top 5 cricket clients are:\n1. {top5_clients[0]}\n2. {top5_clients[1]}\n3. {top5_clients[2]}\n4. {top5_clients[3]}\n5. {top5_clients[4]}")

BatFast's top 5 cricket clients are:
1. funholics-pakistan
2. cricket-simulator-canada
3. tenpin
4. batfast
5. the-game-adelaide


## 8.2 Cricket Statistics 

### Cricket Deliveries Faced

In [2327]:
# Cricket Deliveries Faced
user_deliveries_function(cricket_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,deliveries_faced
email,name,Unnamed: 2_level_1
0070@stores.gane.co.uk,GAME STORE,10
012483@stretfordhigh.com,HANY,6
05tsmith@brightoncollege.net,TOMAS SMITH,17
0845100989.gf@gmail.com,GERRIE,6
08darsh@gmail.com,AVNEET,6
...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,6
zwebb05@icloud.com,ZAK,32
zygarde1234@gmail.com,SABBIR AHMED,24
zyounus@gmail.com,ZEESHAN,6


### Total Runs Scored

In [2328]:
#Finding the total runs scored by each user
total_runs_scored = cricket_df[['email','name','runs']].groupby(['email','name']).sum()
total_runs_scored = total_runs_scored['runs'].to_frame()
total_runs_scored.rename(columns = {"runs":"total_runs_scored"},inplace=True)

In [2329]:
total_runs_scored

Unnamed: 0_level_0,Unnamed: 1_level_0,total_runs_scored
email,name,Unnamed: 2_level_1
0070@stores.gane.co.uk,GAME STORE,18.0
012483@stretfordhigh.com,HANY,11.0
05tsmith@brightoncollege.net,TOMAS SMITH,0.0
0845100989.gf@gmail.com,GERRIE,8.0
08darsh@gmail.com,AVNEET,12.0
...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,16.0
zwebb05@icloud.com,ZAK,59.0
zygarde1234@gmail.com,SABBIR AHMED,48.0
zyounus@gmail.com,ZEESHAN,7.0


###  Total Points Scored

In [2330]:
#Finding the total runs scored by each user
total_points_scored = cricket_df[['email','name','points']].groupby(['email','name']).sum()
total_points_scored = total_points_scored['points'].to_frame()
total_points_scored.rename(columns = {"points":"total_points_scored"},inplace=True)

In [2331]:
total_points_scored

Unnamed: 0_level_0,Unnamed: 1_level_0,total_points_scored
email,name,Unnamed: 2_level_1
0070@stores.gane.co.uk,GAME STORE,0.0
012483@stretfordhigh.com,HANY,0.0
05tsmith@brightoncollege.net,TOMAS SMITH,0.0
0845100989.gf@gmail.com,GERRIE,0.0
08darsh@gmail.com,AVNEET,0.0
...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,0.0
zwebb05@icloud.com,ZAK,0.0
zygarde1234@gmail.com,SABBIR AHMED,0.0
zyounus@gmail.com,ZEESHAN,0.0


### Average Runs and Points

In order to work out a user's average runs, we must calculate the number of times a user has been out whilst facing deliveries that revolved around scoring runs and NOT points (and vice versa for average points). This is to ensure we can calculate a separate average for points, that only accounts for wickets that are taken during game modes that use points. Otherwise, the average runs and points would be skewed by wickets which were not relevant to the calculation of each particular average. 

According to BatFast, the only time a cricket delivery would involve scoring points is at the clients TenPin. As the below shows aside from an anomalous 5 deliveries all points that are scored within the dataset are from TenPin. Thus wickets under points game modes will only be incurred at TenPin. All other wickets will be used to calculate the average for runs. 

<!-- However, this poses an issue. No specific description of the scoring methods used for each game mode is given, thus it is not particularly clear which wickets should be used when calculating the average runs/ points of a user. In other words, it is not clear which wickets should be used to calculate average runs or average points. 

On inspection, as shown below, when looking at what game modes use runs (scores of 1-6) and which use points (scores of +- >6), there is a cross over, indicating that some game modes use both runs and points as a scoring method. Therefore, it is impossible to discern between runs and points based on game mode.  -->

In [2332]:
tenpin_check = cricket_df.loc[cricket_df['points']>0]
tenpin_check['client_name'].value_counts()

tenpin     109619
batfast         3
sixes           2
Name: client_name, dtype: int64

In [2333]:
#Creating wicket indicators for both bowled and caught behind for when deliveries ended up taking the wicket of users
wickets_df = pd.get_dummies(cricket_df['wicket'])
wickets_df

Unnamed: 0,Bowled,Caught Behind
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
3110931,0,0
3110932,0,0
3110933,0,0
3110934,0,0


In [2334]:
#Merging the wickets dataframe to the main cricket dataframne
cricket_df = cricket_df.merge(wickets_df, left_index=True, right_index=True, how='inner')
cricket_df['Bowled'] = np.where(cricket_df['client_name']!='tenpin', cricket_df['Bowled'],0)
cricket_df['Caught Behind'] = np.where(cricket_df['client_name']!='tenpin', cricket_df['Caught Behind'],0)
cricket_df['Points Bowled'] = np.where(cricket_df['client_name']=='tenpin', cricket_df['Bowled'],0)
cricket_df['Points Caught Behind'] = np.where(cricket_df['client_name']=='tenpin', cricket_df['Caught Behind'],0)
cricket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1907084 entries, 0 to 3110935
Data columns (total 47 columns):
 #   Column                Dtype  
---  ------                -----  
 0   sim_number            int64  
 1   timestamp_x           object 
 2   batfast_id            object 
 3   client_name           object 
 4   event_name            object 
 5   game_mode             object 
 6   score                 object 
 7   speed                 float64
 8   pitch                 float64
 9   swing                 float64
 10  pan                   float64
 11  turn                  float64
 12  r                     float64
 13  theta                 float64
 14  z                     float64
 15  power                 float64
 16  machine               object 
 17  scoring               object 
 18  length                object 
 19  email                 object 
 20  name                  object 
 21  gender                object 
 22  age_group             object 
 23  hand   

In [2335]:
#Calculating the total number of each type of dismissal for each user
average_runs_points = cricket_df[['email','name','wicket','Bowled', 'Caught Behind','Points Bowled', 'Points Caught Behind']].groupby(['email','name']).sum()
#Creating a total wickets columns to sum the two bowled and caught behind columns
average_runs_points['total_wickets'] = average_runs_points['Bowled'] + average_runs_points['Caught Behind'] + average_runs_points['Points Bowled'] + average_runs_points['Points Caught Behind']

#Merging the dataframe with the total number of runs and points scored
average_runs_points = average_runs_points.merge(total_runs_scored, left_index=True, right_index=True, how='outer')
average_runs_points = average_runs_points.merge(total_points_scored, left_index=True, right_index=True, how='outer')
average_runs_points

Unnamed: 0_level_0,Unnamed: 1_level_0,Bowled,Caught Behind,Points Bowled,Points Caught Behind,total_wickets,total_runs_scored,total_points_scored
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0070@stores.gane.co.uk,GAME STORE,0.0,0.0,0.0,0.0,0.0,18.0,0.0
012483@stretfordhigh.com,HANY,1.0,0.0,0.0,0.0,1.0,11.0,0.0
05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0845100989.gf@gmail.com,GERRIE,1.0,1.0,0.0,0.0,2.0,8.0,0.0
08darsh@gmail.com,AVNEET,0.0,0.0,0.0,0.0,0.0,12.0,0.0
...,...,...,...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,1.0,0.0,0.0,0.0,1.0,16.0,0.0
zwebb05@icloud.com,ZAK,0.0,0.0,0.0,0.0,0.0,59.0,0.0
zygarde1234@gmail.com,SABBIR AHMED,3.0,0.0,0.0,0.0,3.0,48.0,0.0
zyounus@gmail.com,ZEESHAN,2.0,0.0,0.0,0.0,2.0,7.0,0.0


In [2551]:
#Calculating average runs and points from the above dataframe
average_runs_points['average_runs'] = average_runs_points['total_runs_scored']/((average_runs_points['Bowled'] + average_runs_points['Caught Behind']))
average_runs_points['average_points'] = average_runs_points['total_points_scored']/((average_runs_points['Points Bowled'] + average_runs_points['Points Caught Behind']))
average_runs_points

Unnamed: 0_level_0,Unnamed: 1_level_0,Bowled,Caught Behind,Points Bowled,Points Caught Behind,total_wickets,total_runs_scored,total_points_scored,average_runs,average_points
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0070@stores.gane.co.uk,GAME STORE,0.0,0.0,0.0,0.0,0.0,18.0,0.0,inf,
012483@stretfordhigh.com,HANY,1.0,0.0,0.0,0.0,1.0,11.0,0.0,11.0,
05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
0845100989.gf@gmail.com,GERRIE,1.0,1.0,0.0,0.0,2.0,8.0,0.0,4.0,
08darsh@gmail.com,AVNEET,0.0,0.0,0.0,0.0,0.0,12.0,0.0,inf,
...,...,...,...,...,...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,1.0,0.0,0.0,0.0,1.0,16.0,0.0,16.0,
zwebb05@icloud.com,ZAK,0.0,0.0,0.0,0.0,0.0,59.0,0.0,inf,
zygarde1234@gmail.com,SABBIR AHMED,3.0,0.0,0.0,0.0,3.0,48.0,0.0,16.0,
zyounus@gmail.com,ZEESHAN,2.0,0.0,0.0,0.0,2.0,7.0,0.0,3.5,


In [2337]:
#Given that some users have scored runs/ points without getting out they have been given averages of infinity
#(total score/ 0 = infintiy). Therefore, whilst not technically their true average, these infinity values have 
#been overwritten and simply replaced with the total number of points/ runs scored to make the dashboard statistics
#more engaging

average_runs_points['average_runs'].loc[average_runs_points['average_runs'] ==np.inf] = average_runs_points['total_runs_scored']
average_runs_points['average_points'].loc[average_runs_points['average_points'] ==np.inf] = average_runs_points['total_points_scored']
average_runs_points

Unnamed: 0_level_0,Unnamed: 1_level_0,Bowled,Caught Behind,Points Bowled,Points Caught Behind,total_wickets,total_runs_scored,total_points_scored,average_runs,average_points
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0070@stores.gane.co.uk,GAME STORE,0.0,0.0,0.0,0.0,0.0,18.0,0.0,18.0,
012483@stretfordhigh.com,HANY,1.0,0.0,0.0,0.0,1.0,11.0,0.0,11.0,
05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
0845100989.gf@gmail.com,GERRIE,1.0,1.0,0.0,0.0,2.0,8.0,0.0,4.0,
08darsh@gmail.com,AVNEET,0.0,0.0,0.0,0.0,0.0,12.0,0.0,12.0,
...,...,...,...,...,...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,1.0,0.0,0.0,0.0,1.0,16.0,0.0,16.0,
zwebb05@icloud.com,ZAK,0.0,0.0,0.0,0.0,0.0,59.0,0.0,59.0,
zygarde1234@gmail.com,SABBIR AHMED,3.0,0.0,0.0,0.0,3.0,48.0,0.0,16.0,
zyounus@gmail.com,ZEESHAN,2.0,0.0,0.0,0.0,2.0,7.0,0.0,3.5,


### Comparative Performance

Given the more fun focused side of the entertainment dashboard and as suggested in the BatFast project brief, entertainemnt users will likely find it satisfuing to know where their performance stands relative to other BatFast entertainment users. Thus, four rankings will be given to the users. The first will be ranking their total runs and scores, the second the number of deliveries faced, the third the number of times they have been dismissed and fourth and finally their average runs and scores. 

In [2338]:
rank_df = average_runs_points[['total_runs_scored','total_points_scored','total_wickets','average_runs']].merge(user_deliveries_faced, left_index=True, right_index=True, how='outer')
rank_df

Unnamed: 0_level_0,Unnamed: 1_level_0,total_runs_scored,total_points_scored,total_wickets,average_runs,deliveries_faced
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0070@stores.gane.co.uk,GAME STORE,18.0,0.0,0.0,18.0,10
012483@stretfordhigh.com,HANY,11.0,0.0,1.0,11.0,6
05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0,0.0,,17
0845100989.gf@gmail.com,GERRIE,8.0,0.0,2.0,4.0,6
08darsh@gmail.com,AVNEET,12.0,0.0,0.0,12.0,6
...,...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,16.0,0.0,1.0,16.0,6
zwebb05@icloud.com,ZAK,59.0,0.0,0.0,59.0,32
zygarde1234@gmail.com,SABBIR AHMED,48.0,0.0,3.0,16.0,24
zyounus@gmail.com,ZEESHAN,7.0,0.0,2.0,3.5,6


In [2339]:
# Ranking Total Runs
rank_df['runs_rank'] = rank_df['total_runs_scored'].rank(ascending=False)
rank_df['runs_rank_percentile'] = (round(rank_df['runs_rank'].rank(pct=True),2))*100
rank_df

Unnamed: 0_level_0,Unnamed: 1_level_0,total_runs_scored,total_points_scored,total_wickets,average_runs,deliveries_faced,runs_rank,runs_rank_percentile
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0070@stores.gane.co.uk,GAME STORE,18.0,0.0,0.0,18.0,10,3976.5,30.0
012483@stretfordhigh.com,HANY,11.0,0.0,1.0,11.0,6,6820.0,52.0
05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0,0.0,,17,12509.0,95.0
0845100989.gf@gmail.com,GERRIE,8.0,0.0,2.0,4.0,6,8508.5,64.0
08darsh@gmail.com,AVNEET,12.0,0.0,0.0,12.0,6,6314.5,48.0
...,...,...,...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,16.0,0.0,1.0,16.0,6,4620.0,35.0
zwebb05@icloud.com,ZAK,59.0,0.0,0.0,59.0,32,927.5,7.0
zygarde1234@gmail.com,SABBIR AHMED,48.0,0.0,3.0,16.0,24,1199.0,9.0
zyounus@gmail.com,ZEESHAN,7.0,0.0,2.0,3.5,6,9056.0,69.0


In [2340]:
# Ranking Total Points
rank_df['points_rank'] = rank_df['total_points_scored'].rank(ascending=False)
rank_df['points_rank_percentile'] = (round(rank_df['points_rank'].rank(pct=True),2))*100
rank_df

Unnamed: 0_level_0,Unnamed: 1_level_0,total_runs_scored,total_points_scored,total_wickets,average_runs,deliveries_faced,runs_rank,runs_rank_percentile,points_rank,points_rank_percentile
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0070@stores.gane.co.uk,GAME STORE,18.0,0.0,0.0,18.0,10,3976.5,30.0,6689.5,51.0
012483@stretfordhigh.com,HANY,11.0,0.0,1.0,11.0,6,6820.0,52.0,6689.5,51.0
05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0,0.0,,17,12509.0,95.0,6689.5,51.0
0845100989.gf@gmail.com,GERRIE,8.0,0.0,2.0,4.0,6,8508.5,64.0,6689.5,51.0
08darsh@gmail.com,AVNEET,12.0,0.0,0.0,12.0,6,6314.5,48.0,6689.5,51.0
...,...,...,...,...,...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,16.0,0.0,1.0,16.0,6,4620.0,35.0,6689.5,51.0
zwebb05@icloud.com,ZAK,59.0,0.0,0.0,59.0,32,927.5,7.0,6689.5,51.0
zygarde1234@gmail.com,SABBIR AHMED,48.0,0.0,3.0,16.0,24,1199.0,9.0,6689.5,51.0
zyounus@gmail.com,ZEESHAN,7.0,0.0,2.0,3.5,6,9056.0,69.0,6689.5,51.0


In [2341]:
#Ranking Total Deliveries Faced
rank_df['deliveries_rank'] = rank_df['deliveries_faced'].rank(ascending=False)
rank_df['deliveries_rank_percentile'] = (round(rank_df['deliveries_rank'].rank(pct=True),2))*100
rank_df

Unnamed: 0_level_0,Unnamed: 1_level_0,total_runs_scored,total_points_scored,total_wickets,average_runs,deliveries_faced,runs_rank,runs_rank_percentile,points_rank,points_rank_percentile,deliveries_rank,deliveries_rank_percentile
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0070@stores.gane.co.uk,GAME STORE,18.0,0.0,0.0,18.0,10,3976.5,30.0,6689.5,51.0,5186.0,39.0
012483@stretfordhigh.com,HANY,11.0,0.0,1.0,11.0,6,6820.0,52.0,6689.5,51.0,9380.5,70.0
05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0,0.0,,17,12509.0,95.0,6689.5,51.0,3148.0,24.0
0845100989.gf@gmail.com,GERRIE,8.0,0.0,2.0,4.0,6,8508.5,64.0,6689.5,51.0,9380.5,70.0
08darsh@gmail.com,AVNEET,12.0,0.0,0.0,12.0,6,6314.5,48.0,6689.5,51.0,9380.5,70.0
...,...,...,...,...,...,...,...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,16.0,0.0,1.0,16.0,6,4620.0,35.0,6689.5,51.0,9380.5,70.0
zwebb05@icloud.com,ZAK,59.0,0.0,0.0,59.0,32,927.5,7.0,6689.5,51.0,1419.0,11.0
zygarde1234@gmail.com,SABBIR AHMED,48.0,0.0,3.0,16.0,24,1199.0,9.0,6689.5,51.0,2051.5,15.0
zyounus@gmail.com,ZEESHAN,7.0,0.0,2.0,3.5,6,9056.0,69.0,6689.5,51.0,9380.5,70.0


In [2342]:
#Ranking Total Wickets
rank_df['wickets_rank'] = rank_df['total_wickets'].rank(ascending=False)
rank_df['wickets_rank_percentile'] = (round(rank_df['wickets_rank'].rank(pct=True),2))*100
rank_df

Unnamed: 0_level_0,Unnamed: 1_level_0,total_runs_scored,total_points_scored,total_wickets,average_runs,deliveries_faced,runs_rank,runs_rank_percentile,points_rank,points_rank_percentile,deliveries_rank,deliveries_rank_percentile,wickets_rank,wickets_rank_percentile
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0070@stores.gane.co.uk,GAME STORE,18.0,0.0,0.0,18.0,10,3976.5,30.0,6689.5,51.0,5186.0,39.0,8628.5,65.0
012483@stretfordhigh.com,HANY,11.0,0.0,1.0,11.0,6,6820.0,52.0,6689.5,51.0,9380.5,70.0,2999.0,23.0
05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0,0.0,,17,12509.0,95.0,6689.5,51.0,3148.0,24.0,8628.5,65.0
0845100989.gf@gmail.com,GERRIE,8.0,0.0,2.0,4.0,6,8508.5,64.0,6689.5,51.0,9380.5,70.0,1487.0,11.0
08darsh@gmail.com,AVNEET,12.0,0.0,0.0,12.0,6,6314.5,48.0,6689.5,51.0,9380.5,70.0,8628.5,65.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,16.0,0.0,1.0,16.0,6,4620.0,35.0,6689.5,51.0,9380.5,70.0,2999.0,23.0
zwebb05@icloud.com,ZAK,59.0,0.0,0.0,59.0,32,927.5,7.0,6689.5,51.0,1419.0,11.0,8628.5,65.0
zygarde1234@gmail.com,SABBIR AHMED,48.0,0.0,3.0,16.0,24,1199.0,9.0,6689.5,51.0,2051.5,15.0,804.5,6.0
zyounus@gmail.com,ZEESHAN,7.0,0.0,2.0,3.5,6,9056.0,69.0,6689.5,51.0,9380.5,70.0,1487.0,11.0


In [2343]:
#Ranking Average Runs
rank_df['average_runs_rank'] = rank_df['average_runs'].rank(ascending=False)
rank_df['average_runs_rank_percentile'] = (round(rank_df['average_runs_rank'].rank(pct=True),2))*100
rank_df

Unnamed: 0_level_0,Unnamed: 1_level_0,total_runs_scored,total_points_scored,total_wickets,average_runs,deliveries_faced,runs_rank,runs_rank_percentile,points_rank,points_rank_percentile,deliveries_rank,deliveries_rank_percentile,wickets_rank,wickets_rank_percentile,average_runs_rank,average_runs_rank_percentile
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0070@stores.gane.co.uk,GAME STORE,18.0,0.0,0.0,18.0,10,3976.5,30.0,6689.5,51.0,5186.0,39.0,8628.5,65.0,3311.0,28.0
012483@stretfordhigh.com,HANY,11.0,0.0,1.0,11.0,6,6820.0,52.0,6689.5,51.0,9380.5,70.0,2999.0,23.0,6266.5,52.0
05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0,0.0,,17,12509.0,95.0,6689.5,51.0,3148.0,24.0,8628.5,65.0,,
0845100989.gf@gmail.com,GERRIE,8.0,0.0,2.0,4.0,6,8508.5,64.0,6689.5,51.0,9380.5,70.0,1487.0,11.0,10303.5,86.0
08darsh@gmail.com,AVNEET,12.0,0.0,0.0,12.0,6,6314.5,48.0,6689.5,51.0,9380.5,70.0,8628.5,65.0,5732.5,48.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,16.0,0.0,1.0,16.0,6,4620.0,35.0,6689.5,51.0,9380.5,70.0,2999.0,23.0,3963.0,33.0
zwebb05@icloud.com,ZAK,59.0,0.0,0.0,59.0,32,927.5,7.0,6689.5,51.0,1419.0,11.0,8628.5,65.0,503.5,4.0
zygarde1234@gmail.com,SABBIR AHMED,48.0,0.0,3.0,16.0,24,1199.0,9.0,6689.5,51.0,2051.5,15.0,804.5,6.0,3963.0,33.0
zyounus@gmail.com,ZEESHAN,7.0,0.0,2.0,3.5,6,9056.0,69.0,6689.5,51.0,9380.5,70.0,1487.0,11.0,10603.0,88.0


In [2344]:
rank_df[['runs_rank','runs_rank_percentile']]

Unnamed: 0_level_0,Unnamed: 1_level_0,runs_rank,runs_rank_percentile
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1
0070@stores.gane.co.uk,GAME STORE,3976.5,30.0
012483@stretfordhigh.com,HANY,6820.0,52.0
05tsmith@brightoncollege.net,TOMAS SMITH,12509.0,95.0
0845100989.gf@gmail.com,GERRIE,8508.5,64.0
08darsh@gmail.com,AVNEET,6314.5,48.0
...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,4620.0,35.0
zwebb05@icloud.com,ZAK,927.5,7.0
zygarde1234@gmail.com,SABBIR AHMED,1199.0,9.0
zyounus@gmail.com,ZEESHAN,9056.0,69.0


### Boundary Analysis

In [2345]:
#Indicating which deliveries were hit for 4 and 6
cricket_df['4_boundary'] = np.where(cricket_df['runs']==4,1,0)
cricket_df['6_boundary'] = np.where(cricket_df['runs']==6,1,0)
cricket_df['4_distance'] = np.where(cricket_df['runs']==4,cricket_df['r'],0)
cricket_df['6_distance'] = np.where(cricket_df['runs']==6,cricket_df['r'],0)

In [2346]:
#Finding total boundary count for each user
boundary_count = cricket_df[['email','name','4_boundary','6_boundary']].groupby(['email','name']).sum()
boundary_count.rename(columns={"4_boundary":'number_4s',"6_boundary":"number_6s"}, inplace=True)
boundary_count

Unnamed: 0_level_0,Unnamed: 1_level_0,number_4s,number_6s
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1
0070@stores.gane.co.uk,GAME STORE,3,1
012483@stretfordhigh.com,HANY,0,0
05tsmith@brightoncollege.net,TOMAS SMITH,0,0
0845100989.gf@gmail.com,GERRIE,2,0
08darsh@gmail.com,AVNEET,0,0
...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,1,2
zwebb05@icloud.com,ZAK,2,2
zygarde1234@gmail.com,SABBIR AHMED,6,0
zyounus@gmail.com,ZEESHAN,1,0


In [2347]:
#Finding the maximum boundary distance for each boundary type
boundary_distance = cricket_df[['email','name','4_distance','6_distance']].groupby(['email','name']).max()
boundary_distance

Unnamed: 0_level_0,Unnamed: 1_level_0,4_distance,6_distance
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1
0070@stores.gane.co.uk,GAME STORE,0.0,0.0
012483@stretfordhigh.com,HANY,0.0,0.0
05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0
0845100989.gf@gmail.com,GERRIE,0.0,0.0
08darsh@gmail.com,AVNEET,0.0,0.0
...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,0.0,0.0
zwebb05@icloud.com,ZAK,0.0,0.0
zygarde1234@gmail.com,SABBIR AHMED,0.0,0.0
zyounus@gmail.com,ZEESHAN,0.0,0.0


In [2348]:
#Merging the two boundary analysis dataframes
boundary_analysis = boundary_count.merge(boundary_distance, left_index=True, right_index=True, how='outer')
boundary_analysis

Unnamed: 0_level_0,Unnamed: 1_level_0,number_4s,number_6s,4_distance,6_distance
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0070@stores.gane.co.uk,GAME STORE,3,1,0.0,0.0
012483@stretfordhigh.com,HANY,0,0,0.0,0.0
05tsmith@brightoncollege.net,TOMAS SMITH,0,0,0.0,0.0
0845100989.gf@gmail.com,GERRIE,2,0,0.0,0.0
08darsh@gmail.com,AVNEET,0,0,0.0,0.0
...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,1,2,0.0,0.0
zwebb05@icloud.com,ZAK,2,2,0.0,0.0
zygarde1234@gmail.com,SABBIR AHMED,6,0,0.0,0.0
zyounus@gmail.com,ZEESHAN,1,0,0.0,0.0


### Max-Min Speed of User

In [2349]:
#Calculated above in headline statistics
max_min_speed = fastest_delivery.merge(slowest_delivery, left_index=True, right_index=True, how='outer')
max_min_speed

Unnamed: 0_level_0,Unnamed: 1_level_0,max_delivery_speed,min_delivery_speed
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1
0070@stores.gane.co.uk,GAME STORE,70.0,60.0
012483@stretfordhigh.com,HANY,58.3,58.3
05tsmith@brightoncollege.net,TOMAS SMITH,100.0,100.0
0845100989.gf@gmail.com,GERRIE,100.0,70.0
08darsh@gmail.com,AVNEET,20.0,20.0
...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,,
zwebb05@icloud.com,ZAK,50.0,20.0
zygarde1234@gmail.com,SABBIR AHMED,50.0,40.0
zyounus@gmail.com,ZEESHAN,60.0,60.0


### Average Speed

In [2350]:
#Again already calculated in the above headline statistics
average_speed

Unnamed: 0_level_0,Unnamed: 1_level_0,average_speed
email,name,Unnamed: 2_level_1
0070@stores.gane.co.uk,GAME STORE,63.0
012483@stretfordhigh.com,HANY,58.3
05tsmith@brightoncollege.net,TOMAS SMITH,100.0
0845100989.gf@gmail.com,GERRIE,80.0
08darsh@gmail.com,AVNEET,20.0
...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,0.0
zwebb05@icloud.com,ZAK,37.8
zygarde1234@gmail.com,SABBIR AHMED,47.9
zyounus@gmail.com,ZEESHAN,60.0


### Average Runs at Different Speeds

In [2351]:
#Creating the bins for the different speeds
bins = [0,10,20,30,40,50,60,70,80,90,101]
names = ["0-10","10-20","20-30","30-40","40-50","50-60","60-70","70-80","80-90","90-100"]

#Implementing the above bins to create indicators for each speed category
cricket_df['speed_bins'] = pd.cut(cricket_df['speed_adj'], bins, labels=names)
cricket_df

Unnamed: 0,sim_number,timestamp_x,batfast_id,client_name,event_name,game_mode,score,speed,pitch,swing,...,tennis,Bowled,Caught Behind,Points Bowled,Points Caught Behind,4_boundary,6_boundary,4_distance,6_distance,speed_bins
0,12,2018-04-10 16:45:57+00:00,bfs1202458,exchange-mall-illford,exchange-mall-2018,Whack,Dead Ball,6.0,20.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,50-60
1,12,2018-04-10 16:46:34+00:00,bfs1202458,exchange-mall-illford,exchange-mall-2018,Whack,Dead Ball,6.0,20.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,50-60
2,12,2018-04-10 16:46:54+00:00,bfs1202458,exchange-mall-illford,exchange-mall-2018,Whack,Dead Ball,6.0,20.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,50-60
3,12,2018-04-10 16:47:14+00:00,bfs1202458,exchange-mall-illford,exchange-mall-2018,Whack,0,6.0,20.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,50-60
4,12,2018-04-10 16:47:36+00:00,bfs1202458,exchange-mall-illford,exchange-mall-2018,Whack,6,6.0,20.0,0.0,...,0,0,0,0,0,0,1,0.0,,50-60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3110931,9,2020-03-13 19:52:40+00:00,bfs901873,batfast,36-hours-sports-relief,Training,1,7.4,2.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,70-80
3110932,9,2020-03-13 19:52:55+00:00,bfs901873,batfast,36-hours-sports-relief,Training,1,7.4,2.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,70-80
3110933,9,2020-03-13 19:53:09+00:00,bfs901873,batfast,36-hours-sports-relief,Training,1,7.4,2.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,70-80
3110934,9,2020-03-13 19:53:24+00:00,bfs901873,batfast,36-hours-sports-relief,Training,0,7.4,2.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,70-80


In [2352]:
# Example dataframe of the bins for one user (grouping for all users will not occur here due to the code
#repeatedly killing the kernel. Therefore, the grouping will be done at the indivudal level within the visualisation
#function)
speed_runs_df = cricket_df.loc[(cricket_df['email']=='alex.mathews@batfast.com') & (cricket_df['name']=='ALEX MATHEWS')] 
speed_runs_df

average_runs_speed_bin_df = speed_runs_df[['email','name','runs','speed_bins']].groupby(['email','name','speed_bins']).mean()
average_runs_speed_bin_df.rename(columns={"runs":"average_runs_per_speed_bin"}, inplace=True)
average_runs_speed_bin_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,average_runs_per_speed_bin
email,name,speed_bins,Unnamed: 3_level_1
alex.mathews@batfast.com,ALEX MATHEWS,0-10,
alex.mathews@batfast.com,ALEX MATHEWS,10-20,1.0
alex.mathews@batfast.com,ALEX MATHEWS,20-30,0.378378
alex.mathews@batfast.com,ALEX MATHEWS,30-40,0.955157
alex.mathews@batfast.com,ALEX MATHEWS,40-50,0.981132
alex.mathews@batfast.com,ALEX MATHEWS,50-60,1.482759
alex.mathews@batfast.com,ALEX MATHEWS,60-70,1.166667
alex.mathews@batfast.com,ALEX MATHEWS,70-80,
alex.mathews@batfast.com,ALEX MATHEWS,80-90,
alex.mathews@batfast.com,ALEX MATHEWS,90-100,


### Average Runs for Different Length Balls

In [2353]:
#Checking the cricket_df length values
cricket_df['length'].value_counts()

Yorker             29419
Extra Short        25652
Full                3985
Good                3354
Short               1765
Mid Strike Zone      113
Service Line          16
Net Line               2
Name: length, dtype: int64

The above shows that there are some erroneous length values within the cricket dataframe as 'Mid Strike Zone', 'Service Line' and 'Net Line' are baseball and tennis descriptions respectively. The below dataframe shows that the game mode for these deliveries was Test Ball (a cricket game mode) and also that some deliveries were for the client, 'Sixes' which is a cricket specific entertainment pub/ venue. Thus, these length values are incorrect and will be replaced with NaN values.

In [2354]:
#Setting the incorrectly labelled game mode cells to np.NaN
cricket_df['length'].loc[(cricket_df['length']=='Service Line')|(cricket_df['length']=='Net Line')|(cricket_df['length']=='Mid Strike Zone')] = np.NaN
cricket_df['length'].value_counts()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Yorker         29419
Extra Short    25652
Full            3985
Good            3354
Short           1765
Name: length, dtype: int64

In [2355]:
#Creating indicator values for each relevant length of delivery. NOTE that np.NaN instead of 0 is used for the
#deliveries which are not that specific length. This is done so that when the mean is caluclated using the 
#groupby.mean() function, the other length deliveries are ingored for that specific lenght. I.e. the total runs
#for yorkers will only be averaged by the number of yorker length balls.
cricket_df['Yorker_runs'] = np.where(cricket_df['length']=='Yorker',cricket_df['runs'],np.NaN)
cricket_df['Full_runs'] = np.where(cricket_df['length']=='Full',cricket_df['runs'],np.NaN)
cricket_df['Good_runs'] = np.where(cricket_df['length']=='Good',cricket_df['runs'],np.NaN)
cricket_df['Short_runs'] = np.where(cricket_df['length']=='Short',cricket_df['runs'],np.NaN)
cricket_df['Extra_short_runs'] = np.where(cricket_df['length']=='Extra Short',cricket_df['runs'],np.NaN)
cricket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1907084 entries, 0 to 3110935
Data columns (total 57 columns):
 #   Column                Dtype   
---  ------                -----   
 0   sim_number            int64   
 1   timestamp_x           object  
 2   batfast_id            object  
 3   client_name           object  
 4   event_name            object  
 5   game_mode             object  
 6   score                 object  
 7   speed                 float64 
 8   pitch                 float64 
 9   swing                 float64 
 10  pan                   float64 
 11  turn                  float64 
 12  r                     float64 
 13  theta                 float64 
 14  z                     float64 
 15  power                 float64 
 16  machine               object  
 17  scoring               object  
 18  length                object  
 19  email                 object  
 20  name                  object  
 21  gender                object  
 22  age_group         

In [2356]:
#Calculating the average runs per delivery at each length
length_average_runs = cricket_df[['email','name','Yorker_runs','Full_runs','Good_runs','Short_runs','Extra_short_runs']].groupby(['email','name']).mean()
length_average_runs.rename(columns={'Yorker_runs':'Yorker_average_runs','Full_runs':'Full_average_runs','Good_runs':'Good_average_runs','Short_runs':'Short_average_runs','Extra_short_runs':'Extra_short_average_runs'}, inplace=True)
length_average_runs

Unnamed: 0_level_0,Unnamed: 1_level_0,Yorker_average_runs,Full_average_runs,Good_average_runs,Short_average_runs,Extra_short_average_runs
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0070@stores.gane.co.uk,GAME STORE,,,,,
012483@stretfordhigh.com,HANY,,,,,
05tsmith@brightoncollege.net,TOMAS SMITH,,,,,
0845100989.gf@gmail.com,GERRIE,,,,,
08darsh@gmail.com,AVNEET,,,,,
...,...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,,,,,
zwebb05@icloud.com,ZAK,,,,,
zygarde1234@gmail.com,SABBIR AHMED,,,,,
zyounus@gmail.com,ZEESHAN,,,,,


### Average Swing Performance

In [2357]:
#Identifying away and in swinging deliveries as well as straight deliveries (and therefore which
#runs came off of whcih type of swing)
cricket_df['away_swing_runs'] = np.where(cricket_df['swing']>0,cricket_df['runs'],np.NaN)
cricket_df['in_swing_runs'] = np.where(cricket_df['swing']<0,cricket_df['runs'],np.NaN)
cricket_df['straight_runs'] = np.where(cricket_df['swing']==0,cricket_df['runs'],np.NaN)
cricket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1907084 entries, 0 to 3110935
Data columns (total 60 columns):
 #   Column                Dtype   
---  ------                -----   
 0   sim_number            int64   
 1   timestamp_x           object  
 2   batfast_id            object  
 3   client_name           object  
 4   event_name            object  
 5   game_mode             object  
 6   score                 object  
 7   speed                 float64 
 8   pitch                 float64 
 9   swing                 float64 
 10  pan                   float64 
 11  turn                  float64 
 12  r                     float64 
 13  theta                 float64 
 14  z                     float64 
 15  power                 float64 
 16  machine               object  
 17  scoring               object  
 18  length                object  
 19  email                 object  
 20  name                  object  
 21  gender                object  
 22  age_group         

In [2358]:
#Calculating the mean runs per delivery for each type of swing
swing_analysis = cricket_df[['email','name','away_swing_runs','in_swing_runs','straight_runs']].groupby(['email','name']).mean()
swing_analysis

Unnamed: 0_level_0,Unnamed: 1_level_0,away_swing_runs,in_swing_runs,straight_runs
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0070@stores.gane.co.uk,GAME STORE,,,1.800000
012483@stretfordhigh.com,HANY,,,1.833333
05tsmith@brightoncollege.net,TOMAS SMITH,,,0.000000
0845100989.gf@gmail.com,GERRIE,0.800000,,4.000000
08darsh@gmail.com,AVNEET,,,2.000000
...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,,,
zwebb05@icloud.com,ZAK,,,1.843750
zygarde1234@gmail.com,SABBIR AHMED,2.666667,,1.904762
zyounus@gmail.com,ZEESHAN,,,1.166667


### Offside vs Leg Side

Given we know the direction of the shot thanks to the 'theta' value showing us the angle the ball travelled from the batsmen, we can analyse both the proportion of a user's shots which are on the leg side, as well as the average number of runs scored on both sides of the wickets. 

We know that the bowler is 0 degrees from the batsmen (i.e. straight ahead) and that the direction then proceeds anti-clockwise from the bowler. Therefore, we know that the leg side of the pitch is shown by any theta value of 0-180 degrees and the offside of the pitch is signified by any values of 180-360 degrees. Therefore, we simply have to indicate which deliveries were hit into the off or leg side in the dataframne. 

In [2359]:
#Identifying which deliveries were hit to the off-side and leg-side
cricket_df['leg_side_shot'] = np.where((cricket_df['theta']>=0) & (cricket_df['theta']<180), 1,np.NaN)
cricket_df['off_side_shot'] = np.where((cricket_df['theta']>=180) & (cricket_df['theta']<360), 1,np.NaN)
cricket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1907084 entries, 0 to 3110935
Data columns (total 62 columns):
 #   Column                Dtype   
---  ------                -----   
 0   sim_number            int64   
 1   timestamp_x           object  
 2   batfast_id            object  
 3   client_name           object  
 4   event_name            object  
 5   game_mode             object  
 6   score                 object  
 7   speed                 float64 
 8   pitch                 float64 
 9   swing                 float64 
 10  pan                   float64 
 11  turn                  float64 
 12  r                     float64 
 13  theta                 float64 
 14  z                     float64 
 15  power                 float64 
 16  machine               object  
 17  scoring               object  
 18  length                object  
 19  email                 object  
 20  name                  object  
 21  gender                object  
 22  age_group         

In [2360]:
cricket_df[['leg_side_shot','off_side_shot']]
cricket_df['leg_side_shot'].sum()
cricket_df['off_side_shot'].sum()

65.0

The above shows that there is a hugely dominant side that users tend to hit - the leg side. 

In [2361]:
#Calculating the total number of leg-side vs. off-sdie shots
off_leg_side_analysis = cricket_df[['email','name','leg_side_shot','off_side_shot']].groupby(['email','name']).sum()
off_leg_side_analysis

Unnamed: 0_level_0,Unnamed: 1_level_0,leg_side_shot,off_side_shot
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1
0070@stores.gane.co.uk,GAME STORE,0.0,0.0
012483@stretfordhigh.com,HANY,5.0,0.0
05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0
0845100989.gf@gmail.com,GERRIE,0.0,0.0
08darsh@gmail.com,AVNEET,5.0,0.0
...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,0.0,0.0
zwebb05@icloud.com,ZAK,0.0,0.0
zygarde1234@gmail.com,SABBIR AHMED,0.0,0.0
zyounus@gmail.com,ZEESHAN,0.0,0.0


### Spin Analysis

In [2362]:
#Identifying which deliveries are offside (and therefore which runs came off of off-spin) 
cricket_df['off_spin'] = np.where(cricket_df['turn']<0, cricket_df['runs'],np.NaN)
cricket_df['leg_spin'] = np.where(cricket_df['turn']>0, cricket_df['runs'],np.NaN)


In [2363]:
#Finding total runs off each type of spin
spin_analysis_1 = cricket_df[['email','name','off_spin','leg_spin']].groupby(['email','name']).sum()
spin_analysis_1.rename(columns={"off_spin":"off_spin_total_runs","leg_spin":"leg_spin_total_runs"},inplace=True)
spin_analysis_1

Unnamed: 0_level_0,Unnamed: 1_level_0,off_spin_total_runs,leg_spin_total_runs
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1
0070@stores.gane.co.uk,GAME STORE,0.0,0.0
012483@stretfordhigh.com,HANY,0.0,0.0
05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0
0845100989.gf@gmail.com,GERRIE,0.0,0.0
08darsh@gmail.com,AVNEET,12.0,0.0
...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,0.0,0.0
zwebb05@icloud.com,ZAK,0.0,0.0
zygarde1234@gmail.com,SABBIR AHMED,0.0,0.0
zyounus@gmail.com,ZEESHAN,0.0,0.0


In [2364]:
#Finding MEAN runs per delivery off of each type of spin
spin_analysis_2 = cricket_df[['email','name','off_spin','leg_spin']].groupby(['email','name']).mean()
spin_analysis_2.rename(columns={"off_spin":"off_spin_average_runs","leg_spin":"leg_spin_average_runs"},inplace=True)
spin_analysis_2

Unnamed: 0_level_0,Unnamed: 1_level_0,off_spin_average_runs,leg_spin_average_runs
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1
0070@stores.gane.co.uk,GAME STORE,,
012483@stretfordhigh.com,HANY,,
05tsmith@brightoncollege.net,TOMAS SMITH,,
0845100989.gf@gmail.com,GERRIE,,
08darsh@gmail.com,AVNEET,2.0,
...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,,
zwebb05@icloud.com,ZAK,,
zygarde1234@gmail.com,SABBIR AHMED,,
zyounus@gmail.com,ZEESHAN,,


In [2365]:
#Merging the two spin dataframes together
spin_analysis = spin_analysis_1.merge(spin_analysis_2, left_index=True, right_index=True, how='outer')
spin_analysis

Unnamed: 0_level_0,Unnamed: 1_level_0,off_spin_total_runs,leg_spin_total_runs,off_spin_average_runs,leg_spin_average_runs
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0070@stores.gane.co.uk,GAME STORE,0.0,0.0,,
012483@stretfordhigh.com,HANY,0.0,0.0,,
05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0,,
0845100989.gf@gmail.com,GERRIE,0.0,0.0,,
08darsh@gmail.com,AVNEET,12.0,0.0,2.0,
...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,0.0,0.0,,
zwebb05@icloud.com,ZAK,0.0,0.0,,
zygarde1234@gmail.com,SABBIR AHMED,0.0,0.0,,
zyounus@gmail.com,ZEESHAN,0.0,0.0,,


### Spin versus Seam Performance

In [2366]:
#Identifying seam and spin deliveries

#Any time a ball is delivered with any turn (spin) it is indicated as being a spin ball
cricket_df['spin_delivery'] = np.where(abs(cricket_df['turn'])>0, 1, 0)
#Any ball that does not spin will be considered seam
cricket_df['seam_delivery'] = np.where(abs(cricket_df['turn'])>0, 0, 1)
cricket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1907084 entries, 0 to 3110935
Data columns (total 66 columns):
 #   Column                Dtype   
---  ------                -----   
 0   sim_number            int64   
 1   timestamp_x           object  
 2   batfast_id            object  
 3   client_name           object  
 4   event_name            object  
 5   game_mode             object  
 6   score                 object  
 7   speed                 float64 
 8   pitch                 float64 
 9   swing                 float64 
 10  pan                   float64 
 11  turn                  float64 
 12  r                     float64 
 13  theta                 float64 
 14  z                     float64 
 15  power                 float64 
 16  machine               object  
 17  scoring               object  
 18  length                object  
 19  email                 object  
 20  name                  object  
 21  gender                object  
 22  age_group         

In [2367]:
# Seam vs Spin Wickets
cricket_df['Seam Bowled'] = np.where((cricket_df['Bowled']==1) & (cricket_df['seam_delivery']==1),1,0)
cricket_df['Seam Caught Behind'] = np.where((cricket_df['Caught Behind']==1) & (cricket_df['seam_delivery']==1),1,0)
cricket_df['Spin Bowled'] = np.where((cricket_df['Bowled']==1) & (cricket_df['spin_delivery']==1),1,0)
cricket_df['Spin Caught Behind'] = np.where((cricket_df['Caught Behind']==1) & (cricket_df['spin_delivery']==1),1,0)

cricket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1907084 entries, 0 to 3110935
Data columns (total 70 columns):
 #   Column                Dtype   
---  ------                -----   
 0   sim_number            int64   
 1   timestamp_x           object  
 2   batfast_id            object  
 3   client_name           object  
 4   event_name            object  
 5   game_mode             object  
 6   score                 object  
 7   speed                 float64 
 8   pitch                 float64 
 9   swing                 float64 
 10  pan                   float64 
 11  turn                  float64 
 12  r                     float64 
 13  theta                 float64 
 14  z                     float64 
 15  power                 float64 
 16  machine               object  
 17  scoring               object  
 18  length                object  
 19  email                 object  
 20  name                  object  
 21  gender                object  
 22  age_group         

In [2368]:
seam_spin_wickets_df = cricket_df[['email','name','Seam Bowled','Seam Caught Behind','Spin Bowled','Spin Caught Behind']].groupby(['email','name']).sum()
seam_spin_wickets_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Seam Bowled,Seam Caught Behind,Spin Bowled,Spin Caught Behind
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0070@stores.gane.co.uk,GAME STORE,0,0,0,0
012483@stretfordhigh.com,HANY,1,0,0,0
05tsmith@brightoncollege.net,TOMAS SMITH,0,0,0,0
0845100989.gf@gmail.com,GERRIE,1,1,0,0
08darsh@gmail.com,AVNEET,0,0,0,0
...,...,...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,1,0,0,0
zwebb05@icloud.com,ZAK,0,0,0,0
zygarde1234@gmail.com,SABBIR AHMED,3,0,0,0
zyounus@gmail.com,ZEESHAN,2,0,0,0


In [2369]:
#Seam Spin Runs
cricket_df['seam_runs'] = np.where(cricket_df['seam_delivery']==1,cricket_df['runs'],np.NaN)
cricket_df['spin_runs'] = np.where(cricket_df['spin_delivery']==1,cricket_df['runs'],np.NaN)

In [2370]:
seam_spin_runs_df = cricket_df[['email','name','seam_runs','spin_runs']].groupby(['email','name']).sum()
seam_spin_runs_df

Unnamed: 0_level_0,Unnamed: 1_level_0,seam_runs,spin_runs
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1
0070@stores.gane.co.uk,GAME STORE,18.0,0.0
012483@stretfordhigh.com,HANY,11.0,0.0
05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0
0845100989.gf@gmail.com,GERRIE,8.0,0.0
08darsh@gmail.com,AVNEET,0.0,12.0
...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,16.0,0.0
zwebb05@icloud.com,ZAK,59.0,0.0
zygarde1234@gmail.com,SABBIR AHMED,48.0,0.0
zyounus@gmail.com,ZEESHAN,7.0,0.0


In [2374]:
#Seam vs Spin Deliveries
seam_spin_deliveries_df = cricket_df[['email','name','seam_delivery','spin_delivery']].groupby(['email','name']).sum()
seam_spin_deliveries_df

Unnamed: 0_level_0,Unnamed: 1_level_0,seam_delivery,spin_delivery
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1
0070@stores.gane.co.uk,GAME STORE,10,0
012483@stretfordhigh.com,HANY,6,0
05tsmith@brightoncollege.net,TOMAS SMITH,17,0
0845100989.gf@gmail.com,GERRIE,6,0
08darsh@gmail.com,AVNEET,0,6
...,...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,6,0
zwebb05@icloud.com,ZAK,32,0
zygarde1234@gmail.com,SABBIR AHMED,24,0
zyounus@gmail.com,ZEESHAN,6,0


### Power Statistics

In [2458]:
#Average Power of Shots
average_power_df = cricket_df[['email','name','power']].groupby(['email','name']).mean()
average_power_df['power_percent'] = (average_power_df['power']/1000)*100
average_power_df.drop(columns='power', inplace=True)
average_power_df.loc[average_power_df['power_percent']>0]

Unnamed: 0_level_0,Unnamed: 1_level_0,power_percent
email,name,Unnamed: 2_level_1
123@gmail.com,ETHAN RATHORE,25.143902
5oby@hotmail.com,ZACK,15.338462
aaman9@hotmail.com,AAMAN,51.116667
aaron@vmsglobal.co.uk,AARON LARAMAN,38.175000
ab.brennen@gmail.com,ANDREW BRENNEN,25.604348
...,...,...
zeeshan_m@hotmail.co.uk,ZEESHAN MAHMOOD,98.327273
zmurtagh7@gmail.com,ZACK MURTAGH,20.950000
zolotarevnik@gmail.com,ALISA ZOLOTAREVA,17.100000
zolotarevnik@gmail.com,ANNA ZOLOTAREVA,15.100000


## Generating Functions for Cricket Statistics

Again, the below code manipulats the above code so that the cricket statistics can be generated using generic functions from a base dataset. 

In [2376]:
cricket_base = combined_df.loc[combined_df['cricket']==1]

In [2377]:
def total_runs_scored_function(dataset):
    runs_scored_df = dataset[['email','name','runs']].groupby(['email','name']).sum()
    runs_scored_df = runs_scored_df['runs'].to_frame()
    runs_scored_df.rename(columns = {"runs":"total_runs_scored"},inplace=True) 
    return runs_scored_df

In [2378]:
def total_points_scored_function(dataset):
    points_scored_df = dataset[['email','name','points']].groupby(['email','name']).sum()
    points_scored_df = points_scored_df['points'].to_frame()
    points_scored_df.rename(columns = {"points":"total_points_scored"},inplace=True) 
    return points_scored_df

In [2379]:
def average_runs_points_function(dataset):
    
    #Ensuring the wicket column is created for the dataset (bearing in mind these functions are meant to be used on base datasets)
    wickets_df = pd.get_dummies(dataset['wicket'])
    dataset_df = dataset.merge(wickets_df, left_index=True, right_index=True, how='inner')
    dataset_df['Bowled'] = np.where(dataset_df['client_name']!='tenpin', dataset_df['Bowled'],0)
    dataset_df['Caught Behind'] = np.where(dataset_df['client_name']!='tenpin', dataset_df['Caught Behind'],0)
    dataset_df['Points Bowled'] = np.where(dataset_df['client_name']=='tenpin', dataset_df['Bowled'],0)
    dataset_df['Points Caught Behind'] = np.where(dataset_df['client_name']=='tenpin', dataset_df['Caught Behind'],0)

    #Calculating the total number of each type of dismissal for each user
    average_runs_points_df = dataset_df[['email','name','wicket','Bowled', 'Caught Behind','Points Bowled', 'Points Caught Behind']].groupby(['email','name']).sum()
    #Creating a total wickets columns to sum the two bowled and caught behind columns
    average_runs_points_df['total_wickets'] = average_runs_points_df['Bowled'] + average_runs_points_df['Caught Behind'] + average_runs_points_df['Points Bowled'] + average_runs_points_df['Points Caught Behind']

    #Merging the dataframe with the total number of runs and points scored
    average_runs_points_df = average_runs_points_df.merge(total_runs_scored_function(dataset), left_index=True, right_index=True, how='outer')
    average_runs_points_df = average_runs_points_df.merge(total_points_scored_function(dataset), left_index=True, right_index=True, how='outer')
    average_runs_points_df
    
    #Calculating average runs and points from the above dataframe
    average_runs_points_df['average_runs'] = average_runs_points_df['total_runs_scored']/((average_runs_points_df['Bowled'] + average_runs_points_df['Caught Behind']))
    average_runs_points_df['average_points'] = average_runs_points_df['total_points_scored']/((average_runs_points_df['Points Bowled'] + average_runs_points_df['Points Caught Behind']))
    
    #Resolving infinity values
    average_runs_points_df['average_runs'].loc[average_runs_points_df['average_runs'] ==np.inf] = average_runs_points_df['total_runs_scored']
    average_runs_points_df['average_points'].loc[average_runs_points_df['average_points'] ==np.inf] = average_runs_points_df['total_points_scored']

    return average_runs_points_df    

In [2380]:
def ranking_function(dataset):
    #Generating the necessary DataFrames
    average_runs_points_df = average_runs_points_function(cricket_base)
    rank_df_1 = average_runs_points_df[['total_runs_scored','total_points_scored','total_wickets','average_runs']].merge(user_deliveries_faced, left_index=True, right_index=True, how='outer')
    
    #Ranking by Total Runs
    rank_df_1['runs_rank'] = rank_df_1['total_runs_scored'].rank(ascending=False)
    rank_df_1['runs_rank_percentile'] = (round(rank_df_1['runs_rank'].rank(pct=True),2))*100
    
    # Ranking by Total Points
    rank_df_1['points_rank'] = rank_df_1['total_points_scored'].rank(ascending=False)
    rank_df_1['points_rank_percentile'] = (round(rank_df_1['points_rank'].rank(pct=True),2))*100
    
    #Ranking Total Deliveries Faced
    rank_df_1['deliveries_rank'] = rank_df_1['deliveries_faced'].rank(ascending=False)
    rank_df_1['deliveries_rank_percentile'] = (round(rank_df_1['deliveries_rank'].rank(pct=True),2))*100
    
    #Ranking by Wickets
    rank_df_1['wickets_rank'] = rank_df_1['total_wickets'].rank(ascending=False)
    rank_df_1['wickets_rank_percentile'] = (round(rank_df_1['wickets_rank'].rank(pct=True),2))*100
    
    #Ranking by Average Runs
    rank_df_1['average_runs_rank'] = rank_df_1['average_runs'].rank(ascending=False)
    rank_df_1['average_runs_rank_percentile'] = (round(rank_df_1['average_runs_rank'].rank(pct=True),2))*100
    
    return rank_df_1

In [2381]:
def max_min_speed_function(dataset):
    max_speed_df = fastest_delivery_function(dataset)
    min_speed_df = slowest_delivery_function(dataset)
    
    max_min_speed_df = max_speed_df.merge(min_speed_df, left_index=True, right_index=True, how='outer')
    
    return max_min_speed_df

In [2382]:
def boundary_analysis_function(dataset):
    #Identifying boundary deliveries
    dataset['4_boundary'] = np.where(dataset['runs']==4,1,0)
    dataset['6_boundary'] = np.where(dataset['runs']==6,1,0)
    
    dataset['4_distance'] = np.where(dataset['runs']==4,cricket_df['r'],0)
    dataset['6_distance'] = np.where(dataset['runs']==6,cricket_df['r'],0)
    
    #Counting boundaries and distance
    boundary_count_df = dataset[['email','name','4_boundary','6_boundary']].groupby(['email','name']).sum()
    boundary_count_df.rename(columns={"4_boundary":'number_4s',"6_boundary":"number_6s"}, inplace=True)
    
    boundary_distance_df = cricket_df[['email','name','4_distance','6_distance']].groupby(['email','name']).max()
    
    boundary_analysis_df = boundary_count_df.merge(boundary_distance_df, left_index=True, right_index=True, how='outer')
    
    return boundary_analysis_df

In [2383]:
#AVERAGE SPEED FUNCTION ALREADY CALCULATED
average_speed_function(cricket_base)

Unnamed: 0_level_0,Unnamed: 1_level_0,average_speed
email,name,Unnamed: 2_level_1
0070@stores.gane.co.uk,GAME STORE,63.0
012483@stretfordhigh.com,HANY,58.3
05tsmith@brightoncollege.net,TOMAS SMITH,100.0
0845100989.gf@gmail.com,GERRIE,80.0
08darsh@gmail.com,AVNEET,20.0
...,...,...
zureenali@hotmail.co.uk,ZUREEN ALI,0.0
zwebb05@icloud.com,ZAK,37.8
zygarde1234@gmail.com,SABBIR AHMED,47.9
zyounus@gmail.com,ZEESHAN,60.0


In [2384]:
def average_runs_speed_bins(dataset):
    
    bins = [0,10,20,30,40,50,60,70,80,90,101]
    names = ["0-10","10-20","20-30","30-40","40-50","50-60","60-70","70-80","80-90","90-100"]
    
    dataset['speed_bins'] = pd.cut(dataset['speed_adj'], bins, labels=names)
    
    #Note this statistic will not yet be grouped as the grouping for so many users kills the kernel. Thus, the 
    #grouping will occur at the individual level when the data is visualised
    return dataset.info()

In [2385]:
def average_runs_different_lengths_function(dataset):
    #Identifying relevant delveries for each length
    dataset['Yorker_runs'] = np.where(dataset['length']=='Yorker',dataset['runs'],np.NaN)
    dataset['Full_runs'] = np.where(dataset['length']=='Full',dataset['runs'],np.NaN)
    dataset['Good_runs'] = np.where(dataset['length']=='Good',dataset['runs'],np.NaN)
    dataset['Short_runs'] = np.where(dataset['length']=='Short',dataset['runs'],np.NaN)
    dataset['Extra_short_runs'] = np.where(dataset['length']=='Extra Short',dataset['runs'],np.NaN)
    
    #Calculating average scores off each length of delivery
    length_average_runs_df = dataset[['email','name','Yorker_runs','Full_runs','Good_runs','Short_runs','Extra_short_runs']].groupby(['email','name']).mean()
    length_average_runs_df.rename(columns={'Yorker_runs':'Yorker_average_runs','Full_runs':'Full_average_runs','Good_runs':'Good_average_runs','Short_runs':'Short_average_runs','Extra_short_runs':'Extra_short_average_runs'}, inplace=True)
    
    return length_average_runs_df   

In [2386]:
def average_runs_swing_function(dataset):
    #Identifying respective swing deliveries
    dataset['away_swing_runs'] = np.where(dataset['swing']>0,dataset['runs'],np.NaN)
    dataset['in_swing_runs'] = np.where(dataset['swing']<0,dataset['runs'],np.NaN)
    dataset['straight_runs'] = np.where(dataset['swing']==0,dataset['runs'],np.NaN)
    
    #Calculating average runs for each type of swing
    swing_analysis_df = dataset[['email','name','away_swing_runs','in_swing_runs','straight_runs']].groupby(['email','name']).mean()
    
    return swing_analysis_df

In [2387]:
def spin_analysis_function(dataset):
    dataset['off_spin'] = np.where(dataset['turn']<0, dataset['runs'],np.NaN)
    dataset['leg_spin'] = np.where(dataset['turn']>0, dataset['runs'],np.NaN)

    spin_analysis_1_df = dataset[['email','name','off_spin','leg_spin']].groupby(['email','name']).sum()
    spin_analysis_1_df.rename(columns={"off_spin":"off_spin_total_runs","leg_spin":"leg_spin_total_runs"},inplace=True)

    spin_analysis_2_df = dataset[['email','name','off_spin','leg_spin']].groupby(['email','name']).mean()
    spin_analysis_2_df.rename(columns={"off_spin":"off_spin_average_runs","leg_spin":"leg_spin_average_runs"},inplace=True)

    spin_analysis = spin_analysis_1_df.merge(spin_analysis_2_df, left_index=True, right_index=True, how='outer')
    
    return spin_analysis

In [2388]:
def off_leg_side_analysis_function(dataset):
    dataset['leg_side_shot'] = np.where((dataset['theta']>=0) & (dataset['theta']<180), 1,np.NaN)
    dataset['off_side_shot'] = np.where((dataset['theta']>=180) & (dataset['theta']<360), 1,np.NaN)
    
    off_leg_side_analysis_df = dataset[['email','name','leg_side_shot','off_side_shot']].groupby(['email','name']).sum()
 
    return off_leg_side_analysis_df

In [2446]:
def seam_versus_spin_function(dataset):
    
    #Ensuring the wicket column is created for the dataset (bearing in mind these functions are meant to be used on base datasets)
    wickets_df = pd.get_dummies(dataset['wicket'])
    dataset_df = dataset.merge(wickets_df, left_index=True, right_index=True, how='inner')
    
    #Identifying each bowling type
    dataset_df['spin_delivery'] = np.where(abs(dataset_df['turn'])>0, 1, 0)
    dataset_df['seam_delivery'] = np.where(dataset_df['spin_delivery']==0, 1, 0)
    
    # Seam vs Spin Wickets
    dataset_df['Seam Bowled'] = np.where((dataset_df['Bowled']==1) & (dataset_df['seam_delivery']==1),1,0)
    dataset_df['Seam Caught Behind'] = np.where((dataset_df['Caught Behind']==1) & (dataset_df['seam_delivery']==1),1,0)
    dataset_df['Spin Bowled'] = np.where((dataset_df['Bowled']==1) & (dataset_df['spin_delivery']==1),1,0)
    dataset_df['Spin Caught Behind'] = np.where((dataset_df['Caught Behind']==1) & (dataset_df['spin_delivery']==1),1,0)

    seam_spin_wickets_df = dataset_df[['email','name','Seam Bowled','Seam Caught Behind','Spin Bowled','Spin Caught Behind']].groupby(['email','name']).sum()

    #Seam Spin Runs
    dataset_df['seam_runs'] = np.where(dataset_df['seam_delivery']==1,dataset_df['runs'],np.NaN)
    dataset_df['spin_runs'] = np.where(dataset_df['spin_delivery']==1,dataset_df['runs'],np.NaN)

    seam_spin_runs_df = dataset_df[['email','name','seam_runs','spin_runs']].groupby(['email','name']).sum()
    
    #Seam Spin Deliveries
    seam_spin_deliveries_df = dataset_df[['email','name','seam_delivery','spin_delivery']].groupby(['email','name']).sum()
    seam_spin_deliveries_df
    
    #Merging runs and wickets
    seam_versus_spin_df = seam_spin_wickets_df.merge(seam_spin_runs_df, 
                                                     left_index=True, right_index=True, how='outer')
    
    seam_versus_spin_df_2 = seam_versus_spin_df.merge(seam_spin_deliveries_df, 
                                                     left_index=True, right_index=True, how='outer')
    
    return seam_versus_spin_df_2

In [2390]:
def average_power_function(dataset):
    average_power_df = dataset[['email','name','power']].groupby(['email','name']).mean()
    average_power_df['power_percent'] = (average_power_df['power']/1000)*100
    average_power_df.drop(columns='power', inplace=True)
    return average_power_df

In [2391]:
def generate_cricket_statistics(dataset):
    
    #Generating dataframes using above functions
    total_runs_scored_df_1 = total_runs_scored_function(dataset)
    total_points_scored_df_1 = total_points_scored_function(dataset)
    average_runs_points_df_1 = average_runs_points_function(dataset)
    rank_df_1 = ranking_function(dataset)
    max_min_speed_df_1 = max_min_speed_function(dataset)
    boundary_count_df_1 = boundary_analysis_function(dataset)
    average_speed_df_1 = average_speed_function(dataset)
    average_runs_different_lengths_df_1 = average_runs_different_lengths_function(dataset)
    average_runs_swing_df_1 = average_runs_swing_function(dataset)
    spin_analysis_df_1 = spin_analysis_function(dataset)
    off_leg_side_analysis_function_df_1 = off_leg_side_analysis_function(dataset)
    cricket_deliveries_faced_df_1 = user_deliveries_function(dataset)
    seam_versus_spin_df_1 = seam_versus_spin_function(dataset)
    average_power_df_1 = average_power_function(dataset)
    
    #Merging DataFrames
    cricket_statistics_1 = total_runs_scored_df_1.merge(total_points_scored_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    cricket_statistics_2 = cricket_statistics_1.merge(average_runs_points_df_1[['Bowled','Caught Behind','total_wickets','average_runs','average_points']],
                                                                      left_index=True, right_index=True, how='outer')
    cricket_statistics_3 = cricket_statistics_2.merge(rank_df[['runs_rank','runs_rank_percentile','points_rank','points_rank_percentile','deliveries_rank','deliveries_rank_percentile','wickets_rank','wickets_rank_percentile','average_runs_rank','average_runs_rank_percentile']],
                                                                      left_index=True, right_index=True, how='outer')
    cricket_statistics_4 = cricket_statistics_3.merge(max_min_speed_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    cricket_statistics_5 = cricket_statistics_4.merge(boundary_count_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    cricket_statistics_6 = cricket_statistics_5.merge(average_speed_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    cricket_statistics_7 = cricket_statistics_6.merge(average_runs_different_lengths_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    cricket_statistics_8 = cricket_statistics_7.merge(average_runs_swing_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    cricket_statistics_9 = cricket_statistics_8.merge(spin_analysis_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    cricket_statistics_10 = cricket_statistics_9.merge(off_leg_side_analysis_function_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    cricket_statistics_11 = cricket_statistics_10.merge(cricket_deliveries_faced_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    cricket_statistics_12 = cricket_statistics_11.merge(seam_versus_spin_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    cricket_statistics_13 = cricket_statistics_12.merge(average_power_df_1,
                                                                      left_index=True, right_index=True, how='outer')

    
    cricket_statistics = cricket_statistics_13.copy()
    
    cricket_statistics.reset_index(inplace=True)
    
    return cricket_statistics

In [2392]:
cricket_base = combined_df.loc[combined_df['cricket']==1]

In [2393]:
cricket_statistics = generate_cricket_statistics(cricket_base)
cricket_statistics.head(5)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Unnamed: 0,email,name,total_runs_scored,total_points_scored,Bowled,Caught Behind,total_wickets,average_runs,average_points,runs_rank,...,deliveries_faced,Seam Bowled,Seam Caught Behind,Spin Bowled,Spin Caught Behind,seam_runs,spin_runs,seam_delivery,spin_delivery,power_percent
0,0070@stores.gane.co.uk,GAME STORE,18.0,0.0,0.0,0.0,0.0,18.0,,3976.5,...,10.0,0.0,0.0,0.0,0.0,18.0,0.0,10.0,0.0,
1,012483@stretfordhigh.com,HANY,11.0,0.0,1.0,0.0,1.0,11.0,,6820.0,...,6.0,1.0,0.0,0.0,0.0,11.0,0.0,6.0,0.0,
2,05tsmith@brightoncollege.net,TOMAS SMITH,0.0,0.0,0.0,0.0,0.0,,,12509.0,...,17.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,
3,0845100989.gf@gmail.com,GERRIE,8.0,0.0,1.0,1.0,2.0,4.0,,8508.5,...,6.0,1.0,1.0,0.0,0.0,8.0,0.0,6.0,0.0,
4,08darsh@gmail.com,AVNEET,12.0,0.0,0.0,0.0,0.0,12.0,,6314.5,...,6.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,6.0,


## 8.3 Visualising the Statistics 

The below code visualises the above statistics to be presented within the customer dashboard.

### Cricket Balls Faced

In [2394]:
def cricket_deliveries_faced_visual(user_email, user_name):
    
    df = cricket_statistics.loc[(cricket_statistics['email']==user_email) & (cricket_statistics['name']==user_name)]
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['deliveries_faced'],
    title = "Cricket Deliveries Faced"))

    fig.show()

In [2395]:
cricket_deliveries_faced_visual('alex.mathews@batfast.com', 'ALEX MATHEWS')

### Wagon Wheel

In [2396]:
wagon_wheel_practice = cricket_df.loc[(cricket_df['r']>0) & (cricket_df['theta']>0)]
wagon_wheel_practice_1 = wagon_wheel_practice.iloc[:100]
wagon_wheel_practice_1[['email','name']]

Unnamed: 0,email,name
11,012483@stretfordhigh.com,HANY
12,012483@stretfordhigh.com,HANY
13,012483@stretfordhigh.com,HANY
14,012483@stretfordhigh.com,HANY
15,012483@stretfordhigh.com,HANY
...,...,...
605,17nkennedy@rgshw.com,NOAH KENNEDY
606,17nkennedy@rgshw.com,NOAH KENNEDY
608,17nkennedy@rgshw.com,NOAH KENNEDY
609,17nkennedy@rgshw.com,NOAH KENNEDY


NOTE: Following consultation with BatFast only deliveries where manual scoring was used will be incorporated into the wagon wheel.

In [2397]:
def wagon_wheel_visual(user_email, user_name):
    #Ensuring only deliveries where manual scoring was implemented are used for the wagon wheel
    df = cricket_df.loc[(cricket_df['scoring']=='manual_scoring') & (cricket_df['email']==user_email) & (cricket_df['name']==user_name)]
    
    df['runs'] = df['runs'].astype(str)
    
    runs_color_map = {"0.0":"black",
                 "1.0":"yellow",
                 "2.0":"orange",
                 "3.0":"lime",
                 "4.0":"blue",
                 "6.0":"red"}
    
    fig = px.scatter_polar(df, r="r", theta="theta", direction="counterclockwise", color='runs',
                      color_discrete_map=runs_color_map, template="presentation", 
                    category_orders={"runs":['0.0','1.0','2.0','3.0','4.0','5.0','6.0']})

    fig.update_layout(title="Wagon Wheel",
                 title_font_size = 20,
                  title_x = 0.05,
                  title_y = 0.8,
                 legend_title='Runs')

    fig.update_polars(
                 radialaxis_title=dict(text="(m)",font_size=20),
                 radialaxis_tickangle=90)
                         
    fig.show()

In [2398]:
wagon_wheel_visual('17nkennedy@rgshw.com', 'NOAH KENNEDY')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [2399]:
df = cricket_df.loc[(cricket_df['scoring']=='manual_scoring') & (cricket_df['email']=='alex.mathews@batfast.com') & (cricket_df['name']=='ALEX MATHEWS')]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 72 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   sim_number            0 non-null      int64   
 1   timestamp_x           0 non-null      object  
 2   batfast_id            0 non-null      object  
 3   client_name           0 non-null      object  
 4   event_name            0 non-null      object  
 5   game_mode             0 non-null      object  
 6   score                 0 non-null      object  
 7   speed                 0 non-null      float64 
 8   pitch                 0 non-null      float64 
 9   swing                 0 non-null      float64 
 10  pan                   0 non-null      float64 
 11  turn                  0 non-null      float64 
 12  r                     0 non-null      float64 
 13  theta                 0 non-null      float64 
 14  z                     0 non-null      float64 
 15  power             

### Total Runs and Points

In [2400]:
def total_runs_points_visual(user_email, user_name):
    
    df = cricket_statistics.loc[(cricket_statistics['email']==user_email) & (cricket_statistics['name']==user_name)]
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['total_runs_scored'],
    domain = {'row':0, 'column':0},
    title = "Total Runs"))
    
    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['total_points_scored'],
    domain = {'row':1, 'column':0},
    title = "Total Points"))
    
    fig.update_layout(
        grid = {'rows':2, 'columns':1, 'pattern': "independent"},
        )
    
    fig.show()

In [2401]:
total_runs_points_visual('17nkennedy@rgshw.com', 'NOAH KENNEDY')

### Average Runs and Points

In [2402]:
def average_runs_points_visual(user_email, user_name):
    
    df = cricket_statistics.loc[(cricket_statistics['email']==user_email) & (cricket_statistics['name']==user_name)]
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['average_runs'],
    domain = {'row':0, 'column':0},
    title = "Average Runs"))
    
    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['average_points'],
    domain = {'row':1, 'column':0},
    title = "Average Points"))
    
    fig.update_layout(
        grid = {'rows':2, 'columns':1, 'pattern': "independent"},
        )
    
    fig.show()

In [2403]:
average_runs_points_visual('012483@stretfordhigh.com', 'HANY')

### Comparative Performance

In [2404]:
comp_df = cricket_statistics.loc[(cricket_statistics['email']=='123@gmail.com') & (cricket_statistics['name']=='ETHAN RATHORE')]
cricket_statistics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13335 entries, 0 to 13334
Data columns (total 50 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   email                         13335 non-null  object 
 1   name                          13335 non-null  object 
 2   total_runs_scored             13218 non-null  float64
 3   total_points_scored           13218 non-null  float64
 4   Bowled                        13218 non-null  float64
 5   Caught Behind                 13218 non-null  float64
 6   total_wickets                 13218 non-null  float64
 7   average_runs                  11983 non-null  float64
 8   average_points                162 non-null    float64
 9   runs_rank                     13218 non-null  float64
 10  runs_rank_percentile          13218 non-null  float64
 11  points_rank                   13218 non-null  float64
 12  points_rank_percentile        13218 non-null  float64
 13  d

In [2405]:
def cricket_ranking_visual(user_email, user_name):
    
    df = cricket_statistics.loc[(cricket_statistics['email']==user_email) & (cricket_statistics['name']==user_name)] 
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['runs_rank'],
        domain = {'row':0, 'column':1},
        title = '<b>Total Runs<b>',
        number = {'prefix':'Rank: ', 'suffix':f' / {len(cricket_statistics)}'}))

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['average_runs_rank'],
        domain = {'row':1, 'column':1},
        title = '<b>Average Runs<b>',
        number = {'prefix':'Rank: ', 'suffix':f' / {len(cricket_statistics)}'}))
    
    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['points_rank'],
        domain = {'row':0, 'column':3},
        title = '<b>Total Points<b>',
        number = {'prefix':'Rank: ', 'suffix':f' / {len(cricket_statistics)}'}))

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['deliveries_rank'],
        domain = {'row':1, 'column':3},
        title = '<b>Total Deliveries Faced<b>',
        number = {'prefix':'Rank: ', 'suffix':f' / {len(cricket_statistics)}'}))

    fig.update_layout(
            grid = {'rows':2, 'columns':7, 'pattern': "independent"})

    fig.show()

In [2406]:
cricket_ranking_visual('alex.mathews@batfast.com', 'ALEX MATHEWS')

### Boundary Analysis

In [2407]:
bound_df = cricket_statistics.loc[(cricket_statistics['email']=='123@gmail.com') & (cricket_statistics['name']=='ETHAN RATHORE')]
bound_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 13 to 13
Data columns (total 50 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   email                         1 non-null      object 
 1   name                          1 non-null      object 
 2   total_runs_scored             1 non-null      float64
 3   total_points_scored           1 non-null      float64
 4   Bowled                        1 non-null      float64
 5   Caught Behind                 1 non-null      float64
 6   total_wickets                 1 non-null      float64
 7   average_runs                  1 non-null      float64
 8   average_points                0 non-null      float64
 9   runs_rank                     1 non-null      float64
 10  runs_rank_percentile          1 non-null      float64
 11  points_rank                   1 non-null      float64
 12  points_rank_percentile        1 non-null      float64
 13  deliver

In [2408]:
fig = go.Figure(
    data = [go.Bar(
            x = ['<b>4s<b>', '<b>6s<b>'],
            y = [bound_df.iloc[0]['number_4s'], bound_df.iloc[0]['number_6s']],
        width = [0.5, 0.5],
        text = [bound_df.iloc[0]['number_4s'], bound_df.iloc[0]['number_6s']],
        textposition = 'outside',
        marker_color = ['blue','red']
)])

fig.update_layout(title={'text': '<b>Number of Boundaries </b>','x':0.5},
                  xaxis_title_text='<b>Boundary Type</b>')

fig.update_xaxes(type='category', showgrid=False, tickangle = 0, ticks="outside",
                showline=True, linewidth=1, linecolor='#121212', tickfont_size=20)
fig.update_yaxes(showticklabels = False)

fig.update_traces(texttemplate='%{text: .1s}',textposition='auto',textfont_size=20)

fig.layout.plot_bgcolor = 'white'
fig.layout.paper_bgcolor = 'white'
fig.layout.yaxis.gridcolor = '#D3D3D3'

fig.show()

In [2409]:
fig = go.Figure()

fig.add_trace(go.Indicator(
    mode = "number",
    value = bound_df.iloc[0][['4_distance','6_distance']].max(),
    number =  {'suffix':'m'},
    domain = {'row':0, 'column':0},
    title = "Longest Boundary Shot"))

In [2410]:
fig = go.Figure()

# fig.add_trace(go.Indicator(
#     mode = "number+delta",
#     value = 300,
#     guage = {'shape':'bullet', 'axis':{'visible':True}},
#     domain = {'row': 0, 'column': 0}))

fig = go.Figure(go.Indicator(
    mode = "number+gauge",
    gauge = {'shape': "bullet",
            'axis': {'range': [None, 150], 'dtick':10}},
    number = {'suffix': "m", 'font':{'size': 40}},
    value = bound_df.iloc[0][['4_distance','6_distance']].max(),
    domain = {'x': [0.1, 1], 'y': [0.2, 0.9]},
    title = {'text': "Longest Boundary", 'font':{'size': 14},'align':'center'}))

fig.show()
# fig = go.Figure()

# fig.add_trace(go.Indicator(
#     mode = "number",
#     value = bound_df.iloc[0][['4_distance','6_distance']].max(),
#     number =  {'suffix':'m'},
#     domain = {'row':0, 'column':0},
#     title = "Longest Boundary Shot"))

In [2411]:
def boundary_analysis_visual(user_email, user_name):
    
    df = cricket_statistics.loc[(cricket_statistics['email']==user_email) & (cricket_statistics['name']==user_name)]
    
    fig_boundary_count = go.Figure(
    data = [go.Bar(
            x = ['<b>4s<b>', '<b>6s<b>'],
            y = [bound_df.iloc[0]['number_4s'], df.iloc[0]['number_6s']],
        width = [0.5, 0.5],
        text = [bound_df.iloc[0]['number_4s'], df.iloc[0]['number_6s']],
        textposition = 'outside',
        marker_color = ['black','red']
    )])

    fig_boundary_count.update_layout(title={'text': '<b>Number of Boundaries </b>','x':0.5},
                  xaxis_title_text='<b>Boundary Type</b>')

    fig_boundary_count.update_xaxes(type='category', showgrid=False, tickangle = 0, ticks="outside",
                showline=True, linewidth=1, linecolor='#121212', tickfont_size=20)
        
    fig_boundary_count.update_yaxes(showticklabels = False)

    fig_boundary_count.update_traces(texttemplate='%{text: .1s}',textposition='auto',textfont_size=20)

    fig_boundary_count.layout.plot_bgcolor = 'white'
    fig_boundary_count.layout.paper_bgcolor = 'white'
    fig_boundary_count.layout.yaxis.gridcolor = '#D3D3D3'

    fig_boundary_count.show()
        
    
    fig_longest_boundary = go.Figure()

    fig_longest_boundary.add_trace(go.Indicator(
            mode = "number",
            value = df.iloc[0][['4_distance','6_distance']].max(),
            number =  {'suffix':'m'},
            domain = {'row':0, 'column':0},
            title = "Longest Boundary Shot"))
        
    fig_longest_boundary.show()

In [2412]:
boundary_analysis_visual('123@gmail.com', 'ETHAN RATHORE')

### Maximum, Minimum and Average Speed

In [2413]:
def speed_visualisation_function(user_email, user_name):
    
    df = cricket_statistics.loc[(cricket_statistics['email']==user_email) & (cricket_statistics['name']==user_name)]

    fig = go.Figure(go.Indicator(
        mode = "gauge+number",
        value = df.iloc[0]['average_speed'],
        number = {'prefix':"Avg: ",'suffix': "mph", 'font':{'size': 40}},
        title = {'text': "Minimum, Maximum and Average Delivery Speed", 'font':{'size': 25},'align':'center'},
        gauge = {'axis': {'range': [None, 100], 'dtick':10},
                 'bar': {'color':"black"},
                 'steps' : [
                     {'range': [0, df.iloc[0]['min_delivery_speed']], 'color': "lightgray"},
                     {'range': [df.iloc[0]['min_delivery_speed'], df.iloc[0]['max_delivery_speed']], 'color': "gray"}],
                 'threshold' : {'line': {'color': "red", 'width': 4}, 'thickness': 0.75, 'value': df.iloc[0]['max_delivery_speed']}}))

    fig.show()

In [2414]:
speed_visualisation_function('0070@stores.gane.co.uk', 'GAME STORE')

### Average Runs for Different Speeds

In [2415]:
speed_runs_df = cricket_df.loc[(cricket_df['email']=='alex.mathews@batfast.com') & (cricket_df['name']=='ALEX MATHEWS')] 

average_runs_speed_bin_df = speed_runs_df[['email','name','runs','speed_bins']].groupby(['email','name','speed_bins']).mean()
average_runs_speed_bin_df.rename(columns={"runs":"average_runs_per_speed_bin"}, inplace=True)
average_runs_speed_bin_df.reset_index(inplace=True)
average_runs_speed_bin_df


Unnamed: 0,email,name,speed_bins,average_runs_per_speed_bin
0,alex.mathews@batfast.com,ALEX MATHEWS,0-10,
1,alex.mathews@batfast.com,ALEX MATHEWS,10-20,1.0
2,alex.mathews@batfast.com,ALEX MATHEWS,20-30,0.378378
3,alex.mathews@batfast.com,ALEX MATHEWS,30-40,0.955157
4,alex.mathews@batfast.com,ALEX MATHEWS,40-50,0.981132
5,alex.mathews@batfast.com,ALEX MATHEWS,50-60,1.482759
6,alex.mathews@batfast.com,ALEX MATHEWS,60-70,1.166667
7,alex.mathews@batfast.com,ALEX MATHEWS,70-80,
8,alex.mathews@batfast.com,ALEX MATHEWS,80-90,
9,alex.mathews@batfast.com,ALEX MATHEWS,90-100,


In [2416]:
fig = go.Figure(
    data = [go.Bar(
            x = list(average_runs_speed_bin_df['speed_bins'].unique()),
            y = [average_runs_speed_bin_df.iloc[0]['average_runs_per_speed_bin'],
                 average_runs_speed_bin_df.iloc[1]['average_runs_per_speed_bin'],
                 average_runs_speed_bin_df.iloc[2]['average_runs_per_speed_bin'],
                 average_runs_speed_bin_df.iloc[3]['average_runs_per_speed_bin'],
                 average_runs_speed_bin_df.iloc[4]['average_runs_per_speed_bin'],
                 average_runs_speed_bin_df.iloc[5]['average_runs_per_speed_bin'],
                 average_runs_speed_bin_df.iloc[6]['average_runs_per_speed_bin'],
                 average_runs_speed_bin_df.iloc[7]['average_runs_per_speed_bin'],
                 average_runs_speed_bin_df.iloc[8]['average_runs_per_speed_bin'],
                 average_runs_speed_bin_df.iloc[9]['average_runs_per_speed_bin']],
        text = [round(average_runs_speed_bin_df.iloc[0]['average_runs_per_speed_bin'],2),
                 round(average_runs_speed_bin_df.iloc[1]['average_runs_per_speed_bin'],2),
                 round(average_runs_speed_bin_df.iloc[2]['average_runs_per_speed_bin'],2),
                 round(average_runs_speed_bin_df.iloc[3]['average_runs_per_speed_bin'],2),
                 round(average_runs_speed_bin_df.iloc[4]['average_runs_per_speed_bin'],2),
                 round(average_runs_speed_bin_df.iloc[5]['average_runs_per_speed_bin'],2),
                 round(average_runs_speed_bin_df.iloc[6]['average_runs_per_speed_bin'],2),
                 round(average_runs_speed_bin_df.iloc[7]['average_runs_per_speed_bin'],2),
                 round(average_runs_speed_bin_df.iloc[8]['average_runs_per_speed_bin'],2),
                 round(average_runs_speed_bin_df.iloc[9]['average_runs_per_speed_bin'],2)],
        textposition = 'outside',
        marker_color = ['red','red','red','red','red','red','red','red','red','red']
)])

fig.update_layout(title={'text': '<b>Performance vs Different Speeds</b>','x':0.5},
                  xaxis_title_text='<b>Speed (mph)</b>',
                 yaxis_title_text='<b>Average Runs per Delivery</b>')

fig.update_xaxes(type='category', showgrid=False, tickangle = 0, ticks="outside",
                showline=True, linewidth=1, linecolor='#121212', tickfont_size=20)

fig.update_traces(textposition='outside',textfont_size=15)

fig.layout.plot_bgcolor = 'white'
fig.layout.paper_bgcolor = 'white'
fig.layout.yaxis.gridcolor = '#D3D3D3'

fig.show()

In [2417]:
def performance_different_speeds_visual(user_email, user_name):
    
    df = cricket_df.loc[(cricket_df['email']==user_email) & (cricket_df['name']==user_name)] 

    average_runs_speed_bin_df = df[['email','name','runs','speed_bins']].groupby(['email','name','speed_bins']).mean()
    average_runs_speed_bin_df.rename(columns={"runs":"average_runs_per_speed_bin"}, inplace=True)
    average_runs_speed_bin_df.reset_index(inplace=True)
    
    fig = go.Figure(
        data = [go.Bar(
                x = list(average_runs_speed_bin_df['speed_bins'].unique()),
                y = [average_runs_speed_bin_df.iloc[0]['average_runs_per_speed_bin'],
                     average_runs_speed_bin_df.iloc[1]['average_runs_per_speed_bin'],
                     average_runs_speed_bin_df.iloc[2]['average_runs_per_speed_bin'],
                     average_runs_speed_bin_df.iloc[3]['average_runs_per_speed_bin'],
                     average_runs_speed_bin_df.iloc[4]['average_runs_per_speed_bin'],
                     average_runs_speed_bin_df.iloc[5]['average_runs_per_speed_bin'],
                     average_runs_speed_bin_df.iloc[6]['average_runs_per_speed_bin'],
                     average_runs_speed_bin_df.iloc[7]['average_runs_per_speed_bin'],
                     average_runs_speed_bin_df.iloc[8]['average_runs_per_speed_bin'],
                     average_runs_speed_bin_df.iloc[9]['average_runs_per_speed_bin']],
            text = [round(average_runs_speed_bin_df.iloc[0]['average_runs_per_speed_bin'],2),
                     round(average_runs_speed_bin_df.iloc[1]['average_runs_per_speed_bin'],2),
                     round(average_runs_speed_bin_df.iloc[2]['average_runs_per_speed_bin'],2),
                     round(average_runs_speed_bin_df.iloc[3]['average_runs_per_speed_bin'],2),
                     round(average_runs_speed_bin_df.iloc[4]['average_runs_per_speed_bin'],2),
                     round(average_runs_speed_bin_df.iloc[5]['average_runs_per_speed_bin'],2),
                     round(average_runs_speed_bin_df.iloc[6]['average_runs_per_speed_bin'],2),
                     round(average_runs_speed_bin_df.iloc[7]['average_runs_per_speed_bin'],2),
                     round(average_runs_speed_bin_df.iloc[8]['average_runs_per_speed_bin'],2),
                     round(average_runs_speed_bin_df.iloc[9]['average_runs_per_speed_bin'],2)],
            textposition = 'outside',
            marker_color = ['red','red','red','red','red','red','red','red','red','red']
    )])

    fig.update_layout(title={'text': '<b>Performance vs Different Speeds</b>','x':0.5},
                      xaxis_title_text='<b>Speed (mph)</b>',
                     yaxis_title_text='<b>Average Runs per Delivery</b>')

    fig.update_xaxes(type='category', showgrid=False, tickangle = 0, ticks="outside",
                    showline=True, linewidth=1, linecolor='#121212', tickfont_size=20)

    fig.update_traces(textposition='outside',textfont_size=15)

    fig.layout.plot_bgcolor = 'white'
    fig.layout.paper_bgcolor = 'white'
    fig.layout.yaxis.gridcolor = '#D3D3D3'

    fig.show()

performance_different_speeds_visual('alex.mathews@batfast.com', 'ALEX MATHEWS')

### Pitch Performance

In [2418]:
pitch_df = cricket_statistics.loc[(cricket_statistics['email']=='zeeshaanattari@gmail.com') & (cricket_statistics['name']=='ZEESHAAN HAMID')]
pitch_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 13289 to 13289
Data columns (total 50 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   email                         1 non-null      object 
 1   name                          1 non-null      object 
 2   total_runs_scored             1 non-null      float64
 3   total_points_scored           1 non-null      float64
 4   Bowled                        1 non-null      float64
 5   Caught Behind                 1 non-null      float64
 6   total_wickets                 1 non-null      float64
 7   average_runs                  1 non-null      float64
 8   average_points                0 non-null      float64
 9   runs_rank                     1 non-null      float64
 10  runs_rank_percentile          1 non-null      float64
 11  points_rank                   1 non-null      float64
 12  points_rank_percentile        1 non-null      float64
 13  d

In [2419]:
fig = go.Figure(
    data = [go.Bar(
            x = ['Yorker', 'Full', 'Good', 'Short', 'Extra Short'],
            y = [pitch_df.iloc[0]['Yorker_average_runs'], pitch_df.iloc[0]['Full_average_runs'],
                pitch_df.iloc[0]['Good_average_runs'], pitch_df.iloc[0]['Short_average_runs'],
                pitch_df.iloc[0]['Extra_short_average_runs']],
        text = [round(pitch_df.iloc[0]['Yorker_average_runs'],2), round(pitch_df.iloc[0]['Full_average_runs'],2),
                round(pitch_df.iloc[0]['Good_average_runs'],2), round(pitch_df.iloc[0]['Short_average_runs'],2),
                round(pitch_df.iloc[0]['Extra_short_average_runs'],2)],
        textposition = 'outside',
        marker_color = ['rgb(255,237,111)','rgb(115,175,72)','rgb(204,80,62)','rgb(47,138,196)','black']
)])

fig.update_layout(title={'text': '<b>Delivery Length Performance  </b>','x':0.5},
                  xaxis_title_text='<b>Delivery Length</b>',
                 yaxis_title_text='<b>Average Runs per Delivery</b>')

fig.update_xaxes(type='category', showgrid=False, tickangle = 0, ticks="outside",
                showline=True, linewidth=1, linecolor='#121212', tickfont_size=20)

fig.update_traces(textposition='outside',textfont_size=15)

fig.layout.plot_bgcolor = 'white'
fig.layout.paper_bgcolor = 'white'
fig.layout.yaxis.gridcolor = '#D3D3D3'

fig.show()

In [2420]:
pitch_df[['Yorker_average_runs','Full_average_runs','Good_average_runs','Short_average_runs','Extra_short_average_runs']]

Unnamed: 0,Yorker_average_runs,Full_average_runs,Good_average_runs,Short_average_runs,Extra_short_average_runs
13289,1.485714,0.909091,0.78125,1.112903,0.483516


In [2421]:
def pitch_performance_visual(user_email, user_name):
    
    df = cricket_statistics.loc[(cricket_statistics['email']==user_email) & (cricket_statistics['name']==user_name)] 
    
    fig_pitch_performance = go.Figure(
        
        data = [go.Bar(
            x = ['Yorker', 'Full', 'Good', 'Short', 'Extra Short'],
            y = [df.iloc[0]['Yorker_average_runs'], df.iloc[0]['Full_average_runs'],
                df.iloc[0]['Good_average_runs'], df.iloc[0]['Short_average_runs'],
                df.iloc[0]['Extra_short_average_runs']],
        text = [round(df.iloc[0]['Yorker_average_runs'],2), round(df.iloc[0]['Full_average_runs'],2),
                round(df.iloc[0]['Good_average_runs'],2), round(df.iloc[0]['Short_average_runs'],2),
                round(df.iloc[0]['Extra_short_average_runs'],2)],
        textposition = 'outside',
        marker_color = ['rgb(255,237,111)','rgb(115,175,72)','rgb(204,80,62)','rgb(47,138,196)','black']
    )])

    fig_pitch_performance.update_layout(title={'text': '<b>Delivery Length Performance  </b>','x':0.5},
                  xaxis_title_text='<b>Delivery Length</b>',
                 yaxis_title_text='<b>Average Runs per Delivery</b>')

    fig_pitch_performance.update_xaxes(type='category', showgrid=False, tickangle = 0, ticks="outside",
                showline=True, linewidth=1, linecolor='#121212', tickfont_size=20)

    fig_pitch_performance.update_traces(textposition='outside',textfont_size=15)

    fig_pitch_performance.layout.plot_bgcolor = 'white'
    fig_pitch_performance.layout.paper_bgcolor = 'white'
    fig_pitch_performance.layout.yaxis.gridcolor = '#D3D3D3'

    fig_pitch_performance.show()

In [2422]:
pitch_performance_visual('zeeshaanattari@gmail.com', 'ZEESHAAN HAMID')

### Swing Analysis

In [2423]:
swing_df = cricket_statistics.loc[(cricket_statistics['email']=='zygarde1234@gmail.com') & (cricket_statistics['name']=='SABBIR AHMED')]
swing_df

Unnamed: 0,email,name,total_runs_scored,total_points_scored,Bowled,Caught Behind,total_wickets,average_runs,average_points,runs_rank,...,deliveries_faced,Seam Bowled,Seam Caught Behind,Spin Bowled,Spin Caught Behind,seam_runs,spin_runs,seam_delivery,spin_delivery,power_percent
13332,zygarde1234@gmail.com,SABBIR AHMED,48.0,0.0,3.0,0.0,3.0,16.0,,1199.0,...,24.0,3.0,0.0,0.0,0.0,48.0,0.0,24.0,0.0,


In [2424]:
# test_df = cricket_df.loc[(cricket_df['email']=='zygarde1234@gmail.com') & (cricket_df['name']=='SABBIR AHMED')]
# # test_df['away_swing_runs']
# test_df['away_swing_runs'].mean()


In [2425]:
fig = go.Figure(
    data = [go.Bar(
            x = ['Away-swing', 'Straight', 'In-swing'],
            y = [swing_df.iloc[0]['away_swing_runs'], swing_df.iloc[0]['straight_runs'],
                swing_df.iloc[0]['in_swing_runs']],
        text = [round(swing_df.iloc[0]['away_swing_runs'],2), round(swing_df.iloc[0]['straight_runs'],2),
                round(swing_df.iloc[0]['in_swing_runs'],2)],
        textposition = 'outside',
        marker_color = ['red','black','blue']
)])

fig.update_layout(title={'text': '<b>Swing Analysis  </b>','x':0.5},
                  xaxis_title_text='<b>Swing Direction</b>',
                 yaxis_title_text='<b>Average Runs per Delivery per Swing Type</b>')

fig.update_xaxes(type='category', showgrid=False, tickangle = 0, ticks="outside",
                showline=True, linewidth=1, linecolor='#121212', tickfont_size=20)

fig.update_traces(textposition='outside',textfont_size=15)

fig.layout.plot_bgcolor = 'white'
fig.layout.paper_bgcolor = 'white'
fig.layout.yaxis.gridcolor = '#D3D3D3'

fig.show()

In [2426]:
def swing_visual(user_email, user_name):
    
    df = cricket_statistics.loc[(cricket_statistics['email']==user_email) & (cricket_statistics['name']==user_name)] 
    
    fig_swing = go.Figure(
    data = [go.Bar(
            x = ['Away-swing', 'Straight', 'In-swing'],
            y = [df.iloc[0]['away_swing_runs'], df.iloc[0]['straight_runs'],
                df.iloc[0]['in_swing_runs']],
        text = [round(df.iloc[0]['away_swing_runs'],2), round(df.iloc[0]['straight_runs'],2),
                round(df.iloc[0]['in_swing_runs'],2)],
        textposition = 'outside',
        marker_color = ['red','black','blue']
    )])

    fig_swing.update_layout(title={'text': '<b>Swing Analysis  </b>','x':0.5},
                  xaxis_title_text='<b>Swing Direction</b>',
                 yaxis_title_text='<b>Average Runs per Delivery per Swing Type</b>')

    fig_swing.update_xaxes(type='category', showgrid=False, tickangle = 0, ticks="outside",
                showline=True, linewidth=1, linecolor='#121212', tickfont_size=20)

    fig_swing.update_traces(textposition='outside',textfont_size=15)

    fig_swing.layout.plot_bgcolor = 'white'
    fig_swing.layout.paper_bgcolor = 'white'
    fig_swing.layout.yaxis.gridcolor = '#D3D3D3'

    fig_swing.show()

In [2427]:
swing_visual('zygarde1234@gmail.com', 'SABBIR AHMED')

### Off/ Leg Side 

In [2428]:
def leg_off_side_visual(user_email, user_name):
    
    df = cricket_statistics.loc[(cricket_statistics['email']==user_email) & (cricket_statistics['name']==user_name)] 
    
    fig_leg_off = go.Figure()

    fig_leg_off.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['off_side_shot'],
        domain = {'row':0, 'column':1},
        title = 'Off Side Shots'))
    
    fig_leg_off.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['leg_side_shot'],
        domain = {'row':0, 'column':0},
        title = 'Leg Side Shots'))

    fig_leg_off.update_layout(
        grid = {'rows':2, 'columns':2, 'pattern': "independent"},
        )
    
    fig_leg_off.show()
    
leg_off_side_visual('alex.mathews@batfast.com', 'ALEX MATHEWS')

### Wicket Analysis

In [2429]:
wicket_df = cricket_statistics.loc[(cricket_statistics['email']=='alex.mathews@batfast.com') & (cricket_statistics['name']=='ALEX MATHEWS')] 
wicket_df

Unnamed: 0,email,name,total_runs_scored,total_points_scored,Bowled,Caught Behind,total_wickets,average_runs,average_points,runs_rank,...,deliveries_faced,Seam Bowled,Seam Caught Behind,Spin Bowled,Spin Caught Behind,seam_runs,spin_runs,seam_delivery,spin_delivery,power_percent
554,alex.mathews@batfast.com,ALEX MATHEWS,561.0,600.0,8.0,0.0,8.0,70.125,600.0,39.0,...,584.0,8.0,0.0,0.0,0.0,509.0,52.0,530.0,54.0,57.163415


In [2430]:

#MAYBE ADD/ CHANGE THIS TO RUNS?

fig = go.Figure()

fig.add_trace(go.Indicator(
mode = "number",
value = wicket_df.iloc[0]['total_wickets'],
domain = {'row':0, 'column':3},
title = 'Total Wickets'))
    
fig.add_trace(go.Indicator(
mode = "number",
value = wicket_df.iloc[0]['Bowled'],
domain = {'row':1, 'column':2},
title = 'Bowled'))

fig.add_trace(go.Indicator(
mode = "number",
value = wicket_df.iloc[0]['Caught Behind'],
domain = {'row':1, 'column':4},
title = 'Caught Behind'))

fig.add_trace(go.Indicator(
mode = "number",
value = wicket_df.iloc[0]['Seam Bowled'],
domain = {'row':2, 'column':0},
title = 'vs. Seam'))

fig.add_trace(go.Indicator(
mode = "number",
value = wicket_df.iloc[0]['Spin Bowled'],
domain = {'row':2, 'column':2},
title = 'vs. Spin'))

fig.add_trace(go.Indicator(
mode = "number",
value = wicket_df.iloc[0]['Seam Caught Behind'],
domain = {'row':2, 'column':4},
title = 'vs. Seam'))

fig.add_trace(go.Indicator(
mode = "number",
value = wicket_df.iloc[0]['Spin Caught Behind'],
domain = {'row':2, 'column':6},
title = 'vs. Spin'))


fig.update_layout(
        grid = {'rows':3, 'columns':7, 'pattern': "independent"},
        )
    
fig.show()

In [2431]:
def wickets_visual(user_email, user_name):
    
    df = cricket_statistics.loc[(cricket_statistics['email']==user_email) & (cricket_statistics['name']==user_name)] 
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['total_wickets'],
        domain = {'row':0, 'column':3},
        title = 'Total Wickets'))

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['Bowled'],
        domain = {'row':1, 'column':2},
        title = 'No. Bowled'))

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['Caught Behind'],
        domain = {'row':1, 'column':4},
        title = 'No. Caught Behind'))

    fig.add_trace(go.Indicator(
        mode = "number",
        value = wicket_df.iloc[0]['Seam Bowled'],
        domain = {'row':2, 'column':0},
        title = 'vs. Seam'))

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['Spin Bowled'],
        domain = {'row':2, 'column':1},
        title = 'vs. Spin'))

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['Seam Caught Behind'],
        domain = {'row':2, 'column':5},
        title = 'vs. Seam'))

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['Spin Caught Behind'],
        domain = {'row':2, 'column':6},
        title = 'vs. Spin'))


    fig.update_layout(
        grid = {'rows':3, 'columns':7, 'pattern': "independent"},
        )
    
    fig.show()
    
    
wickets_visual('alex.mathews@batfast.com', 'ALEX MATHEWS')

### Spin Analysis

In [2432]:
spin_df = cricket_statistics.loc[(cricket_statistics['email']=='alex.mathews@batfast.com') & (cricket_statistics['name']=='ALEX MATHEWS')] 
spin_df

Unnamed: 0,email,name,total_runs_scored,total_points_scored,Bowled,Caught Behind,total_wickets,average_runs,average_points,runs_rank,...,deliveries_faced,Seam Bowled,Seam Caught Behind,Spin Bowled,Spin Caught Behind,seam_runs,spin_runs,seam_delivery,spin_delivery,power_percent
554,alex.mathews@batfast.com,ALEX MATHEWS,561.0,600.0,8.0,0.0,8.0,70.125,600.0,39.0,...,584.0,8.0,0.0,0.0,0.0,509.0,52.0,530.0,54.0,57.163415


In [2433]:
    fig = go.Figure()

    fig.add_trace(go.Indicator(
        mode = "number",
        value = spin_df.iloc[0]['off_spin_total_runs'],
        domain = {'row':0, 'column':1},
        title = 'Total Off-spin Runs'))
    
    fig.add_trace(go.Indicator(
        mode = "number",
        value = spin_df.iloc[0]['leg_spin_total_runs'],
        domain = {'row':0, 'column':0},
        title = 'Total Leg-spin Runs'))

    fig.update_layout(
        grid = {'rows':2, 'columns':2, 'pattern': "independent"},
        )
    
    fig.show()


In [2434]:
fig = go.Figure(
    data = [go.Bar(
            x = ['Off Spin', 'Leg-swing'],
            y = [spin_df.iloc[0]['off_spin_average_runs'], spin_df.iloc[0]['leg_spin_average_runs']],
        text = [round(spin_df.iloc[0]['off_spin_average_runs'],2), round(spin_df.iloc[0]['leg_spin_average_runs'],2)],
        textposition = 'outside',
        marker_color = ['black','red']
)])

fig.update_layout(title={'text': '<b>Average Runs Against Spin  </b>','x':0.5},
                  xaxis_title_text='<b>Type of Spin</b>',
                  yaxis_title_text='<b>Average Runs</b>')

fig.update_xaxes(type='category', showgrid=False, tickangle = 0, ticks="outside", showline=True, linewidth=1, linecolor='#121212', tickfont_size=20)

fig.update_traces(textposition='outside',textfont_size=15)

fig.layout.plot_bgcolor = 'white'
fig.layout.paper_bgcolor = 'white'
fig.layout.yaxis.gridcolor = '#D3D3D3'

fig.show()

In [2435]:
def spin_visual(user_email, user_name):
    
    df = cricket_statistics.loc[(cricket_statistics['email']==user_email) & (cricket_statistics['name']==user_name)] 
    
    fig_spin_total = go.Figure()
    fig_spin_total.add_trace(go.Indicator(
        mode = "number",
        value = spin_df.iloc[0]['off_spin_total_runs'],
        domain = {'row':0, 'column':1},
        title = 'Total Off-spin Runs'))
    
    fig_spin_total.add_trace(go.Indicator(
        mode = "number",
        value = spin_df.iloc[0]['leg_spin_total_runs'],
        domain = {'row':0, 'column':0},
        title = 'Total Leg-spin Runs'))

    fig_spin_total.update_layout(
        grid = {'rows':2, 'columns':2, 'pattern': "independent"},
        )
    
    fig_spin_total.show()
    
    
    fig_spin_average_runs = go.Figure(
    data = [go.Bar(
            x = ['Off Spin', 'Leg-swing'],
            y = [spin_df.iloc[0]['off_spin_average_runs'], spin_df.iloc[0]['leg_spin_average_runs']],
        text = [round(spin_df.iloc[0]['off_spin_average_runs'],2), round(spin_df.iloc[0]['leg_spin_average_runs'],2)],
        textposition = 'outside',
        marker_color = ['black','red']
    )])

    fig_spin_average_runs.update_layout(title={'text': '<b>Average Runs Against Spin  </b>','x':0.5},
                  xaxis_title_text='<b>Type of Spin</b>',
                  yaxis_title_text='<b>Average Runs</b>')

    fig_spin_average_runs.update_xaxes(type='category', showgrid=False, tickangle = 0, ticks="outside", showline=True, linewidth=1, linecolor='#121212', tickfont_size=20)

    fig_spin_average_runs.update_traces(textposition='outside',textfont_size=15)

    fig_spin_average_runs.layout.plot_bgcolor = 'white'
    fig_spin_average_runs.layout.paper_bgcolor = 'white'
    fig_spin_average_runs.layout.yaxis.gridcolor = '#D3D3D3'

    fig_spin_average_runs.show()
    
spin_visual('alex.mathews@batfast.com', 'ALEX MATHEWS')

In [2436]:
spin_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 554 to 554
Data columns (total 50 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   email                         1 non-null      object 
 1   name                          1 non-null      object 
 2   total_runs_scored             1 non-null      float64
 3   total_points_scored           1 non-null      float64
 4   Bowled                        1 non-null      float64
 5   Caught Behind                 1 non-null      float64
 6   total_wickets                 1 non-null      float64
 7   average_runs                  1 non-null      float64
 8   average_points                1 non-null      float64
 9   runs_rank                     1 non-null      float64
 10  runs_rank_percentile          1 non-null      float64
 11  points_rank                   1 non-null      float64
 12  points_rank_percentile        1 non-null      float64
 13  deliv

### Spin versus Seam Runs 

In [2456]:
def spin_versus_seam_runs_visual(user_email, user_name):

    df = cricket_statistics.loc[(cricket_statistics['email']==user_email) & (cricket_statistics['name']==user_name)] 
    
    fig_spin_seam = go.Figure()
    
    fig_spin_seam.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['seam_delivery'],
        domain = {'row':0, 'column':0},
        title = 'Total Seam Deliveries'))
    
    fig_spin_seam.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['spin_delivery'],
        domain = {'row':0, 'column':1},
        title = 'Total Spin Deliveries'))
    
    fig_spin_seam.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['seam_runs'],
        domain = {'row':1, 'column':0},
        title = 'Total Runs Scored off Seam'))
    
    fig_spin_seam.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['spin_runs'],
        domain = {'row':1, 'column':1},
        title = 'Total Runs Scored off Spin'))
    
    fig_spin_seam.add_trace(go.Indicator(
        mode = "number",
        value = round((df.iloc[0]['seam_runs'])/(df.iloc[0]['seam_delivery']),2),
        domain = {'row':2, 'column':0},
        title = 'Average Runs per Seam Delivery'))
    
    fig_spin_seam.add_trace(go.Indicator(
        mode = "number",
        value = round((df.iloc[0]['spin_runs'])/(df.iloc[0]['spin_delivery']),2),
        domain = {'row':2, 'column':1},
        title = 'Average Runs per Spin Delivery'))

    fig_spin_seam.update_layout(
        grid = {'rows':3, 'columns':2, 'pattern': "independent"},
        )
    
    fig_spin_seam.show()

In [2457]:
spin_versus_seam_runs_visual('alex.mathews@batfast.com', 'ALEX MATHEWS')

### Power Visuals

In [2439]:
def power_visual(user_email, user_name):
    
    df = cricket_statistics.loc[(cricket_statistics['email']==user_email) & (cricket_statistics['name']==user_name)] 
    
    fig = go.Figure(go.Indicator(
        mode = "number+gauge",
        gauge = {'shape': "bullet",
                'axis': {'range': [None, 100], 'dtick':10}},
        number = {'suffix': "%", 'font':{'size': 40}},
        value = round(df.iloc[0]['power_percent'],0),
        domain = {'x': [0.1, 1], 'y': [0.2, 0.9]},
        title = {'text': "Average Power", 'font':{'size': 14},'align':'center'}))


    fig.show()
    
power_visual('123@gmail.com', 'ETHAN RATHORE')

In [2440]:
average_power_df.loc[average_power_df['power_percent']>0]

# cricket_statistics.loc[(cricket_statistics['email']=='123@gmail.com') & (cricket_statistics['name']=='ETHAN RATHORE')] 


Unnamed: 0_level_0,Unnamed: 1_level_0,power_percent
email,name,Unnamed: 2_level_1
123@gmail.com,ETHAN RATHORE,25.143902
5oby@hotmail.com,ZACK,15.338462
aaman9@hotmail.com,AAMAN,51.116667
aaron@vmsglobal.co.uk,AARON LARAMAN,38.175000
ab.brennen@gmail.com,ANDREW BRENNEN,25.604348
...,...,...
zeeshan_m@hotmail.co.uk,ZEESHAN MAHMOOD,98.327273
zmurtagh7@gmail.com,ZACK MURTAGH,20.950000
zolotarevnik@gmail.com,ALISA ZOLOTAREVA,17.100000
zolotarevnik@gmail.com,ANNA ZOLOTAREVA,15.100000


## Generating Cricket Visuals 

In [2441]:
def generate_cricket_visuals(user_email, user_name):
    
    print("<b>Cricket Deliveries Faced<b>")
    cricket_deliveries_faced_visual(user_email, user_name)
    
    print("<b>Total Runs and Points<b>")
    total_runs_points_visual(user_email, user_name)
    
    print("<b>Average Runs and Points<b>")
    average_runs_points_visual(user_email, user_name)
    
    print("<b>Ranking Relative to Other Users")
    cricket_ranking_visual(user_email, user_name)
    
    print("<b>Boundary Analysis<b>")
    boundary_analysis_visual(user_email, user_name)
    
    print("<b>Speed of Deliveries<b>")
    speed_visualisation_function(user_email, user_name)
    
    print("<b>Performance vs Different Speeds<b>")
    performance_different_speeds_visual(user_email, user_name)
    
    print("<b>Pitch of Deliveries<b>")
    pitch_performance_visual(user_email, user_name)
    
    print("<b>Spin Analysis<b>")
    swing_visual(user_email, user_name)
    
    print("<b>Direction of Shots<b>")
    leg_off_side_visual(user_email, user_name)
    
    print("<b>Wickets Analysis<b>")
    wickets_visual(user_email, user_name)
    
    print("<b>Spin Analysis<b>")
    spin_visual(user_email, user_name)
    
    print("<b>Spin vs Seam Runs Analysis<b>")
    spin_versus_seam_runs_visual(user_email, user_name)
    
    print("<b>Power Visual<b>")
    power_visual(user_email, user_name)
    
    print("<b>Wagon Wheel<b>")
    wagon_wheel_visual(user_email, user_name)

In [2442]:
generate_cricket_visuals('alex.mathews@batfast.com', 'ALEX MATHEWS')

<b>Cricket Deliveries Faced<b>


<b>Total Runs and Points<b>


<b>Average Runs and Points<b>


<b>Ranking Relative to Other Users


<b>Boundary Analysis<b>


<b>Speed of Deliveries<b>


<b>Performance vs Different Speeds<b>


<b>Pitch of Deliveries<b>


<b>Spin Analysis<b>


<b>Direction of Shots<b>


<b>Wickets Analysis<b>


<b>Spin Analysis<b>


<b>Spin vs Seam Runs Analysis<b>


<b>Power Visual<b>


<b>Wagon Wheel<b>


KeyError: 'runs'

## Tennis Statistics

In [2459]:
tennis_df = combined_df.loc[combined_df['tennis']==1]
tennis_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1586 entries, 10031 to 1979526
Data columns (total 43 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   sim_number     1586 non-null   int64  
 1   timestamp_x    1586 non-null   object 
 2   batfast_id     1586 non-null   object 
 3   client_name    1586 non-null   object 
 4   event_name     1586 non-null   object 
 5   game_mode      1586 non-null   object 
 6   score          1479 non-null   object 
 7   speed          1586 non-null   float64
 8   pitch          1586 non-null   float64
 9   swing          1586 non-null   float64
 10  pan            1586 non-null   float64
 11  turn           1586 non-null   float64
 12  r              150 non-null    float64
 13  theta          119 non-null    float64
 14  z              0 non-null      float64
 15  power          683 non-null    float64
 16  machine        1586 non-null   object 
 17  scoring        1586 non-null   object 
 18  l

### Deliveries

In [2460]:
tennis_user_deliveries_faced = user_deliveries_function(tennis_df)
tennis_user_deliveries_faced

Unnamed: 0_level_0,Unnamed: 1_level_0,deliveries_faced
email,name,Unnamed: 2_level_1
alex.mathews@batfast.com,ALEX MATHEWS,4
covette.timmons@batfast.com,COVETTE,231
jack.roddick@batfast.com,JACK RODDICK,30
jackroddick@gmail.com,JACK RODDICK,40
jason.smith@batfast.com,JASON SMITH,24
jessica-denman@hotmail.co.uk,JESS,42
jignesh.patel@batfast.com,JIGNESH PATEL,58
joseph.tolson@batfast.com,JOSEPH TOLSON,151
lee.duckworth@batfast.com,LEE DUCKWORTH,66
runish.gudhka@batfast.com,RUNISH,12


### Score

In [2461]:
tennis_total_score_df = total_score_function(tennis_df)
tennis_total_score_df

Unnamed: 0_level_0,Unnamed: 1_level_0,total_score
email,name,Unnamed: 2_level_1
alex.mathews@batfast.com,ALEX MATHEWS,0.0
covette.timmons@batfast.com,COVETTE,182.0
jack.roddick@batfast.com,JACK RODDICK,140.0
jackroddick@gmail.com,JACK RODDICK,306.0
jason.smith@batfast.com,JASON SMITH,74.0
jessica-denman@hotmail.co.uk,JESS,10.0
jignesh.patel@batfast.com,JIGNESH PATEL,50.0
joseph.tolson@batfast.com,JOSEPH TOLSON,415.0
lee.duckworth@batfast.com,LEE DUCKWORTH,63.0
runish.gudhka@batfast.com,RUNISH,4.0


###  Average Score 

In [2462]:
def average_score_tennis_function(dataset):
    
    average_score_df = dataset[['email','name','score_adj']].groupby(['email','name']).sum()
    average_score_df.rename(columns = {'score_adj':'total_score'}, inplace=True)
    tennis_deliveries_faced = user_deliveries_function(tennis_df)
    average_score_df = average_score_df.merge(tennis_deliveries_faced,
                                    left_index=True, right_index=True, how='outer')

    average_score_df['average_score'] = round(average_score_df['total_score']/average_score_df['deliveries_faced'],1)
    average_score_df.drop(columns=['total_score','deliveries_faced'], inplace=True)
    average_score_df.rename(columns={"average_score":"average_score_per_delivery"}, inplace=True)
    return average_score_df



In [2463]:
average_score_tennis_function(tennis_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,average_score_per_delivery
email,name,Unnamed: 2_level_1
alex.mathews@batfast.com,ALEX MATHEWS,0.0
covette.timmons@batfast.com,COVETTE,0.8
jack.roddick@batfast.com,JACK RODDICK,4.7
jackroddick@gmail.com,JACK RODDICK,7.6
jason.smith@batfast.com,JASON SMITH,3.1
jessica-denman@hotmail.co.uk,JESS,0.2
jignesh.patel@batfast.com,JIGNESH PATEL,0.9
joseph.tolson@batfast.com,JOSEPH TOLSON,2.7
lee.duckworth@batfast.com,LEE DUCKWORTH,1.0
runish.gudhka@batfast.com,RUNISH,0.3


### Length Tennis

In [2464]:
tennis_df['length'].value_counts()

Service Line    988
Net Line        154
Full              6
Extra Short       6
Name: length, dtype: int64

In [2465]:
def average_points_different_lengths_function(dataset):
    #Identifying relevant delveries for each length
    dataset['service_line'] = np.where(dataset['length']=='Service Line',dataset['runs'],np.NaN)
    dataset['net_line'] = np.where(dataset['length']=='Net',dataset['runs'],np.NaN)
    
    #Calculating average scores off each length of delivery
    length_average_points_df = dataset[['email','name','service_line','net_line']].groupby(['email','name']).mean()
    length_average_points_df.rename(columns={'service_line':'service_line_average_points','net_line':'net_line_average_points'}, inplace=True)
    
    return length_average_points_df  

In [2466]:
average_points_different_lengths_function(tennis_df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,Unnamed: 1_level_0,service_line_average_points,net_line_average_points
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1
alex.mathews@batfast.com,ALEX MATHEWS,,
covette.timmons@batfast.com,COVETTE,0.822222,
jack.roddick@batfast.com,JACK RODDICK,2.666667,
jackroddick@gmail.com,JACK RODDICK,0.0,
jason.smith@batfast.com,JASON SMITH,1.833333,
jessica-denman@hotmail.co.uk,JESS,0.238095,
jignesh.patel@batfast.com,JIGNESH PATEL,1.125,
joseph.tolson@batfast.com,JOSEPH TOLSON,1.941748,
lee.duckworth@batfast.com,LEE DUCKWORTH,0.954545,
runish.gudhka@batfast.com,RUNISH,,


### Max Min Speed Tennis

In [2467]:
def max_min_speed_tennis_function(dataset):
    max_speed_df = fastest_delivery_function(dataset)
    min_speed_df = slowest_delivery_function(dataset)
    
    max_min_speed_df = max_speed_df.merge(min_speed_df, left_index=True, right_index=True, how='outer')
    
    max_min_speed_df.rename(columns={"max_delivery_speed":"max_serve_speed_faced","min_delivery_speed":"min_serve_speed_faced"}, inplace=True)
    
    return max_min_speed_df

In [2468]:
max_min_speed_tennis_function(tennis_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,max_serve_speed_faced,min_serve_speed_faced
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1
alex.mathews@batfast.com,ALEX MATHEWS,30.0,30.0
covette.timmons@batfast.com,COVETTE,50.0,30.0
jack.roddick@batfast.com,JACK RODDICK,45.0,45.0
jackroddick@gmail.com,JACK RODDICK,59.0,50.0
jason.smith@batfast.com,JASON SMITH,45.0,45.0
jessica-denman@hotmail.co.uk,JESS,30.0,30.0
jignesh.patel@batfast.com,JIGNESH PATEL,50.0,30.0
joseph.tolson@batfast.com,JOSEPH TOLSON,59.0,45.0
lee.duckworth@batfast.com,LEE DUCKWORTH,50.0,50.0
runish.gudhka@batfast.com,RUNISH,59.0,50.0


### Average Speed Tennis

In [2469]:
def average_speed_tennis_function(dataset):
    
    average_speed_df = dataset[['email','name','speed_adj']].groupby(['email','name']).sum()
    average_speed_df.rename(columns = {'speed_adj':'total_speed'}, inplace=True)
    tennis_deliveries_faced = user_deliveries_function(tennis_df)
    average_speed_df = average_speed_df.merge(tennis_deliveries_faced,
                                    left_index=True, right_index=True, how='outer')

    average_speed_df['average_speed'] = round(average_speed_df['total_speed']/average_speed_df['deliveries_faced'],1)
    average_speed_df.drop(columns=['total_speed','deliveries_faced'], inplace=True)
    average_speed_df.rename(columns={"average_speed":"average_service_speed_faced"}, inplace=True)
    return average_speed_df

In [2470]:
average_speed_tennis_function(tennis_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,average_service_speed_faced
email,name,Unnamed: 2_level_1
alex.mathews@batfast.com,ALEX MATHEWS,30.0
covette.timmons@batfast.com,COVETTE,42.0
jack.roddick@batfast.com,JACK RODDICK,45.0
jackroddick@gmail.com,JACK RODDICK,51.4
jason.smith@batfast.com,JASON SMITH,45.0
jessica-denman@hotmail.co.uk,JESS,30.0
jignesh.patel@batfast.com,JIGNESH PATEL,39.6
joseph.tolson@batfast.com,JOSEPH TOLSON,48.9
lee.duckworth@batfast.com,LEE DUCKWORTH,50.0
runish.gudhka@batfast.com,RUNISH,54.5


### Ranking Tennis

In [2471]:
def ranking_tennis_function(dataset):
    #Generating the necessary DataFrames
    average_score_tennis_df = average_score_tennis_function(tennis_df)
    rank_df_1 = average_score_tennis_df.merge(tennis_user_deliveries_faced, left_index=True, right_index=True, how='outer')
    rank_df_2 = rank_df_1.merge(tennis_total_score_df, left_index=True, right_index=True, how='outer')
    
    #Ranking by Total Score
    rank_df_2['score_rank'] = rank_df_2['total_score'].rank(ascending=False)
    rank_df_2['score_rank_percentile'] = (round(rank_df_2['score_rank'].rank(pct=True),2))*100
    
    # Ranking by Average Score
    rank_df_2['average_score_rank'] = rank_df_2['average_score_per_delivery'].rank(ascending=False)
    rank_df_2['average_score_rank_percentile'] = (round(rank_df_2['average_score_rank'].rank(pct=True),2))*100
    
    
    #Ranking Total Deliveries Faced
    rank_df_2['deliveries_rank'] = rank_df_2['deliveries_faced'].rank(ascending=False)
    rank_df_2['deliveries_rank_percentile'] = (round(rank_df_2['deliveries_rank'].rank(pct=True),2))*100
    
    return rank_df_2

In [2472]:
ranking_tennis_function(tennis_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,average_score_per_delivery,deliveries_faced,total_score,score_rank,score_rank_percentile,average_score_rank,average_score_rank_percentile,deliveries_rank,deliveries_rank_percentile
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
alex.mathews@batfast.com,ALEX MATHEWS,0.0,4,0.0,13.0,100.0,13.0,100.0,13.0,100.0
covette.timmons@batfast.com,COVETTE,0.8,231,182.0,4.0,31.0,10.0,77.0,2.0,15.0
jack.roddick@batfast.com,JACK RODDICK,4.7,30,140.0,5.0,38.0,3.0,23.0,8.0,62.0
jackroddick@gmail.com,JACK RODDICK,7.6,40,306.0,3.0,23.0,1.0,8.0,7.0,54.0
jason.smith@batfast.com,JASON SMITH,3.1,24,74.0,8.0,62.0,5.0,38.0,10.0,77.0
jessica-denman@hotmail.co.uk,JESS,0.2,42,10.0,11.0,85.0,12.0,92.0,6.0,46.0
jignesh.patel@batfast.com,JIGNESH PATEL,0.9,58,50.0,10.0,77.0,9.0,69.0,5.0,38.0
joseph.tolson@batfast.com,JOSEPH TOLSON,2.7,151,415.0,2.0,15.0,6.0,46.0,3.0,23.0
lee.duckworth@batfast.com,LEE DUCKWORTH,1.0,66,63.0,9.0,69.0,8.0,62.0,4.0,31.0
runish.gudhka@batfast.com,RUNISH,0.3,12,4.0,12.0,92.0,11.0,85.0,12.0,92.0


### Average Service Speed Score

In [2473]:
def average_score_speed_tennis(dataset):
    
    bins = [0,10,20,30,40,50,60,70,80,90,101]
    names = ["0-10","10-20","20-30","30-40","40-50","50-60","60-70","70-80","80-90","90-100"]
    
    dataset['speed_bins'] = pd.cut(dataset['speed_adj'], bins, labels=names)
    
    #Note this statistic will not yet be grouped as the grouping for so many users kills the kernel. Thus, the 
    #grouping will occur at the individual level when the data is visualised
    return dataset['speed_bins'].value_counts()

In [2474]:
average_score_speed_tennis(tennis_df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



40-50     802
30-40     357
20-30     260
50-60     167
0-10        0
10-20       0
60-70       0
70-80       0
80-90       0
90-100      0
Name: speed_bins, dtype: int64

### Tennis Power Analysis

In [2475]:
#Average tennis power
average_power_function(tennis_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,power_percent
email,name,Unnamed: 2_level_1
alex.mathews@batfast.com,ALEX MATHEWS,
covette.timmons@batfast.com,COVETTE,20.043478
jack.roddick@batfast.com,JACK RODDICK,26.775
jackroddick@gmail.com,JACK RODDICK,17.12
jason.smith@batfast.com,JASON SMITH,38.60625
jessica-denman@hotmail.co.uk,JESS,12.625
jignesh.patel@batfast.com,JIGNESH PATEL,24.285714
joseph.tolson@batfast.com,JOSEPH TOLSON,37.641837
lee.duckworth@batfast.com,LEE DUCKWORTH,22.516216
runish.gudhka@batfast.com,RUNISH,2.5


###  Merging Tennis Statistics

In [2476]:
def generate_tennis_statistics(dataset):
    
    #Generating dataframes using above functions
    tennis_deliveries_faced_df_1 = user_deliveries_function(dataset)
    tennis_total_score_df_1 = total_score_function(dataset)
    tennis_average_score_df_1 = average_score_tennis_function(dataset)
    tennis_average_points_different_lengths_df_1 = average_points_different_lengths_function(dataset)
    tennis_max_min_speed_df_1 = max_min_speed_tennis_function(dataset)
    tennis_average_speed_df_1 = average_speed_tennis_function(dataset)
    tennis_ranking_df_1 = ranking_tennis_function(dataset)
    tennis_average_score_speed_df_1 = average_score_speed_tennis(dataset)
    tennis_average_power_df_1 = average_power_function(dataset)
    
    
    #Merging DataFrames
    
    tennis_statistics_1 = tennis_total_score_df_1.merge(tennis_deliveries_faced_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    tennis_statistics_2 = tennis_statistics_1.merge(tennis_average_score_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    tennis_statistics_3 = tennis_statistics_2.merge(tennis_average_points_different_lengths_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    tennis_statistics_4 = tennis_statistics_3.merge(tennis_max_min_speed_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    tennis_statistics_5 = tennis_statistics_4.merge(tennis_average_speed_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    tennis_statistics_6 = tennis_statistics_5.merge(tennis_ranking_df_1[['score_rank','score_rank_percentile','average_score_rank','average_score_rank_percentile','deliveries_rank','deliveries_rank_percentile']],
                                                                      left_index=True, right_index=True, how='outer')
    tennis_statistics_7 = tennis_statistics_6.merge(tennis_average_power_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    
    tennis_statistics = tennis_statistics_7.copy()
    
    tennis_statistics.reset_index(inplace=True)
    
    return tennis_statistics

In [2477]:
tennis_statistics = generate_tennis_statistics(tennis_df)
tennis_statistics



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,email,name,total_score,deliveries_faced,average_score_per_delivery,service_line_average_points,net_line_average_points,max_serve_speed_faced,min_serve_speed_faced,average_service_speed_faced,score_rank,score_rank_percentile,average_score_rank,average_score_rank_percentile,deliveries_rank,deliveries_rank_percentile,power_percent
0,alex.mathews@batfast.com,ALEX MATHEWS,0.0,4,0.0,,,30.0,30.0,30.0,13.0,100.0,13.0,100.0,13.0,100.0,
1,covette.timmons@batfast.com,COVETTE,182.0,231,0.8,0.822222,,50.0,30.0,42.0,4.0,31.0,10.0,77.0,2.0,15.0,20.043478
2,jack.roddick@batfast.com,JACK RODDICK,140.0,30,4.7,2.666667,,45.0,45.0,45.0,5.0,38.0,3.0,23.0,8.0,62.0,26.775
3,jackroddick@gmail.com,JACK RODDICK,306.0,40,7.6,0.0,,59.0,50.0,51.4,3.0,23.0,1.0,8.0,7.0,54.0,17.12
4,jason.smith@batfast.com,JASON SMITH,74.0,24,3.1,1.833333,,45.0,45.0,45.0,8.0,62.0,5.0,38.0,10.0,77.0,38.60625
5,jessica-denman@hotmail.co.uk,JESS,10.0,42,0.2,0.238095,,30.0,30.0,30.0,11.0,85.0,12.0,92.0,6.0,46.0,12.625
6,jignesh.patel@batfast.com,JIGNESH PATEL,50.0,58,0.9,1.125,,50.0,30.0,39.6,10.0,77.0,9.0,69.0,5.0,38.0,24.285714
7,joseph.tolson@batfast.com,JOSEPH TOLSON,415.0,151,2.7,1.941748,,59.0,45.0,48.9,2.0,15.0,6.0,46.0,3.0,23.0,37.641837
8,lee.duckworth@batfast.com,LEE DUCKWORTH,63.0,66,1.0,0.954545,,50.0,50.0,50.0,9.0,69.0,8.0,62.0,4.0,31.0,22.516216
9,runish.gudhka@batfast.com,RUNISH,4.0,12,0.3,,,59.0,50.0,54.5,12.0,92.0,11.0,85.0,12.0,92.0,2.5


## Tennis Visualisations 

In [2478]:
user_email, user_name = 'jack.roddick@batfast.com', 'JACK RODDICK'

### Tennis Balls Returned 

In [2479]:
def tennis_balls_faced_visual(user_email, user_name):
    
    df = tennis_statistics.loc[(tennis_statistics['email']==user_email) & (tennis_statistics['name']==user_name)]
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['deliveries_faced'],
    title = "Tennis Balls Returned"))

    fig.show()

In [2480]:
tennis_balls_faced_visual('jack.roddick@batfast.com', 'JACK RODDICK')

### Total Tennis Score

In [2481]:
def tennis_total_score_visual(user_email, user_name):
    
    df = tennis_statistics.loc[(tennis_statistics['email']==user_email) & (tennis_statistics['name']==user_name)]
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['total_score'],
    title = "Total Score"))

    fig.show()

In [2482]:
tennis_total_score_visual('jack.roddick@batfast.com', 'JACK RODDICK')

In [2483]:
def tennis_average_score_visual(user_email, user_name):
    
    df = tennis_statistics.loc[(tennis_statistics['email']==user_email) & (tennis_statistics['name']==user_name)]
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['average_score_per_delivery'],
    title = "Average Score per Serve Returned"))

    fig.show()

In [2484]:
tennis_average_score_visual('jack.roddick@batfast.com', 'JACK RODDICK')

### Different Length Serves Average Points

In [2485]:
def different_length_serve_average_score_visual(user_email, user_name):
    
    df = tennis_statistics.loc[(tennis_statistics['email']==user_email) & (tennis_statistics['name']==user_name)]
    
    fig = go.Figure(
    data = [go.Bar(
            x = ['<b>Service Line<b>', '<b>Net Line<b>'],
            y = [df.iloc[0]['service_line_average_points'], df.iloc[0]['net_line_average_points']],
        width = [0.5, 0.5],
        text = [df.iloc[0]['service_line_average_points'], df.iloc[0]['net_line_average_points']],
        textposition = 'outside',
        marker_color = ['black','red']
    )])

    fig.update_layout(title={'text': '<b>Average Score vs Different Length Serves</b>','x':0.5},
                  xaxis_title_text='<b>Service Length</b>',
                      yaxis_title_text='<b>Average Score per Serve</b>')

    fig.update_xaxes(type='category', showgrid=False, tickangle = 0, ticks="outside",
                showline=True, linewidth=1, linecolor='#121212', tickfont_size=20)
        
    fig.update_yaxes()

    fig.update_traces(texttemplate='%{text: .2s}',textposition='auto',textfont_size=20)

    fig.layout.plot_bgcolor = 'white'
    fig.layout.paper_bgcolor = 'white'
    fig.layout.yaxis.gridcolor = '#D3D3D3'

    fig.show()

In [2486]:
different_length_serve_average_score_visual('jack.roddick@batfast.com', 'JACK RODDICK')

###  Maximum, Minimum and Average Service Speed 

In [2548]:
def service_speed_visualisation_function(user_email, user_name):
    
    df = tennis_statistics.loc[(tennis_statistics['email']==user_email) & (tennis_statistics['name']==user_name)]

    fig = go.Figure(go.Indicator(
        mode = "gauge+number",
        value = df.iloc[0]['average_service_speed_faced'],
        number = {'prefix':"Avg: ",'suffix': "mph", 'font':{'size': 40}},
        title = {'text': "Minimum, Maximum and Average Service Speed", 'font':{'size': 25},'align':'center'},
        gauge = {'axis': {'range': [None, 100], 'dtick':10},
                 'bar': {'color':"black"},
                 'steps' : [
                     {'range': [0, df.iloc[0]['min_serve_speed_faced']], 'color': "lightgray"},
                     {'range': [df.iloc[0]['min_serve_speed_faced'], df.iloc[0]['max_serve_speed_faced']], 'color': "gray"}],
                 'threshold' : {'line': {'color': "red", 'width': 4}, 'thickness': 0.75, 'value': df.iloc[0]['max_serve_speed_faced']}}))

    fig.show()

In [2549]:
service_speed_visualisation_function('jack.roddick@batfast.com', 'JACK RODDICK')

### Tennis Ranking

In [2546]:
def tennis_ranking_visual(user_email, user_name):
    
    df = tennis_statistics.loc[(tennis_statistics['email']==user_email) & (tennis_statistics['name']==user_name)] 
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['score_rank'],
        domain = {'row':0, 'column':3},
        title = '<b>Total Score<b>',
        number = {'prefix':'Rank: ', 'suffix':f' / {len(tennis_statistics)}'}))

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['average_score_rank'],
        domain = {'row':1, 'column':1},
        title = '<b>Average Score<b>',
        number = {'prefix':'Rank: ', 'suffix':f' / {len(tennis_statistics)}'}))
    
    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['deliveries_rank'],
        domain = {'row':1, 'column':5},
        title = '<b>Serves Received<b>',
        number = {'prefix':'Rank: ', 'suffix':f' / {len(tennis_statistics)}'}))

    fig.update_layout(
            grid = {'rows':2, 'columns':7, 'pattern': "independent"},
            )

    fig.show()

In [2547]:
tennis_ranking_visual('jack.roddick@batfast.com', 'JACK RODDICK')

### Scores against Different Service Speeds

In [2491]:
def score_different_service_speeds_visual(user_email, user_name):
    
    df = tennis_df.loc[(tennis_df['email']==user_email) & (tennis_df['name']==user_name)] 

    average_score_speed_bin_df = df[['email','name','score_adj','speed_bins']].groupby(['email','name','speed_bins']).mean()
    average_score_speed_bin_df.rename(columns={"score_adj":"average_score_per_speed_bin"}, inplace=True)
    average_score_speed_bin_df.reset_index(inplace=True)
    
    fig = go.Figure(
        data = [go.Bar(
                x = list(average_runs_speed_bin_df['speed_bins'].unique()),
                y = [average_score_speed_bin_df.iloc[0]['average_score_per_speed_bin'],
                     average_score_speed_bin_df.iloc[1]['average_score_per_speed_bin'],
                     average_score_speed_bin_df.iloc[2]['average_score_per_speed_bin'],
                     average_score_speed_bin_df.iloc[3]['average_score_per_speed_bin'],
                     average_score_speed_bin_df.iloc[4]['average_score_per_speed_bin'],
                     average_score_speed_bin_df.iloc[5]['average_score_per_speed_bin'],
                     average_score_speed_bin_df.iloc[6]['average_score_per_speed_bin'],
                     average_score_speed_bin_df.iloc[7]['average_score_per_speed_bin'],
                     average_score_speed_bin_df.iloc[8]['average_score_per_speed_bin'],
                     average_score_speed_bin_df.iloc[9]['average_score_per_speed_bin']],
            text = [round(average_score_speed_bin_df.iloc[0]['average_score_per_speed_bin'],2),
                     round(average_score_speed_bin_df.iloc[1]['average_score_per_speed_bin'],2),
                     round(average_score_speed_bin_df.iloc[2]['average_score_per_speed_bin'],2),
                     round(average_score_speed_bin_df.iloc[3]['average_score_per_speed_bin'],2),
                     round(average_score_speed_bin_df.iloc[4]['average_score_per_speed_bin'],2),
                     round(average_score_speed_bin_df.iloc[5]['average_score_per_speed_bin'],2),
                     round(average_score_speed_bin_df.iloc[6]['average_score_per_speed_bin'],2),
                     round(average_score_speed_bin_df.iloc[7]['average_score_per_speed_bin'],2),
                     round(average_score_speed_bin_df.iloc[8]['average_score_per_speed_bin'],2),
                     round(average_score_speed_bin_df.iloc[9]['average_score_per_speed_bin'],2)],
            textposition = 'outside',
            marker_color = ['red','red','red','red','red','red','red','red','red','red']
    )])

    fig.update_layout(title={'text': '<b>Performance vs Different Speeds</b>','x':0.5},
                      xaxis_title_text='<b>Service Speed (mph)</b>',
                     yaxis_title_text='<b>Average Score per Serve</b>')

    fig.update_xaxes(type='category', showgrid=False, tickangle = 0, ticks="outside",
                    showline=True, linewidth=1, linecolor='#121212', tickfont_size=20)

    fig.update_traces(textposition='outside',textfont_size=15)

    fig.layout.plot_bgcolor = 'white'
    fig.layout.paper_bgcolor = 'white'
    fig.layout.yaxis.gridcolor = '#D3D3D3'

    fig.show()



In [2492]:
score_different_service_speeds_visual('jack.roddick@batfast.com', 'JACK RODDICK')

### Power Tennis Visual

In [2493]:
def tennis_power_visual(user_email, user_name):
    
    df = tennis_statistics.loc[(tennis_statistics['email']==user_email) & (tennis_statistics['name']==user_name)] 
    
    fig = go.Figure(go.Indicator(
        mode = "number+gauge",
        gauge = {'shape': "bullet",
                'axis': {'range': [None, 100], 'dtick':10}},
        number = {'suffix': "%", 'font':{'size': 40}},
        value = round(df.iloc[0]['power_percent'],0),
        domain = {'x': [0.1, 1], 'y': [0.2, 0.9]},
        title = {'text': "Avg. Return Power", 'font':{'size': 14},'align':'center'}))


    fig.show()
    
tennis_power_visual('jack.roddick@batfast.com', 'JACK RODDICK')

## Generating Tennis Visuals

In [2494]:
def generate_tennis_visuals(user_email, user_name):
    tennis_balls_faced_visual(user_email, user_name)
    tennis_total_score_visual(user_email, user_name)
    tennis_average_score_visual(user_email, user_name)
    tennis_ranking_visual(user_email, user_name)
    different_length_serve_average_score_visual(user_email, user_name)
    service_speed_visualisation_function(user_email, user_name)
    score_different_service_speeds_visual(user_email, user_name)
    tennis_power_visual(user_email, user_name)

In [2495]:
generate_tennis_visuals('jack.roddick@batfast.com', 'JACK RODDICK')

# Baseball Statistics

In [2496]:
baseball_df = combined_df.loc[combined_df['baseball']==1]
baseball_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1200701 entries, 2391 to 3110857
Data columns (total 43 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   sim_number     1200701 non-null  int64  
 1   timestamp_x    1200701 non-null  object 
 2   batfast_id     1200701 non-null  object 
 3   client_name    1200701 non-null  object 
 4   event_name     1200701 non-null  object 
 5   game_mode      1200701 non-null  object 
 6   score          315095 non-null   object 
 7   speed          848748 non-null   float64
 8   pitch          848748 non-null   float64
 9   swing          848748 non-null   float64
 10  pan            848748 non-null   float64
 11  turn           551342 non-null   float64
 12  r              4203 non-null     float64
 13  theta          4029 non-null     float64
 14  z              0 non-null        float64
 15  power          3970 non-null     float64
 16  machine        1200701 non-null  object 
 17  scori

### Total Baseball Deliveries Faced

In [2497]:
def user_baseball_deliveries_function(dataset):
    user_deliveries_faced_df = dataset.groupby(['email','name']).count()
    user_deliveries_faced_df = user_deliveries_faced_df['timestamp_x'].to_frame()
    user_deliveries_faced_df.rename(columns={"timestamp_x":'pitches_faced'},inplace=True)
    return user_deliveries_faced_df

In [2498]:
user_baseball_deliveries_function(baseball_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,pitches_faced
email,name,Unnamed: 2_level_1
aashiq.mortimer@htomail.co.uk,AASHIQ MORTIMER,30
adil_mortimer@hotmail.co.uk,ADIL MORTIMER,30
afrin_maci@hotmail.com,AFRAN MACI,12
albert.ws99@gmail.com,ALBERT WAYMAN,30
alex.mathews@batfast.com,ALEX MATHEWS,24
...,...,...
york@tenpin.co.uk,ROBIN SMITH,18
york@tenpin.co.uk,SAM,18
york@tenpin.co.uk,TOM HENDERSON,22
york@tenpin.co.uk,TRISH,13


### Total Baseball Runs Scored

In [2499]:
baseball_df['runs'].value_counts()

0.0    1029826
1.0     167411
4.0        982
2.0        839
5.0        822
3.0        413
6.0        408
Name: runs, dtype: int64

In [2500]:
def total_baseball_runs_scored_function(dataset):
    runs_scored_df = dataset[['email','name','runs']].groupby(['email','name']).sum()
    runs_scored_df = runs_scored_df['runs'].to_frame()
    runs_scored_df.rename(columns = {"runs":"total_runs_scored"},inplace=True) 
    return runs_scored_df

In [2501]:
total_baseball_runs_scored_function(baseball_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_runs_scored
email,name,Unnamed: 2_level_1
aashiq.mortimer@htomail.co.uk,AASHIQ MORTIMER,0.0
adil_mortimer@hotmail.co.uk,ADIL MORTIMER,0.0
afrin_maci@hotmail.com,AFRAN MACI,0.0
albert.ws99@gmail.com,ALBERT WAYMAN,25.0
alex.mathews@batfast.com,ALEX MATHEWS,13.0
...,...,...
york@tenpin.co.uk,ROBIN SMITH,0.0
york@tenpin.co.uk,SAM,0.0
york@tenpin.co.uk,TOM HENDERSON,0.0
york@tenpin.co.uk,TRISH,0.0


### Total Baseball Points Scored

In [2502]:
baseball_df['points'].value_counts()

 0.0      246716
 10.0      47889
-10.0      15502
 50.0       3714
 100.0      1274
Name: points, dtype: int64

In [2503]:
def total_baseball_points_scored_function(dataset):
    points_scored_df = dataset[['email','name','points']].groupby(['email','name']).sum()
    points_scored_df = points_scored_df['points'].to_frame()
    points_scored_df.rename(columns = {"points":"total_points_scored"},inplace=True) 
    return points_scored_df

In [2504]:
total_baseball_points_scored_function(baseball_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_points_scored
email,name,Unnamed: 2_level_1
aashiq.mortimer@htomail.co.uk,AASHIQ MORTIMER,120.0
adil_mortimer@hotmail.co.uk,ADIL MORTIMER,160.0
afrin_maci@hotmail.com,AFRAN MACI,-40.0
albert.ws99@gmail.com,ALBERT WAYMAN,0.0
alex.mathews@batfast.com,ALEX MATHEWS,0.0
...,...,...
york@tenpin.co.uk,ROBIN SMITH,160.0
york@tenpin.co.uk,SAM,200.0
york@tenpin.co.uk,TOM HENDERSON,320.0
york@tenpin.co.uk,TRISH,130.0


### Average Runs and Points Scored

In [2505]:
#It will be assumed that 'Bowled' refers to a 'Strike'
baseball_df['wicket'].value_counts()

Bowled    49266
Name: wicket, dtype: int64

In [2506]:
def average_baseball_runs_points_function(dataset):
    
    #NOTE: baseball average will be calculated as average per pitch
    
    #Obtaining total runs scored and then dividing by pitches faced
    baseball_pitches_faced = user_baseball_deliveries_function(dataset)
    total_baseball_runs_scored = total_baseball_runs_scored_function(dataset)
    
    #Obtaining average runs per pitch
    average_runs_per_pitch = baseball_pitches_faced.merge(total_baseball_runs_scored, 
                                                          left_index=True, right_index=True, how='inner')
    
    average_runs_per_pitch['average_runs'] = average_runs_per_pitch['total_runs_scored']/average_runs_per_pitch['pitches_faced']
    
    #Obtaining total points scored and then dividing by pitches faced
    total_baseball_points_scored = total_baseball_points_scored_function(dataset)
    
    #Obtaining average runs per pitch
    average_points_per_pitch = baseball_pitches_faced.merge(total_baseball_points_scored, 
                                                          left_index=True, right_index=True, how='inner')
    
    average_points_per_pitch['average_points'] = average_points_per_pitch['total_points_scored']/average_runs_per_pitch['pitches_faced']
    
    
    #Merging the two dataframes
    average_runs_points_per_pitch = average_points_per_pitch.merge(average_runs_per_pitch['average_runs'], 
                                                          left_index=True, right_index=True, how='inner')

    return average_runs_points_per_pitch[['average_points','average_runs']]
    

In [2507]:
average_baseball_runs_points_function(baseball_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,average_points,average_runs
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1
aashiq.mortimer@htomail.co.uk,AASHIQ MORTIMER,4.000000,0.000000
adil_mortimer@hotmail.co.uk,ADIL MORTIMER,5.333333,0.000000
afrin_maci@hotmail.com,AFRAN MACI,-3.333333,0.000000
albert.ws99@gmail.com,ALBERT WAYMAN,0.000000,0.833333
alex.mathews@batfast.com,ALEX MATHEWS,0.000000,0.541667
...,...,...,...
york@tenpin.co.uk,ROBIN SMITH,8.888889,0.000000
york@tenpin.co.uk,SAM,11.111111,0.000000
york@tenpin.co.uk,TOM HENDERSON,14.545455,0.000000
york@tenpin.co.uk,TRISH,10.000000,0.000000


### Baseball Comparative Performance

In [2508]:
def baseball_ranking_function(dataset):
    #Generating the necessary DataFrames
    average_baseball_runs_points_df = average_baseball_runs_points_function(baseball_df)
    baseball_pitches_faced = user_baseball_deliveries_function(dataset)
    total_baseball_runs_scored = total_baseball_runs_scored_function(dataset)
    total_baseball_points_scored = total_baseball_points_scored_function(dataset)
    
    rank_df_1 = baseball_pitches_faced.merge(total_baseball_runs_scored,
                                                          left_index=True, right_index=True, how='outer')
    rank_df_2 = rank_df_1.merge(total_baseball_points_scored,
                                                         left_index=True, right_index=True, how='outer')
    rank_df_3 = rank_df_2.merge(average_baseball_runs_points_df,
                                                          left_index=True, right_index=True, how='outer')
    
    
    #Ranking by Total Runs
    rank_df_3['runs_rank'] = rank_df_3['total_runs_scored'].rank(ascending=False)
    rank_df_3['runs_rank_percentile'] = (round(rank_df_3['runs_rank'].rank(pct=True),2))*100
    
    # Ranking by Total Points
    rank_df_3['points_rank'] = rank_df_3['total_points_scored'].rank(ascending=False)
    rank_df_3['points_rank_percentile'] = (round(rank_df_3['points_rank'].rank(pct=True),2))*100
    
    #Ranking Total Deliveries Faced
    rank_df_3['pitches_rank'] = rank_df_3['pitches_faced'].rank(ascending=False)
    rank_df_3['pitches_rank_percentile'] = (round(rank_df_3['pitches_rank'].rank(pct=True),2))*100

    #Ranking by Average Runs
    rank_df_3['average_runs_rank'] = rank_df_3['average_runs'].rank(ascending=False)
    rank_df_3['average_runs_rank_percentile'] = (round(rank_df_3['average_runs_rank'].rank(pct=True),2))*100

    #Ranking by Average Points
    rank_df_3['average_points_rank'] = rank_df_3['average_points'].rank(ascending=False)
    rank_df_3['average_points_rank_percentile'] = (round(rank_df_3['average_points_rank'].rank(pct=True),2))*100
    
    return rank_df_3

In [2509]:
baseball_ranking_function(baseball_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,pitches_faced,total_runs_scored,total_points_scored,average_points,average_runs,runs_rank,runs_rank_percentile,points_rank,points_rank_percentile,pitches_rank,pitches_rank_percentile,average_runs_rank,average_runs_rank_percentile,average_points_rank,average_points_rank_percentile
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
aashiq.mortimer@htomail.co.uk,AASHIQ MORTIMER,30,0.0,120.0,4.000000,0.000000,139.5,66.0,57.5,27.0,91.0,43.0,139.5,66.0,67.0,32.0
adil_mortimer@hotmail.co.uk,ADIL MORTIMER,30,0.0,160.0,5.333333,0.000000,139.5,66.0,49.5,24.0,91.0,43.0,139.5,66.0,58.0,28.0
afrin_maci@hotmail.com,AFRAN MACI,12,0.0,-40.0,-3.333333,0.000000,139.5,66.0,204.5,97.0,165.0,79.0,139.5,66.0,205.0,98.0
albert.ws99@gmail.com,ALBERT WAYMAN,30,25.0,0.0,0.000000,0.833333,32.0,15.0,142.5,68.0,91.0,43.0,20.0,10.0,142.5,68.0
alex.mathews@batfast.com,ALEX MATHEWS,24,13.0,0.0,0.000000,0.541667,41.5,20.0,142.5,68.0,111.0,53.0,32.0,15.0,142.5,68.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
york@tenpin.co.uk,ROBIN SMITH,18,0.0,160.0,8.888889,0.000000,139.5,66.0,49.5,24.0,136.5,65.0,139.5,66.0,38.5,18.0
york@tenpin.co.uk,SAM,18,0.0,200.0,11.111111,0.000000,139.5,66.0,40.0,19.0,136.5,65.0,139.5,66.0,21.5,10.0
york@tenpin.co.uk,TOM HENDERSON,22,0.0,320.0,14.545455,0.000000,139.5,66.0,21.5,10.0,122.0,58.0,139.5,66.0,10.0,5.0
york@tenpin.co.uk,TRISH,13,0.0,130.0,10.000000,0.000000,139.5,66.0,54.0,26.0,156.5,75.0,139.5,66.0,30.5,15.0


### Max-Min Pitch Speed

In [2510]:
def max_min_pitch_speed_function(dataset):
    max_speed_df = fastest_delivery_function(dataset)
    min_speed_df = slowest_delivery_function(dataset)
    
    max_min_speed_df = max_speed_df.merge(min_speed_df, left_index=True, right_index=True, how='outer')
    max_min_speed_df.rename(columns={"max_delivery_speed":"max_pitch_speed",
                                     'min_delivery_speed':'min_pitch_speed'},
                            inplace=True)
    
    return max_min_speed_df

In [2511]:
max_min_pitch_speed_function(baseball_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,max_pitch_speed,min_pitch_speed
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1
aashiq.mortimer@htomail.co.uk,AASHIQ MORTIMER,40.0,40.0
adil_mortimer@hotmail.co.uk,ADIL MORTIMER,40.0,40.0
afrin_maci@hotmail.com,AFRAN MACI,50.0,50.0
albert.ws99@gmail.com,ALBERT WAYMAN,44.0,40.0
alex.mathews@batfast.com,ALEX MATHEWS,45.0,30.0
...,...,...,...
york@tenpin.co.uk,ROBIN SMITH,50.0,35.0
york@tenpin.co.uk,SAM,45.0,35.0
york@tenpin.co.uk,TOM HENDERSON,50.0,40.0
york@tenpin.co.uk,TRISH,35.0,35.0


### Average Pitch Speed

In [2512]:
def average_speed_baseball_function(dataset):
    
    average_speed_df = dataset[['email','name','speed_adj']].groupby(['email','name']).sum()
    average_speed_df.rename(columns = {'speed_adj':'total_speed'}, inplace=True)
    baseball_deliveries_faced = user_deliveries_function(dataset)
    average_speed_df = average_speed_df.merge(baseball_deliveries_faced,
                                    left_index=True, right_index=True, how='outer')

    average_speed_df['average_speed'] = round(average_speed_df['total_speed']/average_speed_df['deliveries_faced'],1)
    average_speed_df.drop(columns=['total_speed','deliveries_faced'], inplace=True)
    average_speed_df.rename(columns={"average_speed":"average_pitch_speed_faced"}, inplace=True)
    return average_speed_df

In [2513]:
average_speed_baseball_function(baseball_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,average_pitch_speed_faced
email,name,Unnamed: 2_level_1
aashiq.mortimer@htomail.co.uk,AASHIQ MORTIMER,40.0
adil_mortimer@hotmail.co.uk,ADIL MORTIMER,40.0
afrin_maci@hotmail.com,AFRAN MACI,50.0
albert.ws99@gmail.com,ALBERT WAYMAN,41.2
alex.mathews@batfast.com,ALEX MATHEWS,33.5
...,...,...
york@tenpin.co.uk,ROBIN SMITH,44.7
york@tenpin.co.uk,SAM,37.8
york@tenpin.co.uk,TOM HENDERSON,44.5
york@tenpin.co.uk,TRISH,35.0


### Pitching Swing Analysis

Whilst movement in the air when it comes to baseball pitches is not called swing, for ease of interpretability, the terms 'away-swing' and 'in-swing' will be kept. Given the likes of sliders, knuckleballs, curveballs (all types of baseball pitches which move in the air) not only move sideways but also up and down as well, keeping the term swing also makes sense given there is not data in the dataset which contains information on the vertical movement of the ball mid-flight. 

In [2514]:
#RUNS
def average_baseball_runs_swing_function(dataset):
    #Identifying respective swing deliveries
    dataset['away_swing_runs'] = np.where(dataset['swing']>0,dataset['runs'],np.NaN)
    dataset['in_swing_runs'] = np.where(dataset['swing']<0,dataset['runs'],np.NaN)
    dataset['straight_runs'] = np.where(dataset['swing']==0,dataset['runs'],np.NaN)
    
    #Calculating average runs for each type of swing
    swing_analysis_df = dataset[['email','name','away_swing_runs','in_swing_runs','straight_runs']].groupby(['email','name']).mean()
    
    return swing_analysis_df

In [2515]:
average_baseball_runs_swing_function(baseball_df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,Unnamed: 1_level_0,away_swing_runs,in_swing_runs,straight_runs
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aashiq.mortimer@htomail.co.uk,AASHIQ MORTIMER,,,0.000000
adil_mortimer@hotmail.co.uk,ADIL MORTIMER,,,0.000000
afrin_maci@hotmail.com,AFRAN MACI,,,0.000000
albert.ws99@gmail.com,ALBERT WAYMAN,,,0.833333
alex.mathews@batfast.com,ALEX MATHEWS,0.375,1.75,0.250000
...,...,...,...,...
york@tenpin.co.uk,ROBIN SMITH,,,0.000000
york@tenpin.co.uk,SAM,,,0.000000
york@tenpin.co.uk,TOM HENDERSON,,,0.000000
york@tenpin.co.uk,TRISH,,,0.000000


In [2516]:
#POINTS
def average_baseball_points_swing_function(dataset):
    #Identifying respective swing deliveries
    dataset['away_swing_points'] = np.where(dataset['swing']>0,dataset['points'],np.NaN)
    dataset['in_swing_points'] = np.where(dataset['swing']<0,dataset['points'],np.NaN)
    dataset['straight_points'] = np.where(dataset['swing']==0,dataset['points'],np.NaN)
    
    #Calculating average runs for each type of swing
    swing_analysis_df = dataset[['email','name','away_swing_points','in_swing_points','straight_points']].groupby(['email','name']).mean()
    
    return swing_analysis_df

In [2517]:
average_baseball_points_swing_function(baseball_df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,Unnamed: 1_level_0,away_swing_points,in_swing_points,straight_points
email,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aashiq.mortimer@htomail.co.uk,AASHIQ MORTIMER,,,4.000000
adil_mortimer@hotmail.co.uk,ADIL MORTIMER,,,5.333333
afrin_maci@hotmail.com,AFRAN MACI,,,-3.333333
albert.ws99@gmail.com,ALBERT WAYMAN,,,0.000000
alex.mathews@batfast.com,ALEX MATHEWS,0.0,0.0,0.000000
...,...,...,...,...
york@tenpin.co.uk,ROBIN SMITH,,,8.888889
york@tenpin.co.uk,SAM,,,11.111111
york@tenpin.co.uk,TOM HENDERSON,,,14.545455
york@tenpin.co.uk,TRISH,,,10.000000


### Baseball Strike Analysis

In [2518]:
baseball_df['wicket'].value_counts()

Bowled    49266
Name: wicket, dtype: int64

In [2519]:
strikes_df = pd.get_dummies(baseball_df['wicket'])
strikes_df.rename(columns={"Bowled":"strikes"},inplace=True)
strikes_df

Unnamed: 0,strikes
2391,0
2392,0
2393,0
2394,0
2395,0
...,...
3110853,0
3110854,0
3110855,0
3110856,0


In [2520]:
baseball_df = baseball_df.merge(strikes_df, left_index=True, right_index=True, how='inner')
baseball_df

Unnamed: 0,sim_number,timestamp_x,batfast_id,client_name,event_name,game_mode,score,speed,pitch,swing,...,cricket,baseball,tennis,away_swing_runs,in_swing_runs,straight_runs,away_swing_points,in_swing_points,straight_points,strikes
2391,34,2019-11-17 17:22:16+00:00,bfs3400256,tenpin,tenpin-luton,Continuous_RR_BB,-10.0,4.0,-9.0,0.0,...,0,1,0,,,0.0,,,-10.0,0
2392,34,2019-11-17 17:22:46+00:00,bfs3400256,tenpin,tenpin-luton,Continuous_RR_BB,-10.0,4.0,-9.0,0.0,...,0,1,0,,,0.0,,,-10.0,0
2393,34,2019-11-17 17:23:07+00:00,bfs3400256,tenpin,tenpin-luton,Continuous_RR_BB,10.0,4.0,-9.0,0.0,...,0,1,0,,,0.0,,,10.0,0
2394,34,2019-11-17 17:23:29+00:00,bfs3400256,tenpin,tenpin-luton,Continuous_RR_BB,-10.0,4.0,-9.0,0.0,...,0,1,0,,,0.0,,,-10.0,0
2395,34,2019-11-17 17:23:58+00:00,bfs3400256,tenpin,tenpin-luton,Continuous_RR_BB,10.0,4.0,-9.0,0.0,...,0,1,0,,,0.0,,,10.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3110853,9,2020-03-13 17:51:22+00:00,bfs901869,batfast,36-hours-sports-relief,Training_BB,1,4.4,-4.2,0.0,...,0,1,0,,,1.0,,,0.0,0
3110854,9,2020-03-13 17:51:34+00:00,bfs901869,batfast,36-hours-sports-relief,Training_BB,1,4.4,-4.2,0.0,...,0,1,0,,,1.0,,,0.0,0
3110855,9,2020-03-13 17:51:46+00:00,bfs901869,batfast,36-hours-sports-relief,Training_BB,1,4.4,-4.2,0.0,...,0,1,0,,,1.0,,,0.0,0
3110856,9,2020-03-13 17:51:57+00:00,bfs901869,batfast,36-hours-sports-relief,Training_BB,1,4.4,-4.2,0.0,...,0,1,0,,,1.0,,,0.0,0


In [2521]:
strikes_df = baseball_df[['email','name','strikes']].groupby(['email','name']).sum()
strikes_df

Unnamed: 0_level_0,Unnamed: 1_level_0,strikes
email,name,Unnamed: 2_level_1
aashiq.mortimer@htomail.co.uk,AASHIQ MORTIMER,0
adil_mortimer@hotmail.co.uk,ADIL MORTIMER,0
afrin_maci@hotmail.com,AFRAN MACI,0
albert.ws99@gmail.com,ALBERT WAYMAN,7
alex.mathews@batfast.com,ALEX MATHEWS,3
...,...,...
york@tenpin.co.uk,ROBIN SMITH,0
york@tenpin.co.uk,SAM,0
york@tenpin.co.uk,TOM HENDERSON,0
york@tenpin.co.uk,TRISH,0


In [2522]:
def strikes_function(dataset):
    
    #Indicating which deliveries were strikes
    strikes_df = pd.get_dummies(dataset['wicket'])
    strikes_df.rename(columns={"Bowled":"strikes"},inplace=True)
    
    #Merging strikes indicators with main dataset
    dataset_df = dataset.merge(strikes_df, left_index=True, right_index=True, how='inner')
    
    #Calculating total strikes for each user group
    total_strikes_df = baseball_df[['email','name','strikes']].groupby(['email','name']).sum()
    
    return total_strikes_df

In [2523]:
strikes_function(baseball_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,strikes
email,name,Unnamed: 2_level_1
aashiq.mortimer@htomail.co.uk,AASHIQ MORTIMER,0
adil_mortimer@hotmail.co.uk,ADIL MORTIMER,0
afrin_maci@hotmail.com,AFRAN MACI,0
albert.ws99@gmail.com,ALBERT WAYMAN,7
alex.mathews@batfast.com,ALEX MATHEWS,3
...,...,...
york@tenpin.co.uk,ROBIN SMITH,0
york@tenpin.co.uk,SAM,0
york@tenpin.co.uk,TOM HENDERSON,0
york@tenpin.co.uk,TRISH,0


### Merging Baseball Statistics

In [2524]:
def generate_baseball_statistics(dataset):
    
    #Generating dataframes using above functions
    baseball_deliveries_faced_df_1 = user_baseball_deliveries_function(dataset)
    baseball_total_runs_scored_df_1 = total_baseball_runs_scored_function(dataset)
    baseball_total_points_scored_df_1 = total_baseball_points_scored_function(dataset)
    baseball_average_runs_points_df_1 = average_baseball_runs_points_function(dataset)
    baseball_ranking_df_1 = baseball_ranking_function(dataset)
    baseball_max_min_speed_df_1 = max_min_pitch_speed_function(dataset)
    baseball_average_speed_df_1 = average_speed_baseball_function(dataset)
    baseball_average_runs_swing_df_1 = average_baseball_runs_swing_function(dataset)
    baseball_average_points_swing_df_1 = average_baseball_points_swing_function(dataset)
    baseball_total_strikes_df_1 = strikes_function(dataset)
    baseball_average_power_df_1 = average_power_function(dataset)
    
    #Merging DataFrames
    
    baseball_statistics_1 = baseball_total_runs_scored_df_1.merge(baseball_deliveries_faced_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    baseball_statistics_2 = baseball_statistics_1.merge(baseball_total_points_scored_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    baseball_statistics_3 = baseball_statistics_2.merge(baseball_average_runs_points_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    baseball_statistics_4 = baseball_statistics_3.merge(baseball_ranking_df_1[['runs_rank','runs_rank_percentile','points_rank',
                                                                               'points_rank_percentile','pitches_rank','pitches_rank_percentile',
                                                                               'average_runs_rank','average_runs_rank_percentile',
                                                                               'average_points_rank','average_points_rank_percentile']],
                                                                      left_index=True, right_index=True, how='outer')
    baseball_statistics_5 = baseball_statistics_4.merge(baseball_max_min_speed_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    baseball_statistics_6 = baseball_statistics_5.merge(baseball_average_speed_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    baseball_statistics_7 = baseball_statistics_6.merge(baseball_average_runs_swing_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    baseball_statistics_8 = baseball_statistics_7.merge(baseball_average_points_swing_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    baseball_statistics_9 = baseball_statistics_8.merge(baseball_total_strikes_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    baseball_statistics_10 = baseball_statistics_9.merge(baseball_average_power_df_1,
                                                                      left_index=True, right_index=True, how='outer')
    
    baseball_statistics = baseball_statistics_10.copy()
    
    baseball_statistics.reset_index(inplace=True)
    
    return baseball_statistics

In [2525]:
baseball_base = combined_df.loc[combined_df['baseball']==1]

In [2526]:
baseball_statistics = generate_baseball_statistics(baseball_base)
baseball_statistics



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Unnamed: 0,email,name,total_runs_scored,pitches_faced,total_points_scored,average_points,average_runs,runs_rank,runs_rank_percentile,points_rank,...,min_pitch_speed,average_pitch_speed_faced,away_swing_runs,in_swing_runs,straight_runs,away_swing_points,in_swing_points,straight_points,strikes,power_percent
0,aashiq.mortimer@htomail.co.uk,AASHIQ MORTIMER,0.0,30,120.0,4.000000,0.000000,139.5,66.0,57.5,...,40.0,40.0,,,0.000000,,,4.000000,0,
1,adil_mortimer@hotmail.co.uk,ADIL MORTIMER,0.0,30,160.0,5.333333,0.000000,139.5,66.0,49.5,...,40.0,40.0,,,0.000000,,,5.333333,0,
2,afrin_maci@hotmail.com,AFRAN MACI,0.0,12,-40.0,-3.333333,0.000000,139.5,66.0,204.5,...,50.0,50.0,,,0.000000,,,-3.333333,0,
3,albert.ws99@gmail.com,ALBERT WAYMAN,25.0,30,0.0,0.000000,0.833333,32.0,15.0,142.5,...,40.0,41.2,,,0.833333,,,0.000000,7,
4,alex.mathews@batfast.com,ALEX MATHEWS,13.0,24,0.0,0.000000,0.541667,41.5,20.0,142.5,...,30.0,33.5,0.375,1.75,0.250000,0.0,0.0,0.000000,3,17.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,york@tenpin.co.uk,ROBIN SMITH,0.0,18,160.0,8.888889,0.000000,139.5,66.0,49.5,...,35.0,44.7,,,0.000000,,,8.888889,0,
206,york@tenpin.co.uk,SAM,0.0,18,200.0,11.111111,0.000000,139.5,66.0,40.0,...,35.0,37.8,,,0.000000,,,11.111111,0,
207,york@tenpin.co.uk,TOM HENDERSON,0.0,22,320.0,14.545455,0.000000,139.5,66.0,21.5,...,40.0,44.5,,,0.000000,,,14.545455,0,
208,york@tenpin.co.uk,TRISH,0.0,13,130.0,10.000000,0.000000,139.5,66.0,54.0,...,35.0,35.0,,,0.000000,,,10.000000,0,


## Baseball Visuals

In [2527]:
def baseball_deliveries_faced_visual(user_email, user_name):
    
    df = baseball_statistics.loc[(baseball_statistics['email']==user_email) & (baseball_statistics['name']==user_name)]
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['pitches_faced'],
    title = "Pitches Faced"))

    fig.show()

In [2528]:
baseball_deliveries_faced_visual('alex.mathews@batfast.com', 'ALEX MATHEWS')

### Total Baseball Runs and Points

In [2529]:
def total_baseball_runs_points_visual(user_email, user_name):
    
    df = baseball_statistics.loc[(baseball_statistics['email']==user_email) & (baseball_statistics['name']==user_name)]
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['total_runs_scored'],
    domain = {'row':0, 'column':0},
    title = "Total Runs"))
    
    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['total_points_scored'],
    domain = {'row':1, 'column':0},
    title = "Total Points"))
    
    fig.update_layout(
        grid = {'rows':2, 'columns':1, 'pattern': "independent"},
        )
    
    fig.show()

In [2530]:
total_baseball_runs_points_visual('alex.mathews@batfast.com', 'ALEX MATHEWS')

### Average Baseball Runs and Points

In [2531]:
def average_baseball_runs_points_visual(user_email, user_name):
    
    df = baseball_statistics.loc[(baseball_statistics['email']==user_email) & (baseball_statistics['name']==user_name)]
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['average_runs'],
    domain = {'row':0, 'column':0},
    title = "Average Runs per Pitch"))
    
    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['average_points'],
    domain = {'row':1, 'column':0},
    title = "Average Points per Pitch"))
    
    fig.update_layout(
        grid = {'rows':2, 'columns':1, 'pattern': "independent"},
        )
    
    fig.show()

In [2532]:
average_baseball_runs_points_visual('alex.mathews@batfast.com', 'ALEX MATHEWS')

### Baseball Ranking Visuals

In [2533]:
def baseball_ranking_visual(user_email, user_name):
    
    df = baseball_statistics.loc[(baseball_statistics['email']==user_email) & (baseball_statistics['name']==user_name)] 
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['runs_rank'],
        domain = {'row':0, 'column':0},
        title = '<b>Total Runs<b>',
        number = {'prefix':'Rank: ', 'suffix':f' / {len(baseball_statistics)}'}))

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['average_runs_rank'],
        domain = {'row':2, 'column':0},
        title = '<b>Average Runs<b>',
        number = {'prefix':'Rank: ', 'suffix':f' / {len(baseball_statistics)}'}))
    
    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['points_rank'],
        domain = {'row':0, 'column':2},
        title = '<b>Total Points<b>',
        number = {'prefix':'Rank: ', 'suffix':f' / {len(baseball_statistics)}'}))

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['average_points_rank'],
        domain = {'row':2, 'column':2},
        title = '<b>Average Points<b>',
        number = {'prefix':'Rank: ', 'suffix':f' / {len(baseball_statistics)}'}))

    fig.add_trace(go.Indicator(
        mode = "number",
        value = df.iloc[0]['pitches_rank'],
        domain = {'row':1, 'column':1},
        title = '<b>Total Pitches Faced<b>',
        number = {'prefix':'Rank: ', 'suffix':f' / {len(baseball_statistics)}'}))

    fig.update_layout(
            grid = {'rows':3, 'columns':3, 'pattern': "independent"})

    fig.show()

In [2534]:
baseball_ranking_visual('alex.mathews@batfast.com', 'ALEX MATHEWS')

### Pitching Speed Analysis

In [2535]:
def baseball_speed_visualisation(user_email, user_name):
    
    df = baseball_statistics.loc[(baseball_statistics['email']==user_email) & (baseball_statistics['name']==user_name)]

    fig = go.Figure(go.Indicator(
        mode = "gauge+number",
        value = df.iloc[0]['average_pitch_speed_faced'],
        number = {'prefix':"Avg: ",'suffix': "mph", 'font':{'size': 40}},
        title = {'text': "Minimum, Maximum and Average Pitch Speed", 'font':{'size': 25},'align':'center'},
        gauge = {'axis': {'range': [None, 100], 'dtick':10},
                 'bar': {'color':"black"},
                 'steps' : [
                     {'range': [0, df.iloc[0]['min_pitch_speed']], 'color': "lightgray"},
                     {'range': [df.iloc[0]['min_pitch_speed'], df.iloc[0]['max_pitch_speed']], 'color': "gray"}],
                 'threshold' : {'line': {'color': "red", 'width': 4}, 'thickness': 0.75, 'value': df.iloc[0]['max_pitch_speed']}}))

    fig.show()

In [2536]:
baseball_speed_visualisation('alex.mathews@batfast.com', 'ALEX MATHEWS')

### Swing Analysis

In [2537]:
#RUNS
def baseball_runs_swing_visual(user_email, user_name):
    
    df = baseball_statistics.loc[(baseball_statistics['email']==user_email) & (baseball_statistics['name']==user_name)] 
    
    fig_swing = go.Figure(
    data = [go.Bar(
            x = ['Away-swing', 'Straight', 'In-swing'],
            y = [df.iloc[0]['away_swing_runs'], df.iloc[0]['straight_runs'],
                df.iloc[0]['in_swing_runs']],
        text = [round(df.iloc[0]['away_swing_runs'],2), round(df.iloc[0]['straight_runs'],2),
                round(df.iloc[0]['in_swing_runs'],2)],
        textposition = 'outside',
        marker_color = ['red','black','blue']
    )])

    fig_swing.update_layout(title={'text': '<b>Pitch Swing Analysis (Runs)</b>','x':0.5},
                  xaxis_title_text='<b>Swing Direction</b>',
                 yaxis_title_text='<b>Average Runs per Pitch</b>')

    fig_swing.update_xaxes(type='category', showgrid=False, tickangle = 0, ticks="outside",
                showline=True, linewidth=1, linecolor='#121212', tickfont_size=20)

    fig_swing.update_traces(textposition='outside',textfont_size=15)

    fig_swing.layout.plot_bgcolor = 'white'
    fig_swing.layout.paper_bgcolor = 'white'
    fig_swing.layout.yaxis.gridcolor = '#D3D3D3'

    fig_swing.show()

In [2538]:
baseball_runs_swing_visual('alex.mathews@batfast.com', 'ALEX MATHEWS')

In [2539]:
#POINTS
def baseball_points_swing_visual(user_email, user_name):
    
    df = baseball_statistics.loc[(baseball_statistics['email']==user_email) & (baseball_statistics['name']==user_name)] 
    
    fig_swing = go.Figure(
    data = [go.Bar(
            x = ['Away-swing', 'Straight', 'In-swing'],
            y = [df.iloc[0]['away_swing_points'], df.iloc[0]['straight_points'],
                df.iloc[0]['in_swing_points']],
        text = [round(df.iloc[0]['away_swing_points'],2), round(df.iloc[0]['straight_points'],2),
                round(df.iloc[0]['in_swing_points'],2)],
        textposition = 'outside',
        marker_color = ['red','black','blue']
    )])

    fig_swing.update_layout(title={'text': '<b>Pitch Swing Analysis (Points)</b>','x':0.5},
                  xaxis_title_text='<b>Swing Direction</b>',
                 yaxis_title_text='<b>Average Points per Pitch</b>')

    fig_swing.update_xaxes(type='category', showgrid=False, tickangle = 0, ticks="outside",
                showline=True, linewidth=1, linecolor='#121212', tickfont_size=20)

    fig_swing.update_traces(textposition='outside',textfont_size=15)

    fig_swing.layout.plot_bgcolor = 'white'
    fig_swing.layout.paper_bgcolor = 'white'
    fig_swing.layout.yaxis.gridcolor = '#D3D3D3'

    fig_swing.show()

In [2540]:
baseball_points_swing_visual('alex.mathews@batfast.com', 'ALEX MATHEWS')

### Strike Analysis Visuals

In [2541]:
def strike_visuals(user_email, user_name):

    df = baseball_statistics.loc[(baseball_statistics['email']==user_email) & (baseball_statistics['name']==user_name)] 
    
    fig = go.Figure()

    fig.add_trace(go.Indicator(
    mode = "number",
    value = df.iloc[0]['strikes'],
    title = 'Total Strikes'))

    fig.show()

In [2542]:
strike_visuals('alex.mathews@batfast.com', 'ALEX MATHEWS')

### Baseball Power Visuals

In [2543]:
def baseball_power_visual(user_email, user_name):
    
    df = baseball_statistics.loc[(baseball_statistics['email']==user_email) & (baseball_statistics['name']==user_name)] 
    
    fig = go.Figure(go.Indicator(
        mode = "number+gauge",
        gauge = {'shape': "bullet",
                'axis': {'range': [None, 100], 'dtick':10}},
        number = {'suffix': "%", 'font':{'size': 40}},
        value = round(df.iloc[0]['power_percent'],0),
        domain = {'x': [0.1, 1], 'y': [0.2, 0.9]},
        title = {'text': "Avg. Hit Power", 'font':{'size': 14},'align':'center'}))


    fig.show()
    
baseball_power_visual('jack.roddick@batfast.com', 'JACK RODDICK')

## Generating Baseball Visuals

In [2544]:
def generate_baseball_visuals(user_email, user_name):
    
    print("<b>Baseball Pitches Faced<b>")
    baseball_deliveries_faced_visual(user_email, user_name)
    
    print("<b>Baseball Runs and Points<b>")
    total_baseball_runs_points_visual(user_email, user_name)
    
    print("<b>Baseball Average Runs and Points<b>")
    average_baseball_runs_points_visual(user_email, user_name)
    
    print("<b>Baseball Rankings<b>")
    baseball_ranking_visual(user_email, user_name)
    
    print("<b>Speed of Pitches<b>")
    baseball_speed_visualisation(user_email, user_name)
    
    print("<b>Swing Analysis<b>")
    baseball_runs_swing_visual(user_email, user_name)
    baseball_points_swing_visual(user_email, user_name)
    
    print("<b>Total Strikes<b>")
    strike_visuals(user_email, user_name)
    
    print("<b>Average Power Visuals<b>")
    baseball_power_visual(user_email, user_name)

In [2545]:
generate_baseball_visuals('alex.mathews@batfast.com', 'ALEX MATHEWS')

<b>Baseball Pitches Faced<b>


<b>Baseball Runs and Points<b>


<b>Baseball Average Runs and Points<b>


<b>Baseball Rankings<b>


<b>Speed of Pitches<b>


<b>Swing Analysis<b>


<b>Total Strikes<b>


<b>Average Power Visuals<b>
