In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as scs
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier 

In [None]:
df = pd.read_excel('dec16-OutboundForAnalysis.xlsx')
df.head()

In [None]:
df.columns

In [None]:
#drop last 2 columns because they are messing up data

df.drop(columns=['Unnamed: 14','Unnamed: 15'],inplace=True)

In [None]:
df.head()

In [None]:
# Make scatter matrix to visualize 

pd.plotting.scatter_matrix(df, figsize=(20,20))
plt.show()

# EDA On Parameters

## EDA - (1) Approach Vertical

In [None]:
df['Approach Vertical'].plot('hist')

In [None]:
df['Approach Vertical'].describe(), 

In [None]:
approach_mu = df['Approach Vertical'].mean()
approach_std = df['Approach Vertical'].std()

min95 = approach_mu-2*approach_std
max95 = approach_mu+2*approach_std


def get_rank_approach_vertical(vert):
    """Map an Approach Vertical value to a 1-5 rank by distance from the mean.

    Thresholds use the module-level ``approach_mu`` and ``approach_std``:
    5 above mu + 1 std, 4 above mu, 3 above mu - 1 std, 2 above mu - 2 std,
    otherwise 1. Comparisons are strict; NaN fails every comparison and so
    falls through to rank 1.
    """
    cutoffs = (
        (approach_mu + 1 * approach_std, 5),
        (approach_mu, 4),
        (approach_mu - approach_std, 3),
        (approach_mu - 2 * approach_std, 2),
    )
    for cutoff, rank in cutoffs:
        if vert > cutoff:
            return rank
    return 1

In [None]:
df['approach_vertical_rank'] = df['Approach Vertical'].apply(get_rank_approach_vertical)
df.head()

In [None]:
# object
# attributes (properties, adjectives)
# methods (verbs)

# object.attributes
# object.method()

av_rank_counts = pd.value_counts(df['approach_vertical_rank'].values, sort=False)
av_rank_counts.plot.barh()


In [None]:
# Observation

## EDA - (2) Vertical Jump

In [None]:
df['Vertical Jump'].plot('hist')

In [None]:
df['Vertical Jump'].describe(), 

In [None]:
vertical_mu = df['Vertical Jump'].mean()
vertical_std = df['Vertical Jump'].std()

min95 = vertical_mu-2*vertical_std
max95 = vertical_mu+2*vertical_std


def get_rank_vertical_jump(vert):
    """Map a Vertical Jump value to a 1-5 rank by distance from the mean.

    Thresholds use the module-level ``vertical_mu`` and ``vertical_std``:
    5 above mu + 1 std, 4 above mu, 3 above mu - 1 std, 2 above mu - 2 std,
    otherwise 1. Comparisons are strict; NaN fails every comparison and so
    falls through to rank 1.
    """
    cutoffs = (
        (vertical_mu + 1 * vertical_std, 5),
        (vertical_mu, 4),
        (vertical_mu - vertical_std, 3),
        (vertical_mu - 2 * vertical_std, 2),
    )
    for cutoff, rank in cutoffs:
        if vert > cutoff:
            return rank
    return 1

In [None]:
df['vertical_jump_rank'] = df['Vertical Jump'].apply(get_rank_vertical_jump)
df.head()

In [None]:
av_rank_counts_vert_jump = pd.value_counts(df['vertical_jump_rank'].values, sort=False)
av_rank_counts_vert_jump.plot.barh()

In [None]:
# Observations

## EDA - (3) Reaction Shuttle

In [None]:
df['Reaction Shuttle'].plot('hist')

In [None]:
df['Reaction Shuttle'].describe(), 

In [None]:
shuttle_mu = df['Reaction Shuttle'].mean()
shuttle_std = df['Reaction Shuttle'].std()

min95 = shuttle_mu-2*shuttle_std
max95 = shuttle_mu+2*shuttle_std


def get_rank_reaction_shuttle(shut):
    """Map a Reaction Shuttle value to a 1-5 rank by distance from the mean.

    Thresholds use the module-level ``shuttle_mu`` and ``shuttle_std``:
    5 above mu + 1 std, 4 above mu, 3 above mu - 1 std, 2 above mu - 2 std,
    otherwise 1. Comparisons are strict; NaN fails every comparison and so
    falls through to rank 1.
    """
    # NOTE(review): larger values get the higher rank; if this column is a
    # timed drill where lower is better, confirm that ordering is intended.
    cutoffs = (
        (shuttle_mu + 1 * shuttle_std, 5),
        (shuttle_mu, 4),
        (shuttle_mu - shuttle_std, 3),
        (shuttle_mu - 2 * shuttle_std, 2),
    )
    for cutoff, rank in cutoffs:
        if shut > cutoff:
            return rank
    return 1

In [None]:
df['reaction_shuttle_rank'] = df['Reaction Shuttle'].apply(get_rank_reaction_shuttle)
df.head()

In [None]:
av_rank_counts_shuttle = pd.value_counts(df['reaction_shuttle_rank'].values, sort=False)
av_rank_counts_shuttle.plot.barh()

In [None]:
# Observations - need to take outliers out and re weigh/balance data

## EDA - (4) BAM Score

In [None]:
df['BAMScore'].plot('hist')

In [None]:
df['BAMScore'].describe(), 

In [None]:
bam_mu = df['BAMScore'].mean()
bam_std = df['BAMScore'].std()

min95 = bam_mu-2*bam_std
max95 = bam_mu+2*bam_std


def get_rank_bam_score(bam):
    """Map a BAMScore value to a 1-5 rank by distance from the mean.

    Thresholds use the module-level ``bam_mu`` and ``bam_std``:
    5 above mu + 1 std, 4 above mu, 3 above mu - 1 std, 2 above mu - 2 std,
    otherwise 1. Comparisons are strict; NaN fails every comparison and so
    falls through to rank 1.
    """
    cutoffs = (
        (bam_mu + 1 * bam_std, 5),
        (bam_mu, 4),
        (bam_mu - bam_std, 3),
        (bam_mu - 2 * bam_std, 2),
    )
    for cutoff, rank in cutoffs:
        if bam > cutoff:
            return rank
    return 1

In [None]:
df['bam_score_rank'] = df['BAMScore'].apply(get_rank_bam_score)
df.head()

In [None]:
av_rank_bam_score = pd.value_counts(df['bam_score_rank'].values, sort=False)
av_rank_bam_score.plot.barh()

In [None]:
# Observations - A lot of values in 4 and 3 rank - Maybe break up into 10 ranks. We see a very normal distribution here.

## EDA - (5) Wingspan

In [None]:
df['Wingspan'].plot('hist')

In [None]:
df['Wingspan'].describe(), 

In [None]:
wingspan_mu = df['Wingspan'].mean()
wingspan_std = df['Wingspan'].std()

min95 = wingspan_mu-2*wingspan_std
max95 = wingspan_mu+2*wingspan_std


def get_rank_wingspan(wing):
    """Map a Wingspan value to a 1-5 rank by distance from the mean.

    Thresholds use the module-level ``wingspan_mu`` and ``wingspan_std``:
    5 above mu + 1 std, 4 above mu, 3 above mu - 1 std, 2 above mu - 2 std,
    otherwise 1. Comparisons are strict; NaN fails every comparison and so
    falls through to rank 1.
    """
    cutoffs = (
        (wingspan_mu + 1 * wingspan_std, 5),
        (wingspan_mu, 4),
        (wingspan_mu - wingspan_std, 3),
        (wingspan_mu - 2 * wingspan_std, 2),
    )
    for cutoff, rank in cutoffs:
        if wing > cutoff:
            return rank
    return 1

In [None]:
df['wingspan_rank'] = df['Wingspan'].apply(get_rank_wingspan)
df.head()

In [None]:
av_rank_wingspan = pd.value_counts(df['wingspan_rank'].values, sort=False)
av_rank_wingspan.plot.barh()

## EDA - (6) Reach

In [None]:
df['Reach'].plot('hist')

In [None]:
df['Reach'].describe(), 

In [None]:
reach_mu = df['Reach'].mean()
reach_std = df['Reach'].std()

min95 = reach_mu-2*reach_std
max95 = reach_mu+2*reach_std


def get_rank_reach(reach):
    """Map a Reach value to a 1-5 rank by distance from the mean.

    Thresholds use the module-level ``reach_mu`` and ``reach_std``:
    5 above mu + 1 std, 4 above mu, 3 above mu - 1 std, 2 above mu - 2 std,
    otherwise 1. Comparisons are strict; NaN fails every comparison and so
    falls through to rank 1.
    """
    cutoffs = (
        (reach_mu + 1 * reach_std, 5),
        (reach_mu, 4),
        (reach_mu - reach_std, 3),
        (reach_mu - 2 * reach_std, 2),
    )
    for cutoff, rank in cutoffs:
        if reach > cutoff:
            return rank
    return 1

In [None]:
df['reach_rank'] = df['Reach'].apply(get_rank_reach)
df.head()

In [None]:
av_rank_reach = pd.value_counts(df['reach_rank'].values, sort=False)
av_rank_reach.plot.barh()

In [None]:
# Observations

## EDA - (7) Height

In [None]:
df['Height'].plot('hist')

In [None]:
df['Height'].describe(), 

In [None]:
height_mu = df['Height'].mean()
height_std = df['Height'].std()

min95 = height_mu-2*height_std
max95 = height_mu+2*height_std


def get_rank_height(height):
    """Map a Height value to a 1-5 rank by distance from the mean.

    Thresholds use the module-level ``height_mu`` and ``height_std``:
    5 above mu + 1 std, 4 above mu, 3 above mu - 1 std, 2 above mu - 2 std,
    otherwise 1. Comparisons are strict; NaN fails every comparison and so
    falls through to rank 1.
    """
    cutoffs = (
        (height_mu + 1 * height_std, 5),
        (height_mu, 4),
        (height_mu - height_std, 3),
        (height_mu - 2 * height_std, 2),
    )
    for cutoff, rank in cutoffs:
        if height > cutoff:
            return rank
    return 1

In [None]:
df['height_rank'] = df['Height'].apply(get_rank_height)
df.head()

In [None]:
av_rank_height = pd.value_counts(df['height_rank'].values, sort=False)
av_rank_height.plot.barh()

In [None]:
# Observations

## EDA - (8) Weight

In [None]:
df['Weight'].plot('hist')

In [None]:
df['Weight'].describe(), 

In [None]:
weight_mu = df['Weight'].mean()
weight_std = df['Weight'].std()

min95 = weight_mu-2*weight_std
max95 = weight_mu+2*weight_std


def get_rank_weight(weight):
    """Map a Weight value to a 1-5 rank by distance from the mean.

    Thresholds use the module-level ``weight_mu`` and ``weight_std``:
    5 above mu + 1 std, 4 above mu, 3 above mu - 1 std, 2 above mu - 2 std,
    otherwise 1. Comparisons are strict; NaN fails every comparison and so
    falls through to rank 1.
    """
    cutoffs = (
        (weight_mu + 1 * weight_std, 5),
        (weight_mu, 4),
        (weight_mu - weight_std, 3),
        (weight_mu - 2 * weight_std, 2),
    )
    for cutoff, rank in cutoffs:
        if weight > cutoff:
            return rank
    return 1

In [None]:
df['weight_rank'] = df['Weight'].apply(get_rank_weight)
df.head()

In [None]:
av_rank_weight = pd.value_counts(df['weight_rank'].values, sort=False)
av_rank_weight.plot.barh()

In [None]:
# Observations

## EDA - (9) Body Comp

In [None]:
df['Body Comp'].plot('hist')

In [None]:
df['Body Comp'].describe(), 

In [None]:
body_comp_mu = df['Body Comp'].mean()
body_comp_std = df['Body Comp'].std()

min95 = body_comp_mu-2*body_comp_std
max95 = body_comp_mu+2*body_comp_std


def get_rank_body_comp(body):
    """Map a Body Comp value to a 1-5 rank by distance from the mean.

    Thresholds use the module-level ``body_comp_mu`` and ``body_comp_std``:
    5 above mu + 1 std, 4 above mu, 3 above mu - 1 std, 2 above mu - 2 std,
    otherwise 1. Comparisons are strict; NaN fails every comparison and so
    falls through to rank 1.
    """
    # The rank-5 threshold previously added wingspan_std (a copy-paste slip
    # from the Wingspan section); it must use this column's own std.
    if body > body_comp_mu + body_comp_std:
        return 5
    if body > body_comp_mu:
        return 4
    if body > body_comp_mu - body_comp_std:
        return 3
    if body > body_comp_mu - 2*body_comp_std:
        return 2
    return 1

In [None]:
df['body_comp_rank'] = df['Body Comp'].apply(get_rank_body_comp)
df.head()

In [None]:
av_rank_body_comp = pd.value_counts(df['body_comp_rank'].values, sort=False)
av_rank_body_comp.plot.barh()

In [None]:
# Observations

## EDA - (10) Hand Length

In [None]:
df['Hand Length'].plot('hist')

In [None]:
df['Hand Length'].describe(), 

In [None]:
hand_length_mu = df['Hand Length'].mean()
hand_length_std = df['Hand Length'].std()

min95 = hand_length_mu-2*hand_length_std
max95 = hand_length_mu+2*hand_length_std


def get_rank_hand_length(handlength):
    """Map a Hand Length value to a 1-5 rank by distance from the mean.

    Thresholds use the module-level ``hand_length_mu`` and
    ``hand_length_std``: 5 above mu + 1 std, 4 above mu, 3 above mu - 1 std,
    2 above mu - 2 std, otherwise 1. Comparisons are strict; NaN fails every
    comparison and so falls through to rank 1.
    """
    cutoffs = (
        (hand_length_mu + 1 * hand_length_std, 5),
        (hand_length_mu, 4),
        (hand_length_mu - hand_length_std, 3),
        (hand_length_mu - 2 * hand_length_std, 2),
    )
    for cutoff, rank in cutoffs:
        if handlength > cutoff:
            return rank
    return 1

In [None]:
df['hand_length_rank'] = df['Hand Length'].apply(get_rank_hand_length)
df.head()

In [None]:
av_rank_hand_length = pd.value_counts(df['hand_length_rank'].values, sort=False)
av_rank_hand_length.plot.barh()

In [None]:
# Observations

## EDA - (11) Hand Width

In [None]:
df['Hand Width'].plot('hist')

In [None]:
df['Hand Width'].describe(), 

In [None]:
hand_width_mu = df['Hand Width'].mean()
hand_width_std = df['Hand Width'].std()

min95 = hand_width_mu-2*hand_width_std
max95 = hand_width_mu+2*hand_width_std


def get_rank_hand_width(handwidth):
    """Map a Hand Width value to a 1-5 rank by distance from the mean.

    Thresholds use the module-level ``hand_width_mu`` and ``hand_width_std``:
    5 above mu + 1 std, 4 above mu, 3 above mu - 1 std, 2 above mu - 2 std,
    otherwise 1. Comparisons are strict; NaN fails every comparison and so
    falls through to rank 1.
    """
    cutoffs = (
        (hand_width_mu + 1 * hand_width_std, 5),
        (hand_width_mu, 4),
        (hand_width_mu - hand_width_std, 3),
        (hand_width_mu - 2 * hand_width_std, 2),
    )
    for cutoff, rank in cutoffs:
        if handwidth > cutoff:
            return rank
    return 1

In [None]:
df['hand_width_rank'] = df['Hand Width'].apply(get_rank_hand_width)
df.head()

In [None]:
av_rank_hand_width = pd.value_counts(df['hand_width_rank'].values, sort=False)
av_rank_hand_width.plot.barh()

# Feature Importance Analysis

In [None]:
df

In [None]:
#df['3/4 Court sprint  '] = df['3/4 Court sprint  '].astype(float)

In [None]:
for index,col in enumerate(df['3/4 Court sprint  ']):
    try:
        float(col)
    except ValueError:
        print (index,col)

In [None]:
df.loc[371,'3/4 Court sprint  ']

In [None]:
df['3/4 Court sprint  '] = df['3/4 Court sprint  '].replace(' ',np.NaN)
df['3/4 Court sprint  ']

In [None]:
x = df.drop(columns=['BAMid', 'BAMScore','bam_score_rank','approach_vertical_rank', 'vertical_jump_rank',
       'reaction_shuttle_rank','wingspan_rank','reach_rank','height_rank','weight_rank','body_comp_rank','hand_length_rank','hand_width_rank'])
y = df[['bam_score_rank']]

In [None]:
x.info()

In [None]:
x = x.replace(to_replace=['Nan','NAN'], value=np.nan)

In [None]:
x = x.fillna(x.mean())
# FORMULA = df.fillna(df.mean())

In [None]:
x.isnull().sum()

# repeat for 4 way agility

In [None]:
#df['4-Way agility'] = df['4-Way agility'].astype(float)

In [None]:
for index,col in enumerate(df['4-Way agility']):
    try:
        float(col)
    except ValueError:
        print (index,col)

In [None]:
df.loc[377,'4-Way agility']

In [None]:
df['4-Way agility'] = df['4-Way agility'].replace(' ',np.NaN)
df['4-Way agility']

In [None]:
x = df.drop(columns=['BAMid', 'BAMScore','bam_score_rank','approach_vertical_rank', 'vertical_jump_rank',
       'reaction_shuttle_rank','wingspan_rank','reach_rank','height_rank','weight_rank','body_comp_rank','hand_length_rank','hand_width_rank'])
y = df[['bam_score_rank']]

In [None]:
x.info()

In [None]:
x = x.replace(to_replace=['Nan','NAN'], value=np.nan)

In [None]:
x = x.fillna(x.mean())
# FORMULA = df.fillna(df.mean())

In [None]:
x.isnull().sum()

In [None]:
# Fixed seed so the fitted forest, and the feature importances read from it,
# are reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=42)
# y is a one-column DataFrame; flatten it to the 1-D target scikit-learn
# expects (avoids the column-vector DataConversionWarning).
rf.fit(x, np.ravel(y))

#find string in data that is messing it up

In [None]:
print(x.columns)
print(rf.feature_importances_)

In [None]:
values = rf.feature_importances_
names = x.columns
plt.figure(figsize=(13,8))
plt.grid(zorder=0)
plt.bar(names,values,zorder=2)
plt.xticks(rotation=75)
plt.show

plt.title('Feature Importance with respect to Bam Score')
plt.xlabel('Parameters')
plt.ylabel('Feature Importance')

In [None]:
df_cleaned = x.copy()

In [None]:
df_cleaned['bam_score'] = y
df_cleaned.head(2)

In [None]:
plt.figure(figsize=(13, 8))
sns.violinplot(x='bam_score', y='Reaction Shuttle', data=df_cleaned)
plt.show()

In [None]:
plt.figure(figsize=(8, 5))
sns.lmplot(x='bam_score', y='Reaction Shuttle', data=df_cleaned)
plt.show()

In [None]:
# Reaction Shuttle is the strongest delineator of their rank

## EDA - (10) Hand Length cont.

In [None]:
# Hand Length threshold identifier

In [None]:
hl_mu = df['Hand Length'].mean()
hl_std = df['Hand Length'].std()
print(hl_mu, hl_std)
lower_95 = hl_mu-2*hl_std
df.shape, df[df['Hand Length']>7].shape

In [None]:
df.isna().sum()

In [None]:
hl_min = df['Hand Length'].min()

jbs = []
thresholds = np.linspace(hl_min, hl_mu, 10)
for threshold in thresholds:
    vals = df[df['Hand Length'].fillna(hl_mu)>threshold]['Hand Length'].fillna(hl_mu)
    plt.hist(vals, bins=20)
    plt.title("histogram of hand length\n{}".format(threshold))
    plt.show()
    jb = scs.jarque_bera(vals)
    jbs.append(jb[0])

In [None]:
plt.scatter(thresholds, jbs)

In [None]:
# Conclusion - 4.672 threshold

# Observations

### Hand length is messing everything up and I need to cut it. It contains lots of outliers.
### Let's find these outliers and make a list of them to discuss with Martin.
#### --> Once I find the outliers, Martin and I will decide whether to fix or drop them.

### Code to remove outliers from Height and Weight Categories

In [None]:
#df.describe()

In [None]:
#df.loc[df['Weight']>0].describe()

In [None]:
#df.loc[df['Height']>100.0]

In [None]:
#df_no_outlier = df.loc[df['Height']<100.0]

In [None]:
#pd.plotting.scatter_matrix(df_no_outlier, figsize=(20,20))
#plt.show()

In [None]:
#df.loc[df['Height']<60.0]

## Notes for Martin 1/29