# Using Python and Jupyter for Data Analysis

### Install required packages (do this only once):

In [None]:
import sys
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install seaborn
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install statsmodels
!{sys.executable} -m pip install scipy
!{sys.executable} -m pip install sklearn

### Load a few libraries

In [None]:
import pandas as pd               # This is the workhorse for everything Data Science in Python
import matplotlib.pyplot as plt   # very powerful python plotting library
import seaborn as sns             # high level plots
import statsmodels.api as sm      # a Python statistics module 
import scipy                      # a big Python module for everything science data exploration
import numpy as np                # very big Python module for numerical calculations
from sklearn.linear_model import LinearRegression  # we are just using the LR part of the ML module

#### Read the data

In [None]:
df = pd.read_csv('data/data1000.csv')  # Load the provided CSV file (path is relative to the notebook's working directory) into a DataFrame

In [None]:
# What does the data look like? (a bare expression on the last line of a cell is displayed)
df

In [None]:
# For a DataFrame, keys() returns the column labels
df.keys()

#### Create a histogram plot of reaction times of females and males

In [None]:
# Global settings shared by the grouping and plotting cells below
GROUPBY = ['Gender']             # dataset will be grouped by these columns
VALUE_COLUMN = 'Reaction_time'   # the column used as a value (should be numerical)
X_TITLE = VALUE_COLUMN           # the title for the x-axis
# The title for the y-axis must be a plain string: passing the GROUPBY list
# itself to set_ylabel() would render its repr, e.g. "['Gender']".
Y_TITLE = ', '.join(GROUPBY)
SCALED = True                    # normalized histograms or not

In [None]:
df_G = df.groupby(GROUPBY)       # group the rows by the GROUPBY columns; df_G is reused by the plotting cells below

In [None]:
# Selecting VALUE_COLUMN from the grouped frame and calling .plot.hist() draws
# one histogram per group with a single command. The keyword arguments only
# tidy things up: figure size, a legend, transparency so overlapping bars stay
# visible, and optional normalization (density=SCALED).

_ = df_G[VALUE_COLUMN].plot.hist(figsize=(12,8), legend=True, alpha=0.5, density=SCALED)

#### Fixing the labels using the figure directly:

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))   # one figure holding a single set of axes
# Draw every group's histogram onto the axes we control, so the labels can be set afterwards.
df_G[VALUE_COLUMN].hist(ax=ax, legend=True, alpha=0.5, density=SCALED)
_ = ax.set_xlabel(X_TITLE)
# The y-axis shows densities when SCALED is set, raw counts otherwise.
_ = ax.set_ylabel('Normalized Frequency' if SCALED else 'Frequency')
plt.show()

### Now we are trying to deal with the 'outliers':

We need to do this for the groups separately:

In [None]:
# Assumes exactly two groups, coded 'F' and 'M'.
# .where() keeps the original index and puts NaN in every row of the other group.
df_f = df[VALUE_COLUMN].where(df['Gender'].eq('F'))
df_m = df[VALUE_COLUMN].where(df['Gender'].eq('M'))

In [None]:
def remove_outliers(s, low=None, high=None, index=False):
    """Filter a Series with an IQR-based fence rule.

    A value is kept when it lies inside the closed interval
    ``[s.quantile(low) - 1.5 * IQR, s.quantile(high) + 1.5 * IQR]``.
    The IQR is the one returned by ``scipy.stats.iqr`` with its default range
    (NaNs omitted); it does not depend on ``low`` / ``high``.

    Parameters
    ----------
    s : pandas.Series
        Numerical values; NaN entries never fall inside the interval.
    low : float, optional
        Lower quantile. Defaults to 0.25 when not given. An explicit ``0`` is
        honoured (it is no longer mistaken for "not given").
    high : float, optional
        Upper quantile. Defaults to ``1 - low`` when not given.
    index : bool, optional
        If true, return the boolean mask instead of the filtered values.

    Returns
    -------
    pandas.Series
        The values inside the interval, or the boolean mask when ``index`` is true.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.stats` is loaded on every SciPy version.
    from scipy.stats import iqr as _iqr

    if low is None:
        low = 0.25   # standard for outlier definition
    if high is None:
        high = 1 - low

    fence = 1.5 * _iqr(s, nan_policy='omit')
    inside = s.between(s.quantile(low) - fence, s.quantile(high) + fence)
    return inside if index else s[inside]

In [None]:
def outlier_cutoff(s):
    """Report the IQR of ``s`` and the 1.5 * IQR outlier bounds.

    The lower bound is the 0.25 quantile minus 1.5 * IQR, the upper bound is
    the 0.75 quantile plus 1.5 * IQR. NaN values are omitted from the IQR.
    Returns a dict with the keys 'iqr', 'lower bound' and 'upper bound'.
    """
    spread = scipy.stats.iqr(s, nan_policy='omit')
    first_quartile = s.quantile(0.25)
    third_quartile = s.quantile(0.75)
    return {
        'iqr': spread,
        'lower bound': first_quartile - 1.5*spread,
        'upper bound': third_quartile + 1.5*spread,
    }

In [None]:
# Remove the outliers of each gender separately, using the first quartile as
# the lower quantile (the upper one then defaults to 1 - low).
df_fc = remove_outliers(df_f, low=0.25)
df_mc = remove_outliers(df_m, low=0.25)

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))   # new figure with a single set of axes
# Females are drawn first, males second; the legend entries below rely on that order.
for cleaned in (df_fc, df_mc):
    cleaned.plot.hist(ax=ax, legend=True, alpha=0.5, density=SCALED)
# To zoom in on the bulk of the distribution, uncomment:
# _ = ax.set_xlim((0,1.5))
_ = ax.legend(["Female", "Male"])
_ = ax.set_xlabel(X_TITLE)
_ = ax.set_ylabel('Normalized Frequency' if SCALED else 'Frequency')


### We can achieve the same without even knowing the number or names of the groups

In [None]:
# define a few things upfront
group_names = list(df_G.groups.keys())  # all group keys present in the data
quant = 0.25  # lower quantile handed to remove_outliers

fig, ax = plt.subplots(figsize=(12, 8))  # new figure with a single set of axes

# Iterate over the grouped column itself instead of looking groups up by key:
# every (key, values) pair it yields is a group that can actually be plotted,
# so no blanket try/except is needed to step over unusable keys.
plotted_labels = []
for key, values in df_G[VALUE_COLUMN]:
    cleaned = remove_outliers(values, low=quant)
    if cleaned.empty:
        continue  # nothing left to draw for this group
    _ = cleaned.plot.hist(ax=ax, legend=True, alpha=0.5, density=SCALED)
    # Depending on the pandas version, grouping by a list can yield tuple keys.
    label = ', '.join(map(str, key)) if isinstance(key, tuple) else str(key)
    plotted_labels.append(label)

# Label only the histograms that were drawn, in drawing order, so that the
# legend entries cannot get shifted against the data.
_ = ax.legend(plotted_labels)
_ = ax.set_xlabel(X_TITLE)
_ = ax.set_ylabel('Normalized Frequency' if SCALED else 'Frequency')

Let's do something a bit more exciting!

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))   # new figure with a single set of axes

# Horizontal box plot: one box per group, with the values along the x-axis.
_ = df.boxplot(column=VALUE_COLUMN, by=GROUPBY, vert=False, ax=ax)
_ = ax.set_xlabel(X_TITLE)
_ = ax.set_ylabel(Y_TITLE)
_ = ax.set_title('')   # blank out the axes title

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))  # new figure with a single set of axes

dfc = df
# Optionally restrict the data before plotting:
# dfc = df.loc[df.Reaction_time <= 1.5]

# NOTE(review): recent seaborn releases reportedly prefer `bw_method` over `bw`;
# confirm against the installed version before renaming the keyword.
if len(GROUPBY) == 2:
    # two grouping columns: first on the x-axis, second as the split halves
    _ = sns.violinplot(data=dfc, x=GROUPBY[0], y=VALUE_COLUMN, hue=GROUPBY[1], split=True, inner="quartile", bw=0.25, cut=0, ax=ax)
else:
    # One grouping column: put everything into a single dummy category so the
    # groups become the two halves of one violin. assign() returns a new frame,
    # so the helper column is not added to the original `df`.
    violin_data = dfc.assign(dummy='A')
    _ = sns.violinplot(data=violin_data, x='dummy', y=VALUE_COLUMN, hue=GROUPBY[0], width=0.25, split=True, inner="quartile", bw=0.25, cut=0, ax=ax)
    _ = ax.set_xticks([])   # the dummy category carries no information
# The violins are vertical: the values run along y, the grouping along x.
_ = ax.set_xlabel(GROUPBY[0])
_ = ax.set_ylabel(VALUE_COLUMN)
_ = ax.set_title('')

In [None]:
# Scatter plot of arm span against height, with a fitted regression line
fig, ax = plt.subplots(figsize=(12, 8))
_ = sns.regplot(data=df, x='Height', y='Arm_Span', ax=ax)

# Residuals of that regression, drawn in a second figure
fig, ax = plt.subplots(figsize=(12, 8))
_ = sns.residplot(data=df, x='Height', y='Arm_Span', ax=ax)

### The same fit, but using a feature-rich statistics library

In [None]:
Y = df['Arm_Span']                     # dependent variable; hypothesis: Arm_Span ~= Height
X = sm.add_constant(df['Height'])      # add a constant column so the model can fit an intercept
model = sm.OLS(endog=Y, exog=X).fit()  # ordinary least squares fit
print(model.summary())                 # show the fitting summary

### R-squared is 0.508, a reasonable fit, but the data has quite a few outliers

### Cleanup outliers:

In [None]:
out = model.outlier_test()     # per-observation outlier statistics of the first fit

# Keep the observations whose absolute studentized residual is at most 2.
select = out['student_resid'].abs() <= 2.
keep = select.values           # plain boolean array, so the selection below is positional

yfit_df = pd.DataFrame(model.fittedvalues)  # the fitted values (on the line)
Xdf = pd.DataFrame(X.values[:, 1])          # second column of X
Ydf = pd.DataFrame(Y)

# Apply the same mask to the x values, the y values and the fitted values.
Xclean, Yclean, yfit_clean = (frame.loc[keep].values for frame in (Xdf, Ydf, yfit_df))

n_removed = len(df['Height']) - len(Xclean)
print("{0} outliers identified:".format(n_removed))

# perform the fit without outliers
Xfit = sm.add_constant(Xclean)
cfit = sm.OLS(Yclean, Xfit).fit()
print("Fit results without outliers:")
print(cfit.summary())

### R-squared is now 0.791: quite an improvement!

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))   # new figure with a single set of axes

ax.scatter(df['Height'], Y)                                 # all observations
ax.plot(df['Height'], model.fittedvalues, color="red")      # fit line with outliers
ax.scatter(Xclean, Yclean, color="orange")                  # points selected
ax.plot(Xclean, cfit.fittedvalues, color="green")           # fit line without outliers
ax.set_xlabel('Height [cm]')
ax.set_ylabel('Arm span [cm]')
plt.show()

In [None]:
# Print the IQR and the lower/upper outlier bounds of df_f (the VALUE_COLUMN values where Gender == 'F')
print(outlier_cutoff(df_f))

In [None]:
# NOTE(review): looks like a leftover scratch cell; the result is not assigned to anything. Confirm it can be removed.
np.sqrt(0.)