# Exploratory Data Analysis


This Notebook examines the data at a high (summary) level.

---------
```
Zach Wolpe
zachcolinwolpe@gmail.com
29 May 2021
```
---------

In [13]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import os
import re
import sys
sys.path.append('../process data/')
import plotly.graph_objects as go
import plotly.express as px
from process_data_raw_class import batch_processing

In [4]:
# Restore the previously saved batch-processing object from disk.
# NOTE(review): pickle can execute arbitrary code while loading — only open
# snapshots that were written by this project.
with open('../data objects/batch_processing_object.pkl', 'rb') as snapshot:
    bp = pickle.load(snapshot)

In [5]:
# Show the processing object's own listing of the attributes it exposes.
bp.describe_data()



        ------------------------------------------------------------------
            self.path            : raw data loc
            self.metadata        : mturk metadata
            self.mapping         : reference table
            self.data_times      : reference times table
            self.participants    : list of participant identifiers
            self.parti_code      : list of participant codes
            self.n               : total number of samples
            self.wcst_paths      : paths to wcst  raw data
            self.nback_paths     : paths to nback raw data
            self.corsi_paths     : paths to corsi raw data
            self.fitts_paths     : paths to fitts raw data
            self.navon_paths     : paths to navon raw data
            self.wcst_data       : wcst  dataframe
            self.nback_data      : nback dataframe
            self.corsi_data      : corsi dataframe
            self.fitts_data      : fitts dataframe
            self.navon_data    

# Transformations

Perform any transformations required:
 - remove _NaN_ data
 - select desired columns
 - decode dummy variables (numeric answer codes → text labels)

In [6]:
import warnings

# Silence every warning for the rest of the session and let pandas render
# up to 4000 rows before it truncates a displayed frame.
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 4000)


# ------- Demographics Encoding --------x
# q: Gender
# - male
# - female
# - other
# - prefer not to say

# q: Handedness
# - right
# - left
# - ambidextrous

# q: What is your highest level of education?
# - primary school
# - high school
# - university
# - graduate school

# l: income
# q: Compared with the average, what is your income on a scale from 1 to 10 with 5 being average?
# - {min=1,max=10,left=low,right=high,start=5}

# l: computer_hours
# q: How many hours do you spend playing computer games (per week)
# - {min=0,max=100,left=low,right=high,start=0}
# ------- Demographics Encoding --------x


# Demographic answers used in this notebook. The first three columns identify the
# participant / session; every column from position 3 onwards is converted to float below.
df = bp.individual_data[['participant', 'participant_file', 'user_agent', 'Welcome_Screen_T', 'participant_code_a', 'feedback_T', 'age_T', 'age_a', 'gender_T', 'gender_a',
                        'handedness_T', 'handedness_a', 'education_T', 'education_a', 'income_T', 'income_a', 'income_s', 'computer_hours_T', 'computer_hours_a', 'computer_hours_s']]

# ---- extract clean data ----x
# Keep only rows whose age answer is a digit-only string. Missing answers are filled with a
# non-numeric placeholder first so the mask is strictly boolean (no NaN entries).
# (`np.NaN` is avoided: that alias was removed in NumPy 2.0.)
has_numeric_age = df['age_a'].fillna('na').str.isnumeric()
df = df[has_numeric_age]                                                        # remove nonsensical data

# Replace the answer columns with float versions. `astype` builds new columns, whereas
# assigning through `.iloc[:, 3:]` may write in place and leave the dtype unchanged.
df = df.astype({column: 'float' for column in df.columns[3:]})                  # convert to float

# Drop rows without a gender answer; `.copy()` hands later cells an independent frame
# instead of a slice of the filtered intermediate.
df = df[df['gender_a'].notnull()].copy()                                        # Nan data

# ---- create age groupings ----x
# pd.cut closes every bin on the RIGHT by default, so each edge is the highest age that belongs
# to the group: [0, 24] -> '18-24', (24, 34] -> '25-34', ..., (64, 120] -> '65+'.
# (With edges 25, 35, ... an age of exactly 25 would be labelled '18-24', 35 -> '25-34', etc.)
# The first bin starts at 0, so any age below 18 is also labelled '18-24'; ages above 120 get NaN.
# Assumes ages are whole numbers (the cleaning step keeps digit-only answers).
bins            = [0, 24, 34, 44, 54, 64, 120]
labels          = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
df['age_group'] = pd.cut(df['age_a'], bins, labels=labels, include_lowest=True)

# ---- decode categorical answers ----x
# Translate the numeric answer codes into their text labels. Each column is rebuilt with
# `Series.replace` and attached via `assign`, instead of writing through chained indexing
# (`df[col][mask] = value`), which may modify a temporary copy and leave `df` unchanged.
# Codes that are not listed in a mapping are left as they are.
answer_labels = {
    # ---- gender ----x  (code 4 is folded into 'other'; presumably 'prefer not to say')
    'gender_a':     {1: 'male', 2: 'female', 3: 'other', 4: 'other'},
    # ---- handedness ----x
    'handedness_a': {1: 'right', 2: 'left', 3: 'ambidextrous'},
    # ---- education ----x
    'education_a':  {1: 'primary school', 2: 'high school', 3: 'university', 4: 'graduate school'},
}
df = df.assign(**{column: df[column].replace(codes) for column, codes in answer_labels.items()})



# Categorical Distributions

In [122]:
def pie_chart(dummy_var, labels, colors, title):
    """Show a donut chart of how often each category of ``df[dummy_var]`` occurs.

    Parameters
    ----------
    dummy_var : str
        Column of the notebook-level ``df`` whose values are counted.
    labels : list
        Category names to display. Each slice's size is looked up by its label, so the
        list does not have to be in frequency order; a label with no rows gets a count of 0.
        Categories present in the data but missing from ``labels`` are appended at the end
        so that no rows are silently left out.
    colors : list
        Slice colours, paired with the (possibly extended) label list by position.
    title : str
        Figure title.
    """
    # Count on plain objects so categorical columns only report categories that occur.
    counts  = df[dummy_var].astype(object).value_counts()
    ordered = list(labels)
    ordered += [category for category in counts.index if category not in ordered]

    # Align counts with the labels by name rather than by position.
    values = counts.reindex(ordered, fill_value=0).tolist()

    fig = go.Figure(data=[go.Pie(labels=ordered, values=values, hole=.4)])
    fig.update_traces(textfont_size=15, marker=dict(colors=colors, line=dict(color='white', width=0)))
    fig.update(layout_title_text=title)
    fig.show()

In [121]:
# NOTE: the hard-coded label lists below are written in the order the author expects the
# categories to rank by frequency — verify against the data if the sample changes.

# ---- gender pie plot ----x
pie_chart(dummy_var='gender_a', labels=['male', 'female', 'other'], colors=['steelblue', 'cyan', 'darkred'], title='Gender Distribution')

# ---- education pie plot ----x
pie_chart(dummy_var='education_a', labels=['university', 'graduate school', 'high school'], title='Education Distribution',
    colors=['rgb(177, 127, 38)', 'rgb(129, 180, 179)', 'rgb(205, 152, 36)'])

# ---- handedness pie plot ----x
pie_chart(dummy_var='handedness_a', labels=['right', 'left', 'ambidextrous'], title='Handedness Distribution', colors=px.colors.sequential.RdBu)

# ---- age distribution ----x
# value_counts() orders the groups by frequency, not alphabetically, so the labels are read
# from that same ordering; np.unique() would sort them and pair labels with the wrong counts.
age_labels = df[['age_group']].value_counts().index.get_level_values(0).tolist()
pie_chart(dummy_var='age_group', labels=age_labels, title='Age Distribution', colors=px.colors.sequential.GnBu)

# Continuous Variable Distributions

In [None]:
# Names of the categorical (grouping) columns and the continuous columns of `df`.
# NOTE(review): neither list is referenced again in the visible cells.
grouping = ['gender_a', 'education_a', 'handedness_a', 'age_group']
var      = ['age_a', 'income_a', 'computer_hours_a']

# Row count per age group (displayed as the cell's output).
df[['age_group']].value_counts()

In [23]:
def distributional_plots(continuous_var, cat_var, categories, labels, colors, xlab, ylab, title):
    """Show grouped histograms of one continuous column, one trace per category.

    Parameters
    ----------
    continuous_var : str
        Column of the notebook-level ``df`` plotted on the x axis.
    cat_var : str
        Column of ``df`` used to split the rows into traces.
    categories : list
        Values of ``cat_var`` to plot; one histogram trace is added per entry, in order.
    labels : list
        Legend names, matched to ``categories`` by position. If the list is shorter than
        ``categories`` the category value itself is used as the name.
    colors : list
        Bar colours, matched by position and reused cyclically when there are more
        categories than colours.
    xlab, ylab, title : str
        Axis titles and figure title.
    """
    fig = go.Figure()
    for position, category in enumerate(categories):
        trace_name  = labels[position] if position < len(labels) else str(category)
        # Wrap around the palette instead of raising IndexError on a short colour list.
        trace_color = colors[position % len(colors)] if len(colors) else None
        fig.add_trace(go.Histogram(
            x           =df.loc[df[cat_var] == category, continuous_var],
            # Bars show raw counts; add histnorm='percent' here for percentages.
            name        =trace_name,
            marker_color=trace_color,
            opacity     =1
        ))

    # Bars of the different categories are drawn side by side ('group').
    fig.update_layout(
        barmode         ='group',
        title_text      =title,
        xaxis_title_text=xlab,
        yaxis_title_text=ylab,
        bargap          =0.05,
        bargroupgap     =0.1
    )
    fig.show()


In [24]:
# ---- Age / Income / Computer Hours distributions, split by gender ----x
# One histogram figure per continuous measure; only the column and its axis title vary.
for column, axis_title in (('age_a', 'Age'), ('income_a', 'Income'), ('computer_hours_a', 'Computer Hours')):
    distributional_plots(
        continuous_var=column,
        cat_var='gender_a',
        categories=['male', 'female'],
        labels=['male', 'female'],
        colors=['lightblue', 'pink'],
        xlab=axis_title,
        ylab='Count',
        title=axis_title + ' Distribution (by gender)')

In [25]:
# ---- Age / Income / Computer Hours distributions, split by education ----x
# The distinct education levels serve both as the categories to plot and as the legend names.
education_levels = np.unique(df[['education_a']]).tolist()

for column, axis_title in (('age_a', 'Age'), ('income_a', 'Income'), ('computer_hours_a', 'Computer Hours')):
    distributional_plots(
        continuous_var=column,
        cat_var='education_a',
        categories=list(education_levels),
        labels=list(education_levels),
        colors=['#00537a', '#ffa9bc', '#8fe1ff'],
        xlab=axis_title,
        ylab='Count',
        title=axis_title + ' Distribution (by education)')

In [26]:
# ---- Age / Income / Computer Hours distributions, split by handedness ----x
# The distinct handedness values serve both as the categories to plot and as the legend names.
handedness_levels = np.unique(df[['handedness_a']]).tolist()

for column, axis_title in (('age_a', 'Age'), ('income_a', 'Income'), ('computer_hours_a', 'Computer Hours')):
    distributional_plots(
        continuous_var=column,
        cat_var='handedness_a',
        categories=list(handedness_levels),
        labels=list(handedness_levels),
        colors=px.colors.sequential.GnBu,
        xlab=axis_title,
        ylab='Count',
        title=axis_title + ' Distribution (by handedness)')

In [27]:
# ---- Age / Income / Computer Hours distributions, split by age group ----x
# The distinct age groups serve both as the categories to plot and as the legend names.
age_groups = np.unique(df[['age_group']]).tolist()

for column, axis_title in (('age_a', 'Age'), ('income_a', 'Income'), ('computer_hours_a', 'Computer Hours')):
    distributional_plots(
        continuous_var=column,
        cat_var='age_group',
        categories=list(age_groups),
        labels=list(age_groups),
        colors=px.colors.sequential.RdBu,
        xlab=axis_title,
        ylab='Count',
        title=axis_title + ' Distribution (by age_group)')