# Exploratory Data Analysis


This Notebook examines the data at a high (summary) level.

---------
```
Zach Wolpe
zachcolinwolpe@gmail.com
29 May 2021
```
---------

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import os
import re
import sys
sys.path.append('../process data/')
import plotly.graph_objects as go
from process_data_raw_class import batch_processing

In [2]:
with open('../data objects/batch_processing_object.pkl', 'rb') as file2:
    bp = pickle.load(file2)

In [3]:
bp.describe_data()



        ------------------------------------------------------------------
            self.path            : raw data loc
            self.metadata        : mturk metadata
            self.mapping         : reference table
            self.data_times      : reference times table
            self.participants    : list of participant identifiers
            self.parti_code      : list of participant codes
            self.n               : total number of samples
            self.wcst_paths      : paths to wcst  raw data
            self.nback_paths     : paths to nback raw data
            self.corsi_paths     : paths to corsi raw data
            self.fitts_paths     : paths to fitts raw data
            self.navon_paths     : paths to navon raw data
            self.wcst_data       : wcst  dataframe
            self.nback_data      : nback dataframe
            self.corsi_data      : corsi dataframe
            self.fitts_data      : fitts dataframe
            self.navon_data    

In [4]:
import plotly.express as px
df = bp.individual_data[['age_a']]
fig = px.histogram(df, x='age_a')
fig.show()



In [24]:
 bp.individual_data.columns

 

Index(['participant', 'participant_file', 'version', 'server-time',
       'user_agent', 'screen_width', 'screen_height', 'html_width',
       'html_height', 'user_time', 'T', 'Welcome_Screen',
       'Welcome_Screen_stime', 'Welcome_Screen_T', 'Welcome_Screen_t',
       'participant_code', 'participant_code_stime', 'participant_code_t',
       'participant_code_a', 'feedback', 'feedback_stime', 'feedback_T',
       'feedback_t', 'age', 'age_stime', 'age_T', 'age_t', 'age_a', 'gender',
       'gender_stime', 'gender_T', 'gender_t', 'gender_a', 'handedness',
       'handedness_stime', 'handedness_T', 'handedness_t', 'handedness_a',
       'education', 'education_stime', 'education_T', 'education_t',
       'education_a', 'income', 'income_stime', 'income_T', 'income_t',
       'income_a', 'income_s', 'computer_hours', 'computer_hours_stime',
       'computer_hours_T', 'computer_hours_t', 'computer_hours_a',
       'computer_hours_s', 'wcst_task', 'wcst_task_stime', 'wcst_task_T',
      

In [8]:
pd.options.display.max_rows = 4000





# ------- Demographics Encoding --------x

# q: Gender
# - male
# - female
# - other
# - prefer not to say

# q: Handedness
# - right
# - left
# - ambidextrous

# q: What is your highest level of education?
# - primary school
# - high school
# - university
# - graduate school

# l: income
# q: Compared with the average, what is your income on a scale from 1 to 10 with 5 being average?
# - {min=1,max=10,left=low,right=high,start=5}

# l: computer_hours
# q: How many hours do you spend playing computer games (per week)
# - {min=0,max=100,left=low,right=high,start=0}

# ------- Demographics Encoding --------x


df = bp.individual_data[['participant', 'participant_file', 'user_agent', 'Welcome_Screen_T', 'participant_code_a', 'feedback_T', 'age_T', 'age_a', 'gender_T', 'gender_a',
                        'handedness_T', 'handedness_a', 'education_T', 'education_a', 'income_T', 'income_a', 'income_s', 'computer_hours_T', 'computer_hours_a', 'computer_hours_s']]

# ---- extract clean data ----x
df             = df[df['age_a'].replace(np.NaN, 'na').str.isnumeric()]          # remove nonsensical data
df.iloc[:, 3:] = df.iloc[:, 3:].astype('float')                                 # convert to float
df             = df[df['gender_a'].notnull()]                                   # Nan data

# ---- create age groupings ----x
bins            = [0, 25, 35, 45, 55, 65, 120]
labels          = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
df['age_group'] = pd.cut(df['age_a'], bins, labels=labels, include_lowest=True)

# ---- gender ----x
df['gender_a'][df['gender_a'] == 1] = 'male'
df['gender_a'][df['gender_a'] == 2] = 'female'
df['gender_a'][df['gender_a'] == 3] = 'other'
df['gender_a'][df['gender_a'] == 4] = 'other'

# ---- handedness ----x
df['handedness_a'][df['handedness_a'] == 1] = 'right'
df['handedness_a'][df['handedness_a'] == 2] = 'left'
df['handedness_a'][df['handedness_a'] == 3] = 'ambidextrous'

# ---- education ----x
df['education_a'][df['education_a'] == 1] = 'primary school'
df['education_a'][df['education_a'] == 2] = 'high school'
df['education_a'][df['education_a'] == 3] = 'university'
df['education_a'][df['education_a'] == 4] = 'graduate school'





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/i

In [15]:
# ---- select grouping ----x
grouping = ['gender_a', 'education_a', 'handedness_a', 'age_group']
var      = ['age_a', 'income_a', 'computer_hours_a']



grps = grouping
vrs  = var
fig = go.Figure()
from plotly.subplots import make_subplots

fig = make_subplots(rows=len(vrs), cols=len(grps))





In [13]:
grps = grouping
vrs  = var
fig = go.Figure()
from plotly.subplots import make_subplots

fig = make_subplots(rows=len(vrs), cols=len(grps))



# for v in vrs:
    # --- for each continuous variable ----x

c=0; v=vrs[1]
for gp in grps:
    c+=1

    gg = np.unique(df[gp])
    for g in gg:

        fig.add_trace(go.Histogram(
            x           =df[v][df[gp] == g],
            histnorm    ='percent',
            name        =g, 
            opacity     =1
        ), row=1, col=c)


fig.update_layout(
    barmode         ='overlay',
    title_text      ='Age Distribution', 
    xaxis_title_text='Age', 
    yaxis_title_text='Count', 
    bargap          =0.05, 
    bargroupgap     =0.1 
)

fig.update_layout(barmode='group')
fig.show()

In [217]:
df.columns

Index(['participant', 'participant_file', 'user_agent', 'Welcome_Screen_T',
       'participant_code_a', 'feedback_T', 'age_T', 'age_a', 'gender_T',
       'gender_a', 'handedness_T', 'handedness_a', 'education_T',
       'education_a', 'income_T', 'income_a', 'income_s', 'computer_hours_T',
       'computer_hours_a', 'computer_hours_s', 'age_group'],
      dtype='object')

In [223]:
import plotly.express as px

fig = px.scatter(df, x="age_a", y="income_a", color="age_group",
                 )
fig.show()

In [181]:
import plotly.graph_objects as go


x = 'income_a'
x = 'computer_hours_a'

fig = go.Figure()
fig.add_trace(go.Histogram(
    x           =df[x][df['gender_a'] == 'male'],
    histnorm    ='percent',
    name        ='male', 
    marker_color='lightblue',
    opacity     =1
))
fig.add_trace(go.Histogram(
    x           =df[x][df['gender_a'] == 'female'],
    histnorm    ='percent',
    name        ='female', 
    marker_color='pink',
    opacity     =1
))

fig.update_layout(
    barmode         ='overlay',
    title_text      ='Age Distribution', 
    xaxis_title_text='Age', 
    yaxis_title_text='Count', 
    bargap          =0.05, 
    bargroupgap     =0.1 
)

fig.update_layout(barmode='group')



fig.show()

In [185]:





# bins = [18, 30, 40, 50, 60, 70, 120]
# labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70+']
# ages['agerange'] = pd.cut(ages.age, bins, labels = labels,include_lowest = True)

# print(ages)

0      25-34
1      18-24
2      25-34
3      45-54
4      25-34
5      25-34
6      18-24
7      18-24
8      25-34
9      35-44
10     25-34
14     35-44
15     25-34
16     35-44
17     45-54
18     25-34
19     45-54
20     35-44
21     18-24
22     25-34
23     25-34
24     25-34
25     55-64
26     45-54
27     55-64
28     25-34
29     35-44
30     25-34
31     25-34
32     35-44
33     35-44
34     25-34
35     25-34
36     25-34
37     18-24
38     25-34
41     25-34
42     25-34
43     18-24
44     18-24
46     18-24
47     35-44
48     18-24
50     55-64
54     18-24
55     25-34
56     35-44
57     25-34
58     18-24
59     18-24
61     45-54
63     25-34
64     35-44
65     18-24
66     25-34
67     35-44
68     45-54
69     55-64
70     35-44
71     18-24
72     35-44
74     25-34
75     25-34
76     25-34
77     18-24
78     25-34
79     55-64
80     18-24
81     45-54
82     35-44
83     25-34
84     45-54
85     35-44
86     25-34
87     35-44
88     25-34
89     25-34

In [96]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.express as px
import numpy as np


np.random.seed(2020)

app = dash.Dash(__name__)

app.layout = html.Div([
    dcc.Graph(id="graph"),
    html.P("Mean:"),
    dcc.Slider(id="mean", min=-3, max=3, value=0, 
               marks={-3: '-3', 3: '3'}),
    html.P("Standard Deviation:"),
    dcc.Slider(id="std", min=1, max=3, value=1, 
               marks={1: '1', 3: '3'}),
])

@app.callback(
    Output("graph", "figure"), 
    [Input("mean", "value"), 
     Input("std", "value")])
def display_color(mean, std):
    data = np.random.normal(mean, std, size=500)
    fig = px.histogram(data, nbins=30, range_x=[-10, 10])
    return fig

app.run_server(host = '127.0.0.1', debug=True)

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


SystemExit: 1

NameError: name 'f' is not defined