# Clean Processed Data
## Data Preprocessing

The data has been processed & captured int a _class object: batch_processing_ - storing the experimental & user demographic data in pandas dataframes.

#### Objective
This notebook aims to identify & handle missing, invalid or incomplete entries.

#### Output
- Store an instance of the _class object: batch_processing_ after additional cleaning.
- Store a description of the transformations performed
- The resulting data should be ready for:
    - Statistics test
    - Modelling
    - Rendering graphics
    
---------
```
Zach Wolpe
zachcolinwolpe@gmail.com
03 June 2021
```
---------


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import os
import re
import sys
sys.path.append('../process data/')
import plotly.graph_objects as go
import plotly.express as px
from process_data_raw_class import batch_processing

In [2]:
with open('../data objects/batch_processing_object.pkl', 'rb') as file2:
    bp = pickle.load(file2)

In [3]:
bp.describe_data()



        ------------------------------------------------------------------
            self.path            : raw data loc
            self.metadata        : mturk metadata
            self.mapping         : reference table
            self.data_times      : reference times table
            self.participants    : list of participant identifiers
            self.parti_code      : list of participant codes
            self.n               : total number of samples
            self.wcst_paths      : paths to wcst  raw data
            self.nback_paths     : paths to nback raw data
            self.corsi_paths     : paths to corsi raw data
            self.fitts_paths     : paths to fitts raw data
            self.navon_paths     : paths to navon raw data
            self.wcst_data       : wcst  dataframe
            self.nback_data      : nback dataframe
            self.corsi_data      : corsi dataframe
            self.fitts_data      : fitts dataframe
            self.navon_data    

# Identify Missing Data

- Identify missing or corrupt entries
- relate those entries to participants


In [16]:
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 4000


# ------- Demographics Encoding --------x
# q: Gender
# - male
# - female
# - other
# - prefer not to say

# q: Handedness
# - right
# - left
# - ambidextrous

# q: What is your highest level of education?
# - primary school
# - high school
# - university
# - graduate school

# l: income
# q: Compared with the average, what is your income on a scale from 1 to 10 with 5 being average?
# - {min=1,max=10,left=low,right=high,start=5}

# l: computer_hours
# q: How many hours do you spend playing computer games (per week)
# - {min=0,max=100,left=low,right=high,start=0}
# ------- Demographics Encoding --------x


df = bp.individual_data[['participant', 'participant_file', 'user_agent', 'Welcome_Screen_T', 'participant_code_a', 'feedback_T', 'age_T', 'age_a', 'gender_T', 'gender_a',
                        'handedness_T', 'handedness_a', 'education_T', 'education_a', 'income_T', 'income_a', 'income_s', 'computer_hours_T', 'computer_hours_a', 'computer_hours_s']]

# ---- extract clean data ----x
df             = df[df['age_a'].replace(np.NaN, 'na').str.isnumeric()]          # remove nonsensical data
df.iloc[:, 3:] = df.iloc[:, 3:].astype('float')                                 # convert to float
original       = df.copy()                                                      # store original
df             = df[df['gender_a'].notnull()]                                   # Nan data

# ---- create age groupings ----x
bins            = [0, 25, 35, 45, 55, 65, 120]
labels          = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
df['age_group'] = pd.cut(df['age_a'], bins, labels=labels, include_lowest=True)

# ---- gender ----x
df['gender_a'][df['gender_a'] == 1] = 'male'
df['gender_a'][df['gender_a'] == 2] = 'female'
df['gender_a'][df['gender_a'] == 3] = 'other'
df['gender_a'][df['gender_a'] == 4] = 'other'

# ---- handedness ----x
df['handedness_a'][df['handedness_a'] == 1] = 'right'
df['handedness_a'][df['handedness_a'] == 2] = 'left'
df['handedness_a'][df['handedness_a'] == 3] = 'ambidextrous'

# ---- education ----x
df['education_a'][df['education_a'] == 1] = 'primary school'
df['education_a'][df['education_a'] == 2] = 'high school'
df['education_a'][df['education_a'] == 3] = 'university'
df['education_a'][df['education_a'] == 4] = 'graduate school'

df.head()

Unnamed: 0,participant,participant_file,user_agent,Welcome_Screen_T,participant_code_a,feedback_T,age_T,age_a,gender_T,gender_a,...,handedness_a,education_T,education_a,income_T,income_a,income_s,computer_hours_T,computer_hours_a,computer_hours_s,age_group
0,816404.0,s.6463d918-7e29-49b0-942d-f1a9faab6ad6.txt,mozilla/5.0,1621951320945.0,816404.0,8719.0,3556.0,28.0,2023.0,female,...,right,2604.0,university,6457.0,6.0,6.0,11528.0,20.0,20.0,25-34
1,221478.0,s.fa171c49-91e1-44b0-b9e3-9937dca02dca.txt,mozilla/5.0,1621951860689.0,221478.0,6663.0,14735.0,25.0,1917.0,female,...,right,3588.0,graduate school,49250.0,7.0,7.0,8375.0,60.0,60.0,18-24
2,192208.0,s.b36334f4-c3dd-4d1b-b2e4-f0bee8be7d31.txt,mozilla/5.0,1621953589918.0,192208.0,17878.0,3448.0,29.0,1691.0,male,...,right,2366.0,university,8341.0,6.0,6.0,21202.0,5.0,5.0,25-34
3,803102.0,s.35c74094-fc8a-4fc8-8ac2-3b4522be09ff.txt,mozilla/5.0,1621950896852.0,803102.0,11907.0,36996.0,47.0,2425.0,male,...,right,5373.0,university,4701.0,7.0,7.0,4018.0,74.0,74.0,45-54
4,844810.0,s.727f9e4c-2eab-4d01-8176-cd62d77d550a.txt,mozilla/5.0,1621951255736.0,844810.0,17782.0,3288.0,32.0,1905.0,female,...,right,3031.0,university,14055.0,6.0,6.0,4410.0,22.0,22.0,25-34


In [28]:
bp.describe_data()



        ------------------------------------------------------------------
            self.path            : raw data loc
            self.metadata        : mturk metadata
            self.mapping         : reference table
            self.data_times      : reference times table
            self.participants    : list of participant identifiers
            self.parti_code      : list of participant codes
            self.n               : total number of samples
            self.wcst_paths      : paths to wcst  raw data
            self.nback_paths     : paths to nback raw data
            self.corsi_paths     : paths to corsi raw data
            self.fitts_paths     : paths to fitts raw data
            self.navon_paths     : paths to navon raw data
            self.wcst_data       : wcst  dataframe
            self.nback_data      : nback dataframe
            self.corsi_data      : corsi dataframe
            self.fitts_data      : fitts dataframe
            self.navon_data    

In [96]:
null_participants = []
p = original[original['gender_a'].isnull()]
p.iloc[0,]['participant']

904550.0

In [80]:
pd.options.display.max_rows = 4000
x = np.random.choice(range(10200), 10)

bp.navon_data.iloc[x,]

Unnamed: 0,participant,participant_code,large_letter,small_letter,level_of_target,level_of_target_n,status,reaction_time_ms
2775,929907.0,s.7d1721ff-7535-4673-9864-c9548c05fcaf.txt,S,S,none,0,1,1189
5241,690573.0,s.083ae4d4-255b-488b-873c-7764dcee6253.txt,T,T,none,0,1,1550
37,816404.0,s.6463d918-7e29-49b0-942d-f1a9faab6ad6.txt,S,S,none,0,2,721
6850,414065.0,s.a5726c35-b42d-4aa9-924c-2b897c6dbb19.txt,H,H,global,2,2,2466
6963,230048.0,s.0e5bb56c-95ec-44b1-b326-e6e4bf95238f.txt,H,H,global,2,1,717
8454,573978.0,s.319f364a-2980-43b0-95bc-4af1dffac40d.txt,S,S,none,0,1,1361
4508,895200.0,s.be0eaa1c-a662-4162-bd05-38f87df95c35.txt,H,H,global,2,2,2233
73,221478.0,s.fa171c49-91e1-44b0-b9e3-9937dca02dca.txt,S,S,none,0,2,542
1193,866118.0,s.f08c5757-ac07-4a3b-bd2b-ace2ff9f7410.txt,T,T,local,1,1,595
1105,122240.0,s.99ea9504-2716-4ae3-9d5c-d2b95c65e1fe.txt,T,T,none,0,1,737
