In [92]:
import pandas as pd
import numpy as np

In [93]:
# Load the two source tables from disk and report their shapes.
reading_history   = pd.read_csv("data/db/reading_history_database.csv")
assessment_scores = pd.read_csv("data/db/screening_assessment_scores.csv")
print("reading history matrix dimensions:   ", reading_history.shape)
print("assessment scores matrix dimensions: ", assessment_scores.shape)

# Find duplicated keys: count occurrences of each 'Participant Number'
# and keep only the keys that occur more than once.
rd_h = reading_history['Participant Number'].value_counts().loc[lambda counts: counts > 1]
as_s = assessment_scores['Participant Number'].value_counts().loc[lambda counts: counts > 1]

# Show the duplicated keys of each table, ordered by participant number.
print("\nreading_history duplicate keys:")
print(rd_h.sort_index())
print("\nassesment_scores duplicate keys:")
print(as_s.sort_index())

reading history matrix dimensions:    (455, 34)
assessment scores matrix dimensions:  (453, 26)

reading_history duplicate keys:
1161    2
2019    2
2234    2
2575    2
4744    2
5343    2
5559    2
7806    2
9010    2
Name: Participant Number, dtype: int64

assesment_scores duplicate keys:
1161    2
2019    2
2234    2
4744    2
5343    2
5559    2
6970    2
7806    2
9010    2
9379    2
Name: Participant Number, dtype: int64


In [94]:
# Swap the long survey headers for short labels: the first column becomes
# "Participant" and the following 33 columns become "Q1" .. "Q33".
shortened_cols = ["Participant"] + ["Q{}".format(q) for q in range(1, 34)]
reading_history.columns = shortened_cols

In [95]:
# Rename the first column of the assessment table to "Participant" so it
# matches the key column name used in reading_history.
id_column = assessment_scores.columns[0]
assessment_scores.rename(columns={id_column: "Participant"}, inplace=True)

In [96]:
# Combine the two tables into one, keeping only participants present in both
# (inner join on the shared 'Participant' key).
# NOTE(review): the duplicate-key check earlier in the notebook reports repeated
# participant numbers in both tables, so this join emits one row per matching
# pair and can return more rows than either input -- confirm this is intended
# or de-duplicate before merging.
merged_history_and_scores = reading_history.merge(assessment_scores, how='inner', on='Participant')

In [97]:
# Make 'Participant' the row index of the merged frame, then report its shape.
merged_history_and_scores = merged_history_and_scores.set_index('Participant')
print("merged history and scores dimensions: ", merged_history_and_scores.shape)

merged history and scores dimensions:  (472, 58)


In [98]:
# Remove the irrelevant columns (comments entered by subjects) from the
# merged frame, one column at a time.
for comment_col in ('Q30', 'Q32'):
    del merged_history_and_scores[comment_col]

In [99]:
# Discard rows in which every column is NaN; rows with at least one value are kept.
merged_history_and_scores = merged_history_and_scores.dropna(axis='index', how='all')
# Optional diagnostics (disabled):
#print("Number of missing values (NaN) by column:")
#print(merged_history_and_scores.isnull().sum())
#print("\nDimensions after dropping empty rows:", merged_history_and_scores.shape)

In [100]:
# Q29 cleanup
# Normalise the answers: make them all lowercase and strip surrounding whitespace.
print("BEFORE:")
print("all possible values in Q29: ", merged_history_and_scores.Q29.unique())
print("value counts: \n", merged_history_and_scores.Q29.value_counts())

print("\nAFTER:")
# Use the vectorised .str accessor instead of a per-element lambda: a missing
# answer (NaN) is passed through unchanged rather than raising AttributeError
# when `.lower()` is called on a float. The earlier dropna(how='all') only
# removes rows that are entirely empty, so Q29 itself may still contain NaN.
merged_history_and_scores['Q29'] = merged_history_and_scores['Q29'].str.lower().str.strip()

def not_sure_add_underscore(x):
    """Return 'not_sure' when x equals 'not sure'; otherwise return x unchanged."""
    return 'not_sure' if x == 'not sure' else x
# Replace the value 'not sure' with 'not_sure' in every Q29 entry.
merged_history_and_scores['Q29'] = merged_history_and_scores['Q29'].map(not_sure_add_underscore)
print("all possible values in Q29: ", merged_history_and_scores['Q29'].unique())
print("value counts: \n", merged_history_and_scores['Q29'].value_counts())

# Build indicator (dummy) columns for Q29, each prefixed with 'Q29'.
new_Q29 = pd.get_dummies(merged_history_and_scores['Q29'], prefix='Q29')


BEFORE:
all possible values in Q29:  ['No' 'Yes' 'Not Sure' 'Not sure' 'No ']
value counts: 
 No          335
Yes          55
Not sure     47
Not Sure     21
No            1
Name: Q29, dtype: int64

AFTER:
all possible values in Q29:  ['no' 'yes' 'not_sure']
value counts: 
 no          336
not_sure     68
yes          55
Name: Q29, dtype: int64


In [101]:
# TODO: Q31 cleanup -- treat as an unordered categorical (yes, no, not sure)

In [102]:
# TODO: Q33 cleanup -- treat as ordered categories (e.g. "some college", etc.)
#print("all possible values in Q33: ")
#merged_history_and_scores.Q33.value_counts()