In [134]:
import pandas as pd
import numpy as np

In [135]:
# read in both datasets
reading_history_df   = pd.read_csv("data/db/reading_history_database.csv")
assessment_scores_df = pd.read_csv("data/db/screening_assessment_scores.csv")
print("reading history matrix original dimensions:   ", reading_history_df.shape)
print("assessment scores matrix original dimensions: ", assessment_scores_df.shape)

reading history matrix original dimensions:    (455, 34)
assessment scores matrix original dimensions:  (453, 26)


In [136]:
# pre-process reading_history database

# shorten the column names for readability
shortened_cols = ["Q"+str(q) for q in range(1,34)]
shortened_cols.insert(0, "Participant")
reading_history_df.columns = shortened_cols

# delete irrelevant columns (comments entered by subjects)
del reading_history_df['Q30']
del reading_history_df['Q32']

# remove all NULL rows
reading_history_df = reading_history_df.dropna(axis=0, how='all')

# find and remove any duplicate keys ('Participant')    #df[~df.name.isin(value_list)]
rd_h = reading_history_df['Participant'].value_counts()
reading_history_duplicates = rd_h[rd_h > 1]
print("\nreading_history duplicate keys:")
print(reading_history_duplicates.sort_index())

duplicate_participants_to_remove = reading_history_duplicates.index.tolist()
reading_history_df = reading_history_df[~reading_history_df.Participant.isin(duplicate_participants_to_remove)]

# remove row that has 'Participant' number as "INCOMPLETE"
reading_history_df = reading_history_df[~reading_history_df.Participant.isin(['INCOMPLETE'])]

print("pre-processed reading history matrix dimensions: ", reading_history_df.shape)
print(reading_history_df.isnull().sum())


reading_history duplicate keys:
1161    2
2019    2
2234    2
2575    2
4744    2
5343    2
5559    2
7806    2
9010    2
Name: Participant, dtype: int64
pre-processed reading history matrix dimensions:  (433, 32)
Participant     0
Q1              0
Q2              0
Q3              0
Q4              0
Q5              0
Q6              0
Q7              0
Q8              0
Q9              0
Q10             0
Q11             0
Q12             0
Q13             0
Q14             0
Q15             0
Q16             0
Q17             0
Q18             0
Q19             0
Q20             0
Q21             0
Q22             0
Q23             0
Q24             0
Q25             0
Q26             2
Q27             1
Q28            45
Q29             0
Q31             1
Q33             1
dtype: int64


In [137]:
# pre-process assessment_scores
as_s = assessment_scores_df['Participant Number'].value_counts()
as_s = as_s[as_s > 1]

print("\nassesment_scores duplicate keys:")
print(as_s.sort_index())

assessment_scores.rename(columns={ assessment_scores.columns[0]: "Participant" }, inplace=True)


assesment_scores duplicate keys:
1161    2
2019    2
2234    2
4744    2
5343    2
5559    2
6970    2
7806    2
9010    2
9379    2
Name: Participant Number, dtype: int64


In [138]:
# merge 2 databases on 'Participant' as key into one
merged_history_and_scores = pd.merge(reading_history, assessment_scores, how='inner', on='Participant')

In [139]:
# reset 'Participant' as the index
merged_history_and_scores.set_index('Participant', inplace=True)
print("merged history and scores dimensions: ", merged_history_and_scores.shape)

merged history and scores dimensions:  (460, 56)


In [140]:
# drop rows where all of the elements are NaN 
merged_history_and_scores = merged_history_and_scores.dropna(axis=0, how='all')
#print("Number of missing values (NaN) by column:")
#print(merged_history_and_scores.isnull().sum())
#print("\nDimensions after dropping empty rows:", merged_history_and_scores.shape)

In [141]:
# Q29 cleanup
# make all lowercase and strip whitespace
print("BEFORE:")
print("all possible values in Q29: ", merged_history_and_scores.Q29.unique())
print("value counts: \n", merged_history_and_scores.Q29.value_counts())

print("\nAFTER:")
merged_history_and_scores['Q29'] = merged_history_and_scores['Q29'].apply(lambda x: x.lower().strip())

def not_sure_add_underscore(x):
    if x == 'not sure':
        return('not_sure')
    else:
        return(x)
merged_history_and_scores['Q29'] = merged_history_and_scores['Q29'].apply(not_sure_add_underscore) 
print("all possible values in Q29: ", merged_history_and_scores.Q29.unique())
print("value counts: \n", merged_history_and_scores.Q29.value_counts())

# create dummy set
new_Q29 = pd.get_dummies(merged_history_and_scores['Q29'], prefix='Q29')


BEFORE:
all possible values in Q29:  ['No' 'Yes' 'Not Sure' 'Not sure' 'No ']
value counts: 
 No          335
Yes          55
Not sure     47
Not Sure     21
No            1
Name: Q29, dtype: int64

AFTER:
all possible values in Q29:  ['no' 'yes' 'not_sure']
value counts: 
 no          336
not_sure     68
yes          55
Name: Q29, dtype: int64


In [142]:
# Q31 cleanup: as unordered (yes, no, not sure)

In [143]:
# Q33 cleanup: as ordered categories ("some college, etc.")
#print("all possible values in Q33: ")
#merged_history_and_scores.Q33.value_counts()