In [165]:
import pandas as pd
import numpy as np

In [166]:
# Set to True to enable the `if DEBUG:` diagnostic prints used in later cells.
DEBUG = False

# Load the two CSV sources into DataFrames.
reading_history_df = pd.read_csv("data/db/reading_history_database.csv")
assessment_scores_df = pd.read_csv("data/db/screening_assessment_scores.csv")

# Report the raw (rows, columns) shape of each frame before any cleaning.
for _label, _frame in (
    ("reading history matrix original dimensions:   ", reading_history_df),
    ("assessment scores matrix original dimensions: ", assessment_scores_df),
):
    print(_label, _frame.shape)

reading history matrix original dimensions:    (455, 34)
assessment scores matrix original dimensions:  (453, 26)


# pre-process reading_history database

In [167]:
# ----- shorten the column names for readability
# First column becomes "Participant"; the remaining 33 columns become Q1..Q33.
shortened_cols = ["Participant"] + ["Q{}".format(q) for q in range(1, 34)]
reading_history_df.columns = shortened_cols

In [168]:
# ------ drop the irrelevant columns (comments entered by subjects)
# Both columns are removed from the existing frame in a single call.
reading_history_df.drop(['Q30', 'Q32'], axis=1, inplace=True)

In [169]:
# ------- discard rows in which every column is NULL
# (rows with at least one non-null value are kept)
reading_history_df = reading_history_df.dropna(how='all', axis='index')

In [170]:
# ------- find and remove any duplicate keys ('Participant')
# Count how often each participant ID occurs and keep only the IDs seen more
# than once.
rd_h = reading_history_df['Participant'].value_counts()
reading_history_duplicates = rd_h.loc[rd_h.gt(1)]
if DEBUG:
    print("\nreading_history duplicate keys:")
    print(reading_history_duplicates.sort_index())

# Every row carrying a repeated ID is dropped -- including its first
# occurrence -- so no copy of a duplicated participant survives.
duplicate_participants_to_remove = reading_history_duplicates.index.tolist()
is_duplicated_key = reading_history_df['Participant'].isin(duplicate_participants_to_remove)
reading_history_df = reading_history_df.loc[~is_duplicated_key]

In [171]:
# ------- drop any row whose 'Participant' value is the literal "INCOMPLETE"
has_incomplete_key = reading_history_df['Participant'].isin(['INCOMPLETE'])
reading_history_df = reading_history_df.loc[~has_incomplete_key]

In [172]:
# ------- Q26 cleanup (fill in NULL values)

In [173]:
# ------- Q27 cleanup (fill in NULL values)

In [174]:
# ------- Q28 cleanup (fill in NULL values)

In [175]:
# ------- Q29 cleanup (make all lowercase and strip whitespace)
if DEBUG:
    print("BEFORE:")
    print("all possible values in Q29: ", reading_history_df.Q29.unique())
    print("value counts: \n", reading_history_df.Q29.value_counts())

    print("\nAFTER:")

# Normalise the answers with the vectorised .str accessor. Unlike a
# per-element `x.lower().strip()` lambda, this leaves missing values (NaN)
# untouched instead of raising AttributeError on them.
reading_history_df['Q29'] = reading_history_df['Q29'].str.lower().str.strip()

def add_underscore_to_not_sure(x):
    """Return 'not_sure' when x equals 'not sure'; otherwise return x unchanged."""
    return 'not_sure' if x == 'not sure' else x
# Rewrite the 'not sure' answer as the single token 'not_sure'.
reading_history_df['Q29'] = reading_history_df['Q29'].map(add_underscore_to_not_sure)
if DEBUG:
    print("all possible values in Q29: ", reading_history_df.Q29.unique())
    print("value counts: \n", reading_history_df.Q29.value_counts())

# create dummy set: one indicator column per distinct Q29 value, each named
# with a 'Q29' prefix. The result is stored in new_Q29 only; this cell does
# not attach it to reading_history_df.
new_Q29 = pd.get_dummies(reading_history_df.Q29, prefix='Q29')

print("pre-processed reading history matrix dimensions: ", reading_history_df.shape)
if DEBUG:
    print("\nNULL values: \n", reading_history_df.isnull().sum())


pre-processed reading history matrix dimensions:  (433, 32)


In [176]:
# ------- Q31 cleanup (make all lowercase and strip whitespace)

# fill in NULL value with most frequent
# value_counts() orders by descending frequency and ignores NaN by default,
# so the first index entry is the most common answer.
most_frequent_value_in_Q31 = reading_history_df['Q31'].value_counts().index[0]
# Assign the filled column back to the frame. Calling
# `.fillna(..., inplace=True)` on the column accessor is a chained in-place
# update: it may act on a temporary copy and leave the frame unchanged
# (and it warns on recent pandas versions).
reading_history_df['Q31'] = reading_history_df['Q31'].fillna(most_frequent_value_in_Q31)

if DEBUG: print("NULL values: \n", reading_history_df.isnull().sum())

# lower case and strip whitespace (vectorised string methods)
reading_history_df['Q31'] = reading_history_df['Q31'].str.lower().str.strip()

In [177]:
# ------- Q33 cleanup

In [178]:
#reading_history_df.head(50)

# pre-process assessment_scores

In [179]:
# pre-process assessment_scores
# Report participant IDs that occur on more than one row.
# The key column is looked up under its original name first; if this cell has
# already been run, the rename at the bottom has replaced that name, so fall
# back to "Participant" instead of failing with a KeyError on re-execution.
participant_col = 'Participant Number'
if participant_col not in assessment_scores_df.columns:
    participant_col = 'Participant'
as_s = assessment_scores_df[participant_col].value_counts()
as_s = as_s[as_s > 1]

print("\nassesment_scores duplicate keys:")
print(as_s.sort_index())

# NOTE(review): these duplicates are only reported here, whereas duplicate
# keys in reading_history_df are removed -- confirm whether they should also
# be dropped before the two frames are merged on 'Participant'.

# Rename the first column to "Participant" so both frames share the same key
# name. Re-binding (rather than renaming in place) keeps the cell re-runnable.
assessment_scores_df = assessment_scores_df.rename(
    columns={assessment_scores_df.columns[0]: "Participant"}
)


assesment_scores duplicate keys:
1161    2
2019    2
2234    2
4744    2
5343    2
5559    2
6970    2
7806    2
9010    2
9379    2
Name: Participant Number, dtype: int64


In [180]:
# Combine the two frames on the shared 'Participant' key. The inner join keeps
# only participants that appear in both frames.
merged_history_and_scores = reading_history_df.merge(
    assessment_scores_df,
    on='Participant',
    how='inner',
)

In [181]:
# Use 'Participant' as the row index (it is removed from the regular columns).
merged_history_and_scores = merged_history_and_scores.set_index('Participant')
print("merged history and scores dimensions: ", merged_history_and_scores.shape)

merged history and scores dimensions:  (425, 56)


In [182]:
# Remove merged rows that contain nothing but NaN; rows with at least one
# value are kept.
merged_history_and_scores = merged_history_and_scores.dropna(how='all', axis='index')
# Optional diagnostics (left disabled):
#print("Number of missing values (NaN) by column:")
#print(merged_history_and_scores.isnull().sum())
#print("\nDimensions after dropping empty rows:", merged_history_and_scores.shape)

In [183]:
# Q31 cleanup: as unordered (yes, no, not sure)

In [184]:
# Q33 cleanup: as ordered categories ("some college, etc.")
#print("all possible values in Q33: ")
#merged_history_and_scores.Q33.value_counts()