forked from aaowens/PSID.jl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
use_codebook.jl
198 lines (177 loc) · 6.89 KB
/
use_codebook.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#Transforms a string like \"This time in 1996\" to \"This time in YEAR\"
function year2year(s)
rtest = r"(19|20)\d{2}"
replace(s, rtest => "YEAR")
end
#If x or y are supersets of each other, keep the superset. Otherwise OR them
function checkerror(x, y)
x, y = year2year(x), year2year(y)
if x == y
return x
elseif x ⊆ y
return y
elseif y ⊆ x
return x
else # give up
return "$x PSIDOR $y"
end
end
dropY(s) = parse(Int, match(r"(19|20)\d{2}", s).match)
#Check if this label describes a missing value code
function checkmissing(s)
for r in (r"NA", r"DK", r"Inap.", r"Wild code", r"Missing")
occursin(r, s) && return true
end
return false
end
#Check if this is a continuous variable
function iscontinuous(k)
for key in k
out = tryparse(Float64, key)
if out === nothing
return true
end
end
return false
end
dropcomma(s) = String([c for c in s if !(c == ',')])
#Try to parse this value as a float
function parse2(s, v)
out = tryparse(Float64, dropcomma(s))
# if this isn't a Float, maybe it was a range "-89.0 - -0.4"
# TODO fix this
if out === nothing
#@show s v
return missing
else
return out
end
end
narrowtypes(A) = [a for a in A]
"""
Inputs:
name: The variable ID we want to match
var2ind_dict: The crosswalk table
df_vars: The data
codebook_df: The codebook table
fastfind: Dict mapping from variable IDs to their index in the codebook
Processes a variable ID, finds all years thats match, and collects the labels
"""
function process_varname(name, var2ind_dict, df_vars, codebook_df, fastfind)
## Find the row in the crosswalk we can find this variable in
myrow = var2ind_dict[name]
## Fetch all the names in that row
dfvar = df_vars[myrow, :]
mynames = [ r for r in dfvar if r !== missing]
## Need to figure out the variable label expansion
# Can I just take the union?
codevec = [fastfind[s] for s in values(mynames)]
codedict = [codebook_df.codedict[i] for i in codevec]
un = Dict{String, String}()
merge!(checkerror, un, codedict...)
map!(trimlabel, values(un))
varnames = Dict{String, Tuple{String, String, Vector{Float64}}}(
codebook_df.YEAR[i] => (codebook_df.NAME[i], codebook_df.LABEL[i],
codebook_df.excluding[i]) for i in codevec)
varnames, iscontinuous(keys(un)), un
end
function process_varname!(name, var2ind_dict, df_vars, codebook_df, fastfind)
## Find the row in the crosswalk we can find this variable in
myrow = var2ind_dict[name]
## Fetch all the names in that row
dfvar = df_vars[myrow, :]
mynames = [ r for r in dfvar if r !== missing]
## Need to figure out the variable label expansion
# Can I just take the union?
# Check for missing xml codebook entries.
# Construct new entries based on the specific variable reference in the user input json.
for s in values(mynames)
if s in collect(keys(fastfind))
else
@warn "Could not find $s in xml codebook. Constructing output codebook record from $name"
syear = names(dfvar)[coalesce.(collect(dfvar),"") .== s][1][2:end]
push!(codebook_df,[s,syear, codebook_df[fastfind[name],3:end]...])
push!(fastfind, s => length(codebook_df.NAME))
end
end
codevec = [fastfind[s] for s in values(mynames)]
codedict = [codebook_df.codedict[i] for i in codevec]
un = Dict{String, String}()
merge!(checkerror, un, codedict...)
map!(trimlabel, values(un))
varnames = Dict{String, Tuple{String, String, Vector{Float64}}}(
codebook_df.YEAR[i] => (codebook_df.NAME[i], codebook_df.LABEL[i],
codebook_df.excluding[i]) for i in codevec)
varnames, iscontinuous(keys(un)), un
end
"""
Sometimes the labels uses a comma in one year and a semicolon in another,
but are otherwise identical.
This function parses the different labels and drops these duplicates.
It also keeps only labels which are unique after cleaning, and constructs
a label which is a union of the parts (A OR B OR C)
"""
function trimlabel(s)
sp = strip.(split(s, "PSIDOR"))
# find common substrings
# For each string in s, check if it occurs in another string in s
# If so, drop it from s
# If not, push it to the new string list
# For each index in s
# Check if s[i] is in s \ excluded
# If so, add this index to the excluded list
clean(x) = lowercase(dropcomma(x))
setsp = Set(sp)
#cleaned = Set(clean.(sp))
excluded = Int[]
# We want to find the unique (after cleaning) labels
# Iterate through the set and check if we have seen this label before
# If not, add it to the seen list
for i in eachindex(sp)
targind = setdiff(1:length(sp), union(i, excluded))
cleaned = clean.(sp[targind])
any(clean(sp[i]) ⊆ c for c in cleaned) && push!(excluded, i)
end
newsp = sp[setdiff(1:length(sp), excluded)]
if length(newsp) == 1
return newsp[1]
else
return reduce((x, y) -> "$x OR $y", newsp[2:end], init = newsp[1])
end
end
"""
Processes input JSON file
Reads the crosswalk and codebook table from disk and
harmonizes the labels. Constructs the output JSON
"""
function process_input(inputjson)
@assert last(splitext(inputjson)) == ".json"
codebook_json = jsontable(read("output/codebook.json", String));
codebook_df = DataFrame(codebook_json);
codebook_df.codedict = [Dict(string(x) => y for (x, y) in dt) for dt in codebook_df.codedict]
#@infiltrate
crosswalk_df = DataFrame(XLSX.readtable("psid.xlsx", "MATRIX"))
crosswalk_df = mapcols(narrowtypes, crosswalk_df)
## Need a map from VAR to the right row
df_vars = crosswalk_df[!, r"^Y.+"]
var2ind_dict = Dict{String, Int}()
##
for col in eachcol(df_vars)
x = Dict(col[i] => i for i in 1:length(col) if col[i] !== missing)
merge!(checkerror, var2ind_dict, x)
end
## Need to figure out the variable label expansion
fastfind = Dict(codebook_df.NAME[i] => i for i in 1:length(codebook_df.NAME))
# Check if this label denotes a missing value code. If so, this value is an excluding value
codebook_df.excluding = [[parse2(k, v) for (k, v) in d if checkmissing(v)] |> skipmissing |> narrowtypes for d in codebook_df.codedict]
### Do the final processing of the input JSON, produce the output
read_input = JSON3.read(read(inputjson, String), Vector{VarInput})
process_varinput(v::VarInput) = VarInfo5(v.name_user, v.unit, process_varname!(v.varID, var2ind_dict, df_vars, codebook_df, fastfind)...)
procvar = process_varinput.(read_input)
write("output/user_output.json", JSON3.write(procvar))
modpath = dirname(pathof(PSID))
indpath = "$modpath/ind_input.json"
read_input = JSON3.read(read(indpath, String), Vector{VarInput})
procvar = process_varinput.(read_input)
write("output/ind_output.json", JSON3.write(procvar))
end