forked from cncf/devstats.archive
-
Notifications
You must be signed in to change notification settings - Fork 0
/
import_affs.go
363 lines (337 loc) · 10.3 KB
/
import_affs.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
package main
import (
"database/sql"
"encoding/json"
"fmt"
"os"
"regexp"
"strings"
"time"
lib "devstats"
)
// gitHubUsers is the list of GitHub user entries decoded from the
// cncf/gitdm `github_users.json` file.
type gitHubUsers []gitHubUser
// gitHubUser is a single GitHub user entry from the cncf/gitdm
// `github_users.json` JSON. importAffs groups entries by Login, so several
// entries may describe the same login.
type gitHubUser struct {
// Login is the GitHub login (JSON key "login").
Login string `json:"login"`
// Email is the user's email (JSON key "email"); importAffs passes it
// through emailDecode, which turns `!` separators into `@`.
Email string `json:"email"`
// Affiliation is the raw affiliation string (JSON key "affiliation").
// importAffs treats "NotFound", "(Unknown)" and "?" as "no affiliation" and
// otherwise parses it as "com1 < dt1, com2 < dt2, ..., comN".
Affiliation string `json:"affiliation"`
// Name is the user's name (JSON key "name"); may be empty.
Name string `json:"name"`
}
// stringSet is a set of strings: membership is expressed by key presence,
// the empty-struct values carry no data.
type stringSet map[string]struct{}
// mapStringSet maps a string key to a set of strings. importAffs keys it by
// login and uses it to collect names, emails and affiliation strings.
type mapStringSet map[string]stringSet
// mapIntArray maps a string key to a slice of ints. importAffs uses it as a
// cache from login to the actor IDs found for that login.
type mapIntArray map[string][]int
// affData holds a single affiliation record: one company that one login was
// affiliated with during one time range.
type affData struct {
// Login is the GitHub login the affiliation belongs to.
Login string
// Company is the company name parsed from the affiliation string.
Company string
// From is the start of the affiliation period.
From time.Time
// To is the end of the affiliation period.
To time.Time
}
// emailDecodeRE matches a `left!right` token where neither side contains
// whitespace or `!`. It is compiled once at package initialization instead of
// on every emailDecode call (emailDecode runs once per imported user entry).
var emailDecodeRE = regexp.MustCompile(`([^\s!]+)!([^\s!]+)`)

// emailDecode decodes emails that use `!` instead of `@`.
//
// Every `left!right` token in line is rewritten to `left@right`; text that
// does not match (including the empty string) is returned unchanged.
func emailDecode(line string) string {
	return emailDecodeRE.ReplaceAllString(line, `$1@$2`)
}
// findActor searches `gha_actors` for the given login (after passing it
// through maybeHide) and returns the matching actor with the highest ID.
// ok is false when no row matches; actor is then left at its zero value.
func findActor(db *sql.DB, ctx *lib.Ctx, login string, maybeHide func(string) string) (actor lib.Actor, ok bool) {
	hidden := maybeHide(login)
	query := fmt.Sprintf("select id, name from gha_actors where login=%s order by id desc limit 1", lib.NValue(1))
	rows := lib.QuerySQLWithErr(db, ctx, query, hidden)
	defer func() { lib.FatalOnError(rows.Close()) }()

	// The name column is scanned through a pointer so that NULL can be told
	// apart from a real value.
	var dbName *string
	for rows.Next() {
		lib.FatalOnError(rows.Scan(&actor.ID, &dbName))
		actor.Login = hidden
		if dbName != nil {
			actor.Name = *dbName
		}
		ok = true
	}
	lib.FatalOnError(rows.Err())
	return actor, ok
}
// findActorIDs returns the IDs of all `gha_actors` rows whose login equals
// the given login (after passing it through maybeHide). The result is nil
// when no actor has that login.
func findActorIDs(db *sql.DB, ctx *lib.Ctx, login string, maybeHide func(string) string) (actIDs []int) {
	hidden := maybeHide(login)
	query := fmt.Sprintf("select id from gha_actors where login=%s", lib.NValue(1))
	rows := lib.QuerySQLWithErr(db, ctx, query, hidden)
	defer func() { lib.FatalOnError(rows.Close()) }()

	for rows.Next() {
		var id int
		lib.FatalOnError(rows.Scan(&id))
		actIDs = append(actIDs, id)
	}
	lib.FatalOnError(rows.Err())
	return actIDs
}
// firstKey returns the lexicographically smallest element of strMap, or ""
// when the set is empty or nil.
//
// Go randomizes map iteration order, so returning whichever key a `range`
// yields first would make the result differ between runs for sets with more
// than one element. Choosing the smallest key keeps the import reproducible.
func firstKey(strMap stringSet) string {
	first, found := "", false
	for key := range strMap {
		if !found || key < first {
			first, found = key, true
		}
	}
	return first
}
// addActor inserts a new row into `gha_actors` for an actor that does not
// exist yet and returns its ID.
//
// The ID is computed by lib.HashStrings from the original (not hidden) login;
// the stored login and name are both passed through maybeHide first.
func addActor(con *sql.DB, ctx *lib.Ctx, login, name string, maybeHide func(string) string) int {
	hiddenLogin, hiddenName := maybeHide(login), maybeHide(name)
	actorID := lib.HashStrings([]string{login})
	lib.ExecSQLWithErr(
		con,
		ctx,
		"insert into gha_actors(id, login, name) "+lib.NValues(3),
		lib.AnyArray{actorID, hiddenLogin, hiddenName}...,
	)
	return actorID
}
// importAffs imports developer data from the cncf/gitdm `github_users.json`
// file named by jsonFN into the Postgres database.
//
// Steps, in order:
//  1. Group the JSON entries by login into sets of names, emails and
//     affiliation strings.
//  2. Login -> name: add actors missing from `gha_actors`, or update the name
//     of existing ones when it differs.
//  3. Login -> emails: insert every email into `gha_actors_emails` for every
//     actor ID having that login.
//  4. Login -> affiliations: parse "com1 < dt1, ..., comN" strings, insert
//     the companies into `gha_companies` and the date ranges into
//     `gha_actors_affiliations`.
//
// Logins, names, emails and company names are passed through the GDPR hiding
// function before being written. Errors are handed to lib.FatalOnError (or
// handled inside the lib *WithErr helpers); nothing is returned.
func importAffs(jsonFN string) {
	// Environment context parse
	var ctx lib.Ctx
	ctx.Init()

	// Connect to Postgres DB
	con := lib.PgConn(&ctx)
	defer func() { lib.FatalOnError(con.Close()) }()

	// To handle GDPR
	maybeHide := lib.MaybeHideFunc(lib.GetHidden(lib.HideCfgFile))

	// Parse github_users.json
	var users gitHubUsers
	data, err := lib.ReadFile(&ctx, jsonFN)
	if err != nil {
		lib.FatalOnError(err)
		return
	}
	lib.FatalOnError(json.Unmarshal(data, &users))

	// Process users affiliations: group names, emails and affiliation strings by login
	emptyVal := struct{}{}
	loginEmails := make(mapStringSet)
	loginNames := make(mapStringSet)
	loginAffs := make(mapStringSet)
	eNames, eEmails, eAffs := 0, 0, 0
	for _, user := range users {
		// Email decode ! --> @
		user.Email = emailDecode(user.Email)
		login := user.Login

		// Email
		email := user.Email
		if email != "" {
			if _, ok := loginEmails[login]; !ok {
				loginEmails[login] = stringSet{}
			}
			loginEmails[login][email] = emptyVal
		} else {
			eEmails++
		}

		// Name
		name := user.Name
		if name != "" {
			if _, ok := loginNames[login]; !ok {
				loginNames[login] = stringSet{}
			}
			loginNames[login][name] = emptyVal
		} else {
			eNames++
		}

		// Affiliation
		// An empty affiliation is treated like the "not found" markers (same as
		// empty names/emails above); otherwise it would be parsed into a
		// company with an empty name.
		aff := user.Affiliation
		if aff != "" && aff != "NotFound" && aff != "(Unknown)" && aff != "?" {
			if _, ok := loginAffs[login]; !ok {
				loginAffs[login] = stringSet{}
			}
			loginAffs[login][aff] = emptyVal
		} else {
			eAffs++
		}
	}
	lib.Printf(
		"Processing non-empty: %d names, %d emails lists and %d affiliations lists\n",
		len(loginNames), len(loginEmails), len(loginAffs),
	)
	lib.Printf("Empty/Not found: names: %d, emails: %d, affiliations: %d\n", eNames, eEmails, eAffs)

	// Login - Names should be 1:1
	added, updated := 0, 0
	for login, names := range loginNames {
		if len(names) > 1 {
			lib.Printf("Warning: login has multiple names: %v: %+v\n", login, names)
			//lib.Fatalf("login has multiple names: %v: %+v", login, names)
		}
		name := firstKey(names)
		// Try to find actor by login
		actor, ok := findActor(con, &ctx, login, maybeHide)
		if !ok {
			// If no such actor, add with artificial ID (just like data from pre-2015)
			addActor(con, &ctx, login, name, maybeHide)
			added++
		} else if name != actor.Name {
			// If actor found, but with different name (actually with name == "" after standard GHA import), update name
			// Because there can be the same actor (by login) with different IDs (pre-2015 and post 2015), update the name
			// for all records with this login
			lib.ExecSQLWithErr(con, &ctx,
				"update gha_actors set name="+lib.NValue(1)+" where login="+lib.NValue(2),
				lib.AnyArray{maybeHide(name), maybeHide(login)}...,
			)
			updated++
		}
	}
	lib.Printf("%d non-empty names, added actors: %d, updated actors: %d\n", len(loginNames), added, updated)

	// Login - Email(s) 1:N
	cacheActIDs := make(mapIntArray)
	// `added` is reset here: it now counts actors added by the emails pass only
	added, allEmails := 0, 0
	for login, emails := range loginEmails {
		actIDs := findActorIDs(con, &ctx, login, maybeHide)
		if len(actIDs) < 1 {
			// Can happen if user has a GitHub login but name = "" or null
			// In that case the previous loop over loginNames didn't add such user
			actIDs = append(actIDs, addActor(con, &ctx, login, "", maybeHide))
			added++
		}
		// Store given login's actor IDs in the cache
		cacheActIDs[login] = actIDs
		for email := range emails {
			// One actor can have multiple emails but...
			// One email can also belong to multiple actors
			// This happens when actor was first defined in pre-2015 era (so they have a negative ID then)
			// And then in new API era 2015+ that actor was active too (so they will
			// have an entry with valid GitHub actor_id > 0)
			for _, aid := range actIDs {
				lib.ExecSQLWithErr(con, &ctx,
					lib.InsertIgnore("into gha_actors_emails(actor_id, email) "+lib.NValues(2)),
					lib.AnyArray{aid, maybeHide(email)}...,
				)
				allEmails++
			}
		}
	}
	lib.Printf("%d emails lists, added actors: %d, all emails: %d\n", len(loginEmails), added, allEmails)

	// Login - Affiliation should be 1:1, but it is sometimes 1:2 or 1:3
	// There are some ambiguous affiliations in github_users.json
	// For such cases we pick the definition that lists the most companies
	// and, when several list the same number, the lexicographically smallest one
	unique, nonUnique, allAffs := 0, 0, 0
	defaultStartDate := time.Date(1970, 1, 1, 0, 0, 0, 0, time.UTC)
	defaultEndDate := time.Date(2099, 1, 1, 0, 0, 0, 0, time.UTC)
	companies := make(stringSet)
	var affList []affData
	for login, affs := range loginAffs {
		var affsAry []string
		if len(affs) > 1 {
			// This login has different affiliation definitions in the input JSON.
			// Map iteration order is randomized, so the tie between definitions
			// with the same number of companies is broken by string comparison
			// to make the choice reproducible between runs.
			best, bestNum := "", 0
			for aff := range affs {
				num := len(strings.Split(aff, ", "))
				if num > bestNum || (num == bestNum && aff < best) {
					best, bestNum = aff, num
				}
			}
			affsAry = strings.Split(best, ", ")
			// Count this as non-unique
			nonUnique++
		} else {
			// This is a good definition, only one list of companies affiliation for this GitHub user login
			affsAry = strings.Split(firstKey(affs), ", ")
			unique++
		}
		// Affiliation has a form "com1 < dt1, com2 < dt2, ..., com(N-1) < dt(N-1), comN"
		// We have array of companies affiliation with eventual end date: array item is:
		// "company name" or "company name < date", lets iterate and parse it
		// Each range starts where the previous one ended
		prevDate := defaultStartDate
		for _, aff := range affsAry {
			var dtFrom, dtTo time.Time
			ary := strings.Split(aff, " < ")
			company := strings.TrimSpace(ary[0])
			if len(ary) > 1 {
				// "company < date" form
				dtFrom = prevDate
				dtTo = lib.TimeParseAny(ary[1])
			} else {
				// "company" form
				dtFrom = prevDate
				dtTo = defaultEndDate
			}
			companies[company] = emptyVal
			affList = append(affList, affData{Login: login, Company: company, From: dtFrom, To: dtTo})
			prevDate = dtTo
			allAffs++
		}
	}
	lib.Printf(
		"%d affiliations, unique: %d, non-unique: %d, all user-company connections: %d\n",
		len(loginAffs), unique, nonUnique, allAffs,
	)

	// Add companies
	for company := range companies {
		lib.ExecSQLWithErr(con, &ctx,
			lib.InsertIgnore("into gha_companies(name) "+lib.NValues(1)),
			lib.AnyArray{maybeHide(company)}...,
		)
	}
	lib.Printf("Processed %d companies\n", len(companies))

	// Add affiliations
	// `added` is reset again: it now counts actors added by the affiliations pass only
	added, cached, nonCached := 0, 0, 0
	for _, aff := range affList {
		login := aff.Login
		// Check if we have that actor IDs cached
		actIDs, ok := cacheActIDs[login]
		if !ok {
			actIDs = findActorIDs(con, &ctx, login, maybeHide)
			if len(actIDs) < 1 {
				// Can happen if user has a GitHub login but email = "" or null
				// In that case the previous loop over loginEmails didn't add such user
				actIDs = append(actIDs, addActor(con, &ctx, login, "", maybeHide))
				added++
			}
			cacheActIDs[login] = actIDs
			nonCached++
		} else {
			cached++
		}
		company := aff.Company
		dtFrom := aff.From
		dtTo := aff.To
		for _, aid := range actIDs {
			lib.ExecSQLWithErr(con, &ctx,
				lib.InsertIgnore(
					"into gha_actors_affiliations(actor_id, company_name, dt_from, dt_to) "+lib.NValues(4)),
				lib.AnyArray{aid, maybeHide(company), dtFrom, dtTo}...,
			)
		}
	}
	lib.Printf(
		"Processed %d affiliations, added %d actors, cache hit: %d, miss: %d\n",
		len(affList), added, cached, nonCached,
	)
}
// main expects the path to the affiliations JSON file as the first command
// line argument, imports it and prints how long the import took. Without the
// argument it prints a usage message and exits with status 1.
func main() {
	started := time.Now()
	if len(os.Args) < 2 {
		lib.Printf("Required argument: filename.json\n")
		os.Exit(1)
	}
	jsonFN := os.Args[1]
	importAffs(jsonFN)
	lib.Printf("Time: %v\n", time.Since(started))
}