Permalink
Newer
Older
100755 482 lines (403 sloc) 12.7 KB
1
#!/usr/bin/env perl
2
3
use strict;
4
use warnings;
5
6
use 5.010;
7
8
use IPC::System::Simple;
9
use autodie qw(:all);
10
11
use LWP::UserAgent;
12
use List::Util qw(sum max);
13
use List::MoreUtils qw(uniq);
@berekuk
Dec 7, 2012
14
use Term::ANSIColor qw(:constants);
15
local $Term::ANSIColor::AUTORESET = 1;
17
use JSON;
19
my @reasons = (
20
'Writing the code the way I want',
21
'Making the world a better place',
22
'Moral satisfaction',
23
'Improving the software I use',
24
'Learning',
25
'Creativity',
26
'Socializing',
27
'Collaborating with other people',
28
'Getting feedback',
29
'Getting praise',
30
'Building a career',
31
'Making money',
32
'Job requirement',
33
);
34
35
my @motivation_levels = (
@berekuk
Dec 9, 2012
36
"It discourages me",
37
"N/A",
38
"Doesn't apply",
39
"Don't care",
40
"Weakly motivating",
41
"Motivating",
42
"Strongly motivating",
43
);
44
45
my $result = {
46
motivation_levels => \@motivation_levels,
47
reasons => \@reasons,
48
q2a => {
49
(map { $_ => \@motivation_levels } @reasons),
50
'How often do you contribute to the open source, on average? (Coding, reporting bugs, writing blog posts all counts.)' => [
51
'Almost daily',
52
'Several times per week',
53
'Several times per month',
54
'Several times per year',
55
'Even less often',
56
'N/A',
57
],
58
'Would you like to contribute more?' => [
59
'Yes',
60
'No',
61
"I'd like to contribute less",
62
'N/A',
63
],
64
'Given the amount of free time you have now, would you contribute more if the environment was perfectly friendly, perfectly rewarding, and you knew that your actions make a great impact?' => [
65
'No',
66
'Probably',
67
'Surely',
68
'N/A',
69
],
70
'How long have you been involved in the open source community?' => [
71
"Don't remember / Don't want to answer",
72
"less than 1 year",
73
"1-3 years",
74
"4-6 years",
75
"7-10 years",
76
"more than 10 years",
77
'N/A',
78
],
79
}
80
};
81
82
my $entries;
83
my @questions;
84
85
sub fetch_data {
86
my $data_file = 'data';
87
if ($ENV{REFETCH}) {
88
# you may notice entry 56 is missing
89
# that's ok, I manually removed it - it was double-posted
90
my $response = LWP::UserAgent->new->get('http://berekuk.wufoo.eu/export/report/perl-motivation-raw-data.txt');
91
die $response->status_line unless $response->is_success;
92
open my $fh, '>', $data_file;
93
print {$fh} $response->content;
94
close $fh;
97
return qx(cat $data_file);
98
}
99
100
sub cleanup_data {
101
my $entries = shift;
103
for my $question (
104
# these fields can't be quantified
105
'List any other possible reasons for your participation:',
106
"Anything else you'd like to share on the topic of this survey:",
107
"CPAN ID, if you have one and you're ok with sharing it:",
108
# technical fields from wufoo
109
"Entry Id",
110
"Date Created",
111
"Last Page Accessed",
112
"Completion Status",
113
) {
114
for my $entry (@$entries) {
115
delete $entry->{$question};
117
}
118
}
119
120
sub load_data {
121
my $content = fetch_data();
122
123
my @lines = split /\n/, $content;
124
s/\r$// for @lines;
125
126
my $columns_line = shift @lines;
127
my @columns = split /\t/, $columns_line;
128
129
my $entries = [];
130
for my $line (@lines) {
131
my @values = split /\t/, $line;
132
133
my $entry = {};
134
for my $cid (0 .. $#columns) {
135
my $value = $values[$cid];
136
$value = 'N/A' if $value eq '';
137
138
$entry->{$columns[$cid]} = $value;
140
push @$entries, $entry;
143
cleanup_data($entries);
144
@questions = grep { defined $entries->[0]{$_} } @columns;
146
return $entries;
149
sub print_histogram {
150
my ($entries, $question) = @_;
152
my @options = @{ $result->{q2a}{$question} };
153
154
my %stat = map { $_ => 0 } @options;
155
for my $entry (@$entries) {
156
my $answer = $entry->{$question};
157
die "Unexpected answer '$answer'" unless defined $stat{$answer};
158
$stat{$answer}++;
159
}
160
161
$stat{"No answer"} = delete $stat{""} if $stat{""};
162
163
my $max_length = max(map { length } @options);
164
my $max_ascii_width = 20;
165
166
$result->{histogram}{$question} = \%stat;
167
168
for (@options) {
169
my $key = $_.(' ' x ($max_length - length($_)));
170
my $value = $stat{$_}.(' ' x (3 - length($stat{$_})));
171
my $ascii = '#' x int($max_ascii_width * $value / scalar @$entries);
172
$ascii .= ' ' x ($max_ascii_width - length $ascii);
173
$ascii = "|$ascii|";
174
say "$key $value $ascii";
175
}
178
sub slice {
179
my ($entries, $question, $slice_answers) = @_;
180
181
my %seen_answers;
183
my $slice = [];
184
for my $entry (@$entries) {
185
my $answer = $entry->{$question};
186
push @$slice, $entry if grep { $answer eq $_ } @$slice_answers;
187
$seen_answers{$answer}++;
188
}
189
190
# sanity check - check that each slice_answer is seen at least once - that's true, afaik
191
for my $slice_answer (@$slice_answers) {
192
die "No answer '$slice_answer' is seen for the question '$question'" unless $seen_answers{$slice_answer};
196
}
197
198
sub motiweight {
199
my ($level) = @_;
200
if ($level eq 'Weakly motivating') {
201
return 1;
202
}
203
if ($level eq "Motivating") {
204
return 2;
205
}
206
if ($level eq "Strongly motivating") {
207
return 3;
208
}
209
if ($level eq "It discourages me") {
211
}
212
213
# sanity check
214
unless (grep { $level eq $_ } ("Don't care", "Doesn't apply", "N/A")) {
215
die "Unexpected motivation level '$level'";
216
}
217
return 0;
218
}
219
220
sub all_motiweights {
221
my ($entries, $reason) = @_;
222
return map { motiweight($_->{$reason}) } @$entries;
223
}
224
225
sub average_motiweight {
226
my ($entries, $reason) = @_;
227
228
my @motiweights = all_motiweights($entries, $reason);
229
return sum(@motiweights) / scalar(@motiweights);
230
}
231
232
# run Rscript and get its output
233
sub call_r {
234
my ($command) = @_;
235
236
my $output = qx(Rscript -e '$command'); # FIXME - quote $command
237
if ($?) {
238
die "Rscript failed: $?";
240
return $output;
243
# get c(...) code for Rscript
244
sub r_cstring {
245
my @values = @_;
246
return 'c('.join(',', @values).')';
247
}
248
249
250
sub compare_slice_reasons {
251
my ($first, $second) = @_;
252
253
my $significant_level = 0.1;
254
say "Statistically significant differences (p < $significant_level):";
255
256
my $total = 0;
257
my @result;
258
for my $reason (@reasons) {
259
260
my ($avg_first, $avg_second) = map {
261
sprintf("%.4f", average_motiweight($_, $reason))
262
} ($first, $second);
263
264
my ($c_first, $c_second) = map {
265
r_cstring(all_motiweights($_, $reason))
266
} ($first, $second);
267
268
# Welch's t test - check whether there's a significant difference between answers for two slices
269
# see http://en.wikipedia.org/wiki/Welch%27s_t_test and http://stat.ethz.ch/R-manual/R-patched/library/stats/html/t.test.html for details
270
my $pvalue = call_r("cat(t.test($c_first, $c_second)\$p.value)");
271
my $confidence = sprintf "%.4f", 1 - $pvalue;
272
273
if ($pvalue < $significant_level) {
274
say "* $reason ($avg_first vs $avg_second); confidence=$confidence";
275
push @result, {
276
reason => $reason,
277
averages => [$avg_first, $avg_second],
278
confidence => $confidence,
282
unless (@result) {
283
say "* none";
284
}
285
return @result;
288
sub print_all_histograms {
290
for my $question (@questions) {
291
say GREEN "=== $question ===";
292
print_histogram($entries, $question);
293
say '';
294
}
297
sub compare_by_question {
298
my ($entries, $question, $first_set, $second_set) = @_;
299
300
my ($first_slice, $second_slice) = map {
301
slice(
302
$entries,
303
$question => $_
304
);
305
} ($first_set, $second_set);
306
@berekuk
Dec 7, 2012
307
say GREEN "Comparing the reasons by how people answer the '$question' question";
308
say "First group: [", join('; ', @$first_set), "], total: ", scalar(@$first_slice);
309
say "Second group: [", join('; ', @$second_set), "], total: ", scalar(@$second_slice);
310
311
my @compare_result = compare_slice_reasons($first_slice, $second_slice);
312
push @{ $result->{compares} }, {
313
question => $question,
314
sets => [$first_set, $second_set],
315
group_sizes => [ map { scalar @$_ } ($first_slice, $second_slice) ],
316
result => \@compare_result,
317
};
318
say '';
319
}
320
321
sub print_slice_comparisons {
322
my ($entries) = @_;
324
compare_by_question(
326
'How long have you been involved in the open source community?',
327
[
328
'less than 1 year',
329
'1-3 years',
330
'4-6 years',
331
],
332
[
333
# ignoring people in the middle to increase the gap and get more statistically significant results
334
# '7-10 years',
335
'more than 10 years',
338
339
compare_by_question(
341
'How often do you contribute to the open source, on average? (Coding, reporting bugs, writing blog posts all counts.)',
342
[
343
'Almost daily',
344
'Several times per week',
345
],
346
[
347
# 'Several times per month',
348
'Several times per year',
349
'Even less often',
353
compare_by_question(
354
$entries,
355
'Would you like to contribute more?',
356
[
357
'Yes',
358
],
359
[
360
'No',
361
"I'd like to contribute less",
362
]
363
);
366
sub print_correlations {
367
my ($entries) = @_;
368
369
my $significant_level = 0.3;
@berekuk
Dec 7, 2012
370
say GREEN "==== Correlations ====";
371
for my $i (0 .. $#reasons) {
372
for my $j ($i + 1 .. $#reasons) {
373
my ($reason1, $reason2) = ($reasons[$i], $reasons[$j]);
374
my ($c1, $c2) = map {
375
r_cstring(all_motiweights($entries, $_))
376
} ($reason1, $reason2);
377
378
my $cor = call_r("cat(cor($c1, $c2))");
379
next unless $cor > $significant_level;
380
say "Correlation($reason1, $reason2): $cor";
381
}
382
}
383
say '';
384
}
385
386
sub print_dominations {
387
my ($entries) = @_;
388
389
my $significant_level = 5;
@berekuk
Dec 7, 2012
390
say GREEN "==== Dominations ====";
391
say "Displaying the reason pairs where the number of responders for which the first reason is more important than the second";
392
say "is at least $significant_level times more than the number of responders for which the second reason is more important";
393
say "Legend: [number of resonders with a>b, with a=b, with a<b]";
394
say '';
395
for my $i (0 .. $#reasons) {
396
for my $j ($i + 1 .. $#reasons) {
397
my ($reason1, $reason2) = ($reasons[$i], $reasons[$j]);
398
399
my ($gt, $lt, $eq) = (0, 0, 0);
400
for my $entry (@$entries) {
401
my ($m1, $m2) = map { motiweight($entry->{$_}) } ($reason1, $reason2);
402
my $cmp = $m1 <=> $m2;
403
$gt++ if $m1 > $m2;
404
$lt++ if $m1 < $m2;
405
$eq++ if $m1 == $m2;
406
}
407
408
if ($gt < $lt) {
409
($gt, $lt) = ($lt, $gt);
410
($reason1, $reason2) = ($reason2, $reason1);
411
}
412
413
if (
414
($gt / $lt) > $significant_level
415
) {
416
say "[$gt, $eq, $lt] $reason1 > $reason2";
417
push @{$result->{dominations}}, {
418
gt => $gt,
419
eq => $eq,
420
lt => $lt,
421
left => $reason1,
422
right => $reason2,
423
};
424
}
425
}
426
}
427
}
428
429
sub print_txt {
430
my ($type) = @_;
431
open my $fh, '<', $type;
432
while (my $reason = <$fh>) {
433
chomp $reason;
434
$reason =~ s/\\n/\n/g;
435
say $reason;
436
say "---";
437
438
# so far all links in comments (all 2 of them) have been at the end of line, so we can be greedy
439
$reason =~ s{(http://.*)}{<a href="$1">$1</a>}g;
440
441
# we wrap reasons in <pre>, so this is not necessary
442
$reason =~ s/\n/<br>/g;
443
444
push @{ $result->{$type} }, $reason;
445
}
446
close $fh;
447
}
448
449
sub print_other_reasons {
450
say GREEN "==== Other reasons for participation ====";
451
print_txt('other-reasons');
452
}
453
454
sub print_comments {
455
say GREEN "==== Comments ====";
456
print_txt('comments');
457
}
458
459
sub generate_json {
460
my $json = JSON->new->utf8->pretty->encode($result);
461
open my $fh, '>', 'results.json';
462
print {$fh} "var r = $json;";
463
close $fh;
464
}
465
466
sub main {
467
my $entries = load_data();
468
$result->{entries} = $entries;
470
print_all_histograms($entries);
471
print_slice_comparisons($entries);
472
print_correlations($entries);
473
print_dominations($entries);
474
475
print_other_reasons();
476
print_comments();
477
478
generate_json();
481
main unless caller;