Browse files

first commit

  • Loading branch information...
0 parents commit e687ad787e23660dc811623d1093f2793aece2e1 @bcollier committed Mar 31, 2011
Showing with 191 additions and 0 deletions.
  1. +67 −0 datacleaning.sql
  2. +36 −0 pull_surveydata.sql
  3. +1 −0 readme.txt
  4. +87 −0 scrap_code.sql
67 datacleaning.sql
@@ -0,0 +1,67 @@
+UPDATE wpsurvey.data set filter_$ = 0;
+
+UPDATE wpsurvey.data
+ SET filter_$ = 1
+ WHERE filter_1 + filter_2 + filter_3 + filter_4 + filter_5 + filter_6 + filter_7 + filter_8 > 4
+ AND A5A = "en";
+
+UPDATE wpsurvey.data
+ SET filter_$ = 1
+ WHERE NOT A5A = "en"
+
+UPDATE wpsurvey.data
+ SET A12_1 = case when LENGTH(TRIM(A12_1)) > 0 then A12_1 else null end,
+ A12_2 = case when LENGTH(TRIM(A12_2)) > 0 then A12_2 else null end;
+
+UPDATE wpsurvey.data SET A12_1 = REPLACE(A12_1, ',', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(A12_1, '$', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(A12_1, '-', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(A12_1, ' ', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(A12_1, '*', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(A12_1, '(', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(A12_1, ')', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(A12_1, '+', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(UPPER(A12_1), 'USD', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(UPPER(A12_1), 'US', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(UPPER(A12_1), 'EUR', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(UPPER(A12_1), 'DOL', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(UPPER(A12_1), 'P', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(UPPER(A12_1), 'K', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(UPPER(A12_1), 'PES', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(UPPER(A12_1), 'Ð', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(UPPER(A12_1), 'DO', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(UPPER(A12_1), 'RM', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(UPPER(A12_1), 'RE', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(UPPER(A12_1), 'DE', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(UPPER(A12_1), 'K', '') WHERE A12_1 is not null;
+UPDATE wpsurvey.data SET A12_1 = REPLACE(UPPER(A12_1), 'Ñ', '') WHERE A12_1 is not null;
+
+UPDATE wpsurvey.data SET A12_1 = SUBSTRING(A12_1,1,LOCATE('.', A12_1) -1 ) WHERE NOT A12_1 REGEXP '^[0-9]+$' and LOCATE('.', A12_1) > 0 ;
+
+UPDATE wpsurvey.data
+ SET A12_1 = case when LENGTH(TRIM(A12_1)) > 0 then A12_1 else null end,
+ A12_2 = case when LENGTH(TRIM(A12_2)) > 0 then A12_2 else null end;
+
+UPDATE wpsurvey.data SET filter_$ = 1 WHERE NOT A12_1 REGEXP '^[0-9]+$' ;
+
+UPDATE wpsurvey.data SET A11a = -999 WHERE A11a is null and filter_$ = 0;
+UPDATE wpsurvey.data SET A12_1 = -999 WHERE A12_1 is null and filter_$ = 0;
+UPDATE wpsurvey.data SET A12_2 = -999 WHERE A12_2 is null and filter_$ = 0;
+UPDATE wpsurvey.data SET B8 = -999 WHERE B8 is null and filter_$ = 0;
+UPDATE wpsurvey.data SET B8a = -999 WHERE B8a is null and filter_$ = 0;
+UPDATE wpsurvey.data SET B9 = -999 WHERE B9 is null and filter_$ = 0;
+UPDATE wpsurvey.data SET B10 = -999 WHERE B10 is null and filter_$ = 0;
+UPDATE wpsurvey.data SET B11 = -999 WHERE B11 is null and filter_$ = 0;
+
+/* these 4 statements need to be run together to filter countries that we don't have conversion rates for */
+CREATE TEMPORARY TABLE temp_table (country VARCHAR (255));
+INSERT INTO temp_table (SELECT DISTINCT (d2.A12_2) FROM wpsurvey.data d2 WHERE NOT EXISTS (SELECT 1 FROM cur_to_USD WHERE CountryCode = A12_2) and d2.A12_2 IS NOT NULL and d2.filter_$ = 0 and not d2.A12_2 = -999);
+UPDATE wpsurvey.data SET filter_$ = 1 WHERE A12_2 IN (SELECT country from temp_table) AND filter_$ = 0;
+DROP TABLE temp_table;
+
+
+
+
+
+
+
36 pull_surveydata.sql
@@ -0,0 +1,36 @@
+SELECT filter_2 as 'occasional_editor',
+ filter_3 as 'content_author',
+ filter_4 as 'editor',
+ filter_5 as 'administrator',
+ filter_6 as 'ex-contributor',
+ case A1 when 'male' then 1 when 'female' then 0 else -1 end as 'male',
+ A2 as 'age',
+ A3_nationality as 'nationality',
+ A4_Residence as 'residence',
+ A5A as 'contrib_lang',
+ A5B as 'read_lang',
+ A7 as 'high_educ',
+ A8 as 'years_educ',
+ case A9 when 1 then 1 when 2 then 0 end as 'partner',
+ case A10 when 1 then 1 when 2 then 0 end as 'children',
+ A11 as 'employment_stat',
+ IFNULL(A11a, -999) as 'occupation',
+ /* A12_1 as 'inc_amt', */
+ /* A12_2 as 'inc_currency', */
+ case A12_1 when -999 then -999 else (SELECT (A12_1 * USDPerUnits) FROM cur_to_USD WHERE A12_2 = CountryCode) end as inc_amt_adj,
+ /* case A12_1 when -999 then -999 else (SELECT (USDPerUnits) FROM cur_to_USD WHERE A12_2 = CountryCode) end as USDPerUnits, */
+ IFNULL(A12_3, 0) as 'noincomereponse',
+ B2 as 'access_level',
+ B8 as 'wp_interaction',
+ B8a as 'wp_f2f_interaction',
+ B9 as 'art_started',
+ B10 as 'art_edited',
+ B11 as 'art_translated'
+ FROM wpsurvey.data
+ WHERE A5A = "en"
+ AND filter_1 = 0 /* filters out 21845 readers only 40079 down to 18234*/
+ AND filter_8 = 0 /* filters out 510 other non-contributors 40079 down to 39569 */
+ AND filter_$ = 0 /* used for data cleaning and filtering out non-english participants */
+ AND A1 > -1 /* gender is known filters 93 */
+ AND B2 > 1 /* filters 5154 where data access level is unregistered */
+
1 readme.txt
@@ -0,0 +1 @@
+SQL Code to clean data and pull Wikipedia General User Survey data. R code to analyze the results.
87 scrap_code.sql
@@ -0,0 +1,87 @@
+SELECT *
+ FROM wpsurvey.data
+ WHERE A5A = "en" OR A5B = "en"
+ LIMIT 1000;
+
+ SELECT count(*)
+ FROM wpsurvey.data
+ WHERE A5A = "en"
+ AND filter_1 = 0
+ AND filter_8 = 0
+ LIMIT 1000;
+
+SELECT count(*)
+ FROM wpsurvey.data
+ WHERE A5B = "en"
+ LIMIT 1000;
+
+ UPDATE wpsurvey.data
+ SET filter_$ = 0;
+
+SELECT DISTINCT A12_2
+ FROM wpsurvey.data
+ WHERE A5A = "en"
+ AND filter_1 = 0
+ AND filter_8 = 0
+ AND filter_$ = 0
+ AND A1 > -1 /* gender is known */
+
+
+/* need to get the conversion information for these countries, for now I guess we don't include them in our sample */
+ SELECT A12_2, count(id) FROM wpsurvey.data WHERE NOT EXISTS (SELECT 1 FROM cur_to_USD WHERE CountryCode = A12_2) and A12_2 IS NOT NULL and filter_$ = 0 and not A12_2 = -999 GROUP BY A12_2 HAVING count(id) > 5;
+
+ CREATE TEMPORARY TABLE temp_table (count VARCHAR (255));
+
+ INSERT INTO temp_table
+ (SELECT DISTINCT (d2.A12_2) FROM wpsurvey.data d2 WHERE NOT EXISTS (SELECT 1 FROM cur_to_USD WHERE CountryCode = A12_2) and d2.A12_2 IS NOT NULL and d2.filter_$ = 0 and not d2.A12_2 = -999);
+
+ SELECT * FROM temp_table;
+
+ /* use this block to limit to only people who provided USD non-zero income
+ AND LENGTH(TRIM(A12_2)) > 0
+ AND A12_2 = 'USD'
+ AND A12_1 > 0 */
+
+
+/* AND A5B = A5A */
+/* contribute and read the same language filters out 336 people*/
+/* LIMIT 100;
+*/
+/*TODO: Ask about nationality numeric codes (A3_nationality) */
+
+ /* DATA CLEANING RELATED SELECT STATEMENTS BELOW */
+
+
+
+SELECT B10, count(id) FROM wpsurvey.data GROUP BY B10 ORDER BY B10;
+
+
+
+ SELECT DISTINCT (d2.A12_2) FROM wpsurvey.data d2 WHERE NOT EXISTS (SELECT 1 FROM cur_to_USD WHERE CountryCode = d2.A12_2) and d2.A12_2 IS NOT NULL and d2.filter_$ = 0 and not d2.A12_2 = -999;
+
+
+SELECT A11a FROM wpsurvey.data WHERE A11a is null and filter_$ = 0;
+
+
+UPDATE wpsurvey.data
+ SET A11a = IFNULL(-999, A11a),
+ A12_1 = IFNULL(-999, A12_1),
+ A12_2 = IFNULL(-999, A12_2);
+
+
+
+
+SELECT *
+ FROM wpsurvey.data
+ WHERE filter_1 + filter_2 + filter_3 + filter_4 + filter_5 + filter_6 + filter_7 + filter_8 > 4
+ AND A5A = "en";
+
+select count(A12_1) from wpsurvey.data WHERE NOT A12_1 REGEXP '^[0-9]+$' ;
+select A12_1 from wpsurvey.data WHERE NOT A12_1 REGEXP '^[0-9]+$' ;
+SELECT A12_1 FROM wpsurvey.data where LENGTH(TRIM(A12_1)) = 0;
+
+select A12_1,A12_2, LOCATE('.', A12_1), SUBSTRING(A12_1,1,LOCATE('.', A12_1) -1 ) from wpsurvey.data WHERE NOT A12_1 REGEXP '^[0-9]+$' and LOCATE('.', A12_1) > 0 ;
+
+
+select A12_1,LOCATE('.', A12_1), SUBSTRING(A12_1,0,LENGTH(A12_1) - LOCATE('.', A12_1)) from wpsurvey.data WHERE NOT A12_1 REGEXP '^[0-9]+$' ;
+

0 comments on commit e687ad7

Please sign in to comment.