# Cleaning and exploring data

In [43]:
import pandas as pd
from glob import glob

import xavy.explore as xe
import xavy.dataframes as xd

## Load data

In [None]:
filenames = glob('../dados/brutos/w3c-wg-members_*.csv')
raw_members_df = pd.concat([pd.read_csv(f) for f in filenames])

## Explore raw data

**Findings**
1. Organization name and URL are biunivocal
2. The only organization with no URL is W3C or its invited experts
3. There are people affiliated to different organizations
4. Invited experts may come from companies
5. E-mails identify people, but people can have more than one email:
    1. All members have a listed email.
    2. Ignoring people with no picture, all pictures are biunivocal to emails
    3. There are no emails with more than one name associated to it.
    4. **There are people with more than one email**

### Basic stuff

In [10]:
xe.checkMissing(raw_members_df)

[1mColunas com valores faltantes:[0m
    coluna       N      %
2  org_url   406.0  14.69
4   github  1029.0  37.24


In [8]:
xe.mapUnique(raw_members_df)


[1mname: [0m1851 unique values.
[1m(sample) [0mAiring Deng,  Alex Turner,  Andy Wingo,  Anthony Joseph Castillo,  Ashwin Balasubramaniyan,  Etienne Segonzac,  Ganesh Annan,  Jacob Rossi,  Jeff Owenson,  Jer Noble,  Keith Winstein,  Leigh Garner,  Marcos Caceres,  Masato Ito,  Max Hata,  Philip Eliasson,  Philipp-Alexander Blum,  Roderick Sheeter,  Rolf Lindemann,  Tiffany Burtin

[1morg_name: [0m230 unique values.
[1m(sample) [0mACCESS CO., LTD.,  Amazon,  AudioEye, Inc.,  Defense Information Systems Agency,  Entersekt,  Fastly,  Fondazione LIA,  Fraunhofer Gesellschaft,  J. Paul Getty Trust,  Mavennet Systems Inc.,  Natural Resources Canada,  OpenLink Software Inc.,  Protocol Labs,  Samsung Electronics Co., Ltd.,  Snake Nation,  Stanford University,  The New York Times,  Thomson Reuters Corp.,  Tigim,  UnitedHealth Group

[1morg_url: [0m229 unique values.
[1m(sample) [0mhttps://www.w3.org/organizations/10911/,  https://www.w3.org/organizations/112585/,  https://www.w3.org

### Sanity checks

#### Organization URL

In [13]:
xe.one2oneViolations(raw_members_df, 'org_name', 'org_url')

Series([], Name: org_url, dtype: object)

In [14]:
xe.one2oneViolations(raw_members_df, 'org_url', 'org_name')

Series([], Name: org_name, dtype: object)

In [16]:
raw_members_df.loc[raw_members_df['org_url'].isnull(), 'org_name'].value_counts()

org_name
W3C Invited Experts    288
W3C                    118
Name: count, dtype: int64

#### Picture

In [20]:
picture_df = raw_members_df.loc[raw_members_df['picture'] != 'https://www.w3.org/assets/website-2021/svg/avatar.svg']

In [22]:
xe.one2oneQ(picture_df, 'email', 'picture')

True

#### Email

In [30]:
xd.print_array_series(xe.one2oneViolations(raw_members_df, 'name', 'email'))

[1mBenjamin Poulain: [0mbpoulain@apple.com / benjamin@webkit.org
[1mHidde de Vries: [0mhidde-logius@hiddedevries.nl / hidde@hiddedevries.nl
[1mJan Williams: [0mjwilliams@paciellogroup.com / jwilliams@tpgi.com
[1mJim Evans: [0mjames.h.evans.jr@gmail.com / james.evans@salesforce.com
[1mLei Zhao: [0mzhaolei@migu.cn / zhaolei07@baidu.com
[1mMichael Jones: [0mmichael_b_jones@hotmail.com / michael.jones@mattr.global
[1mThomas Nguyen: [0mtungnh@google.com / tomnguyen@google.com


In [31]:
xe.one2oneViolations(raw_members_df, 'email', 'name')

Series([], Name: name, dtype: object)

#### Affiliation

In [33]:
xd.print_array_series(xe.one2oneViolations(raw_members_df, 'name', 'org_name'))

[1mHidde de Vries: [0mLogius / W3C Invited Experts
[1mJim Evans: [0mW3C Invited Experts / Salesforce
[1mLei Zhao: [0mChina Mobile Communications Corporation / Baidu, Inc.
[1mMichael Jones: [0mW3C Invited Experts / Mattr Limited


In [34]:
raw_members_df

Unnamed: 0,name,org_name,org_url,email,github,picture,role,group_tag
0,Daniel Veditz,Mozilla Foundation,https://www.w3.org/organizations/35507/,dveditz@mozilla.com,https://github.com/dveditz,https://www.w3.org/thumbnails/100/avatar-image...,chairs,webappsec
1,Mike West,Google LLC,https://www.w3.org/organizations/35662/,mkwst@google.com,https://github.com/mikewest,https://www.w3.org/thumbnails/100/avatar-image...,chairs,webappsec
2,Philippe Le Hegaret,W3C,,plh@w3.org,https://github.com/plehegar,https://www.w3.org/thumbnails/100/avatar-image...,staff,webappsec
3,Simone Onofri,W3C,,simone@w3.org,https://github.com/simoneonofri,https://www.w3.org/thumbnails/100/avatar-image...,staff,webappsec
4,David Adrian,Google LLC,https://www.w3.org/organizations/35662/,dadrian@google.com,https://github.com/dadrian,https://www.w3.org/assets/website-2021/svg/ava...,participants,webappsec
...,...,...,...,...,...,...,...,...
93,Valerie Young,Igalia,https://www.w3.org/organizations/62028/,spectranaut@igalia.com,https://github.com/spectranaut,https://www.w3.org/assets/website-2021/svg/ava...,participants,aria
94,Kate Zhao,Thomson Reuters Corp.,https://www.w3.org/organizations/111458/,kate.zhao@thomsonreuters.com,https://github.com/KateZhaoTR,https://www.w3.org/assets/website-2021/svg/ava...,participants,aria
95,Xiao (Helen) Zhou,University of Illinois,https://www.w3.org/organizations/51381/,xhzhou@illinois.edu,https://github.com/helen-libit,https://www.w3.org/thumbnails/100/avatar-image...,participants,aria
96,Filippo Zorzi,UsableNet,https://www.w3.org/organizations/138132/,filippo.zorzi@usablenet.com,https://github.com/filippo-zorzi,https://www.w3.org/assets/website-2021/svg/ava...,participants,aria


In [41]:
raw_members_df.query('role != "staff"')['org_name'].value_counts(normalize=True).head(20)

org_name
Google LLC                          0.150756
W3C Invited Experts                 0.106156
Microsoft Corporation               0.063030
Apple Inc.                          0.053446
Intel Corporation                   0.028013
Mozilla Foundation                  0.026539
W3C                                 0.025065
ByteDance                           0.018430
Igalia                              0.016587
Meta                                0.015481
Adobe                               0.011795
TPGi                                0.010321
Digital Bazaar                      0.010321
Samsung Electronics Co., Ltd.       0.009583
TetraLogical Services Ltd           0.008846
British Broadcasting Corporation    0.007741
Alibaba Group                       0.007741
Rakuten Group, Inc.                 0.007372
Baidu, Inc.                         0.007372
Entersekt                           0.007003
Name: proportion, dtype: float64

In [42]:
raw_members_df.query('role == "chairs"')['org_name'].value_counts(normalize=True).head(20)

org_name
W3C Invited Experts                 0.205128
Google LLC                          0.115385
Mozilla Foundation                  0.051282
Intel Corporation                   0.051282
Microsoft Corporation               0.038462
Adobe                               0.038462
British Broadcasting Corporation    0.038462
TetraLogical Services Ltd           0.038462
Apple Inc.                          0.038462
Igalia                              0.025641
Akamai Technologies                 0.025641
TPGi                                0.012821
Samsung Electronics Co., Ltd.       0.012821
Shopify                             0.012821
Geonovum                            0.012821
Spherical Cow Consulting            0.012821
Baidu, Inc.                         0.012821
Alibaba Group                       0.012821
Huawei                              0.012821
Fastly                              0.012821
Name: proportion, dtype: float64

# Trash

In [44]:
groups_df = pd.read_csv('../dados/brutos/w3c_wg_2024-07-04.csv')

In [45]:
groups_df

Unnamed: 0,name,url,description,tag
0,Accessibility Education and Outreach Working G...,https://www.w3.org/groups/wg/eowg/,The mission of the Accessibility Education and...,eowg
1,Accessibility Guidelines Working Group,https://www.w3.org/groups/wg/ag/,The mission of the Accessibility Guidelines Wo...,ag
2,Accessible Platform Architectures Working Group,https://www.w3.org/groups/wg/apa/,The mission of the Accessible Platform Archite...,apa
3,Accessible Rich Internet Applications Working ...,https://www.w3.org/groups/wg/aria/,The mission of the Accessible Rich Internet Ap...,aria
4,Audio Working Group,https://www.w3.org/groups/wg/audio/,The mission of the Audio Working Group is to a...,audio
5,Browser Testing and Tools Working Group,https://www.w3.org/groups/wg/browser-tools-tes...,The mission of the Browser Testing and Tools W...,browser-tools-testing
6,Cascading Style Sheets (CSS) Working Group,https://www.w3.org/groups/wg/css/,The mission of the group is to develop and mai...,css
7,Dataset Exchange Working Group,https://www.w3.org/groups/wg/dx/,The mission of the Dataset Exchange WG is to:\...,dx
8,Decentralized Identifier Working Group,https://www.w3.org/groups/wg/did/,The mission of the Decentralized Identifier Wo...,did
9,Devices and Sensors Working Group,https://www.w3.org/groups/wg/das/,The mission of the Devices and Sensors Working...,das
