# General Pattern

## general profiles data

Data preprocssing / cleaning:

Filter out users with no tags: tags: ["N/A"]

final_gender_mapping = {

    "N/A": "N/A",
    "Prefer not to say": "N/A",
    "Female": "Female",
    "Woman": "Female",
    "Male": "Male",
    "Man": "Male",
    "Others" : "Others",
    "Non-binary" : "Others",
    "My pronouns are she/her.  Beyond that, I'm just me" : "Others", 
    "She/They/He": "Others",
    "She/they" : "Others",
    
}

ethnicity_mapping = {

    "N/A": "N/A",
    "Prefer not to say": "N/A",
    
    "Latino": "Latino",
    "Hispanic / Latino / Spanish": "Latino",
    "Latino / Hispanic": "Latino",
    
    "Black": "Black",
    "Black / African American": "Black",
    "Black / African / Caribbean": "Black",
    "Black / African / Caribbean / Black British": "Black",
    
    
    "White": "White",
    "White / Caucasian": "White",
    
    "Asian": "Asian",
    "South Asian": "Asian",
    "East Asian": "Asian",
    "Asian / Asian British": "Asian",

    "Middle Eastern": "Middle Eastern",
    "Middle Eastern / North African": "Middle Eastern",
    
    "Other ethnic group" : "Other ethnic group"
}

In [6]:
import os
import json
import pandas as pd

base_dir = os.path.abspath("..")
file_path = os.path.join(base_dir, "data", "scraped_data", "general_profiles_data.json")
output_path = os.path.join(base_dir, "data", "data_cleaned", "general_profiles_data.json")

# Load JSON file
with open(file_path, "r") as file:
    data = json.load(file)

# Filter out users with no tags: "tags": ["N/A"]]
users_with_no_tags = {}
filtered_data = {}
for user, info in data.items():
    if info["tags"] != ["N/A"]:
        filtered_data[user] = info
    else:
        users_with_no_tags[user] = info

# Gender: define super-categories
gender_mapping = {
    "N/A": "N/A",
    "Prefer not to say": "N/A",
    "Female": "Female",
    "Woman": "Female",
    "Male": "Male",
    "Man": "Male",
    "Others": "Others",
    "Non-binary": "Others",
    "My pronouns are she/her.  Beyond that, I'm just me": "Others",
    "She/They/He": "Others",
    "She/they": "Others",
}

# Ethnicity: define super-categories
ethnicity_mapping = {

    "N/A": "N/A",
    "Prefer not to say": "N/A",
    
    "Latino": "Latino",
    "Hispanic / Latino / Spanish": "Latino",
    "Latino / Hispanic": "Latino",
    
    "Black": "Black",
    "Black / African American": "Black",
    "Black / African / Caribbean": "Black",
    "Black / African / Caribbean / Black British": "Black",
    
    
    "White": "White",
    "White / Caucasian": "White",
    
    "Asian": "Asian",
    "South Asian": "Asian",
    "East Asian": "Asian",
    "Asian / Asian British": "Asian",

    "Middle Eastern": "Middle Eastern",
    "Middle Eastern / North African": "Middle Eastern",
    
    "Other ethnic group" : "Other ethnic group"
}
# Map Gernder and Ethnicity demographics of each user
for user_info in filtered_data.values():
    gender = user_info["demographics"].get("gender", "N/A")
    ethnicity = user_info["demographics"].get("ethnicity", "N/A")
    
    user_info["demographics"]["gender"] = gender_mapping.get(gender, "Others")
    user_info["demographics"]["ethnicity"] = ethnicity_mapping.get(ethnicity, "Other ethnic group")

# Save to JSON
with open(output_path, "w") as outfile:
    json.dump(filtered_data, outfile, indent=2)
    
    
percentage_with_tags = (len(filtered_data) / len(data)) * 100

print(f"Original number of users: {len(data)}")
print(f"Number of users with no tags ('N/A'): {len(users_with_no_tags)}")
print(f"Number of users after filtering (with tags): {len(filtered_data)} -> {percentage_with_tags:.2f}%")

Original number of users: 11518
Number of users with no tags ('N/A'): 2964
Number of users after filtering (with tags): 8554 -> 74.27%


In [5]:
df = pd.DataFrame([
    {
        "gender": info["demographics"].get("gender", "N/A"),
        "ethnicity": info["demographics"].get("ethnicity", "N/A")
    }
    for info in filtered_data.values()
])

print("\nGender distribution after mapping:")
print(df["gender"].value_counts())

print("\nEthnicity distribution after mapping:")
print(df["ethnicity"].value_counts())


Gender distribution after mapping:
gender
Female    4440
N/A       2076
Male      2005
Others      33
Name: count, dtype: int64

Ethnicity distribution after mapping:
ethnicity
White                 4453
N/A                   3524
Asian                  190
Other ethnic group     143
Black                  130
Latino                  93
Middle Eastern          21
Name: count, dtype: int64


# Community-Specific Patterns

## profiles by comm data

Data preprocssing / cleaning:

Filter out users with no tags: tags: ["N/A"]

In [15]:
import os
import json
import pandas as pd

base_dir = os.path.abspath("..")
file_path = os.path.join(base_dir, "data", "scraped_data", "profiles_by_comm_data.json")
output_path = os.path.join(base_dir, "data", "data_cleaned", "profiles_by_comm_data.json")

# Load JSON file
with open(file_path, "r") as file:
    data = json.load(file)

# Filter out users with no tags: "tags" = ["N/A"]
users_total = 0
users_with_no_tags_unique = {}
users_with_no_tags_duplicate = 0
filtered_data = {}
for community, users in data.items():
    
    filtered_users = {}
    for user, info in users.items():        
        users_total += 1
        
        if info["tags"] != ["N/A"]:
            filtered_users[user] = info
        else:
            users_with_no_tags_unique[user] = info  # unique usernames with no tags
            users_with_no_tags_duplicate += 1  # all mentions with no tags
        
    if filtered_users:
        filtered_data[community] = filtered_users

# Save JSON file
with open(output_path, "w") as file:
    json.dump(filtered_data, file, indent=2)

# Stats
unique_users_with_tags = users_total - len(users_with_no_tags_unique)
mentions_with_tags = users_total - users_with_no_tags_duplicate

unique_tags_percent = (unique_users_with_tags / users_total) * 100
mention_tags_percent = (mentions_with_tags / users_total) * 100

print(f"Original number of users (mentions): {users_total}")
print(f"Number of users with no tags ('N/A') (unique): {len(users_with_no_tags_unique)}")
print(f"Number of UNQIUE users with tags (users may me members in multiple comm): {unique_users_with_tags} -> {unique_tags_percent:.2f}%")

print(f"\n*Number of users with tags (mentions): {mentions_with_tags} -> {mention_tags_percent:.2f}%")

Original number of users (mentions): 37777
Number of users with no tags ('N/A') (unique): 19165
Number of UNQIUE users with tags (users may me members in multiple comm): 18612 -> 49.27%

*Number of users with tags (mentions): 15757 -> 41.71%
