# Data Overview

In [1]:
import json
import pandas as pd
import numpy as np
import os

base_dir = os.path.abspath("..")
file_path = os.path.join(base_dir, "data", "scraped_data", "general_profiles_data.json")

# Load JSON file
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)
    
# Convert JSON to DataFrame
user_profiles = []

for username, profile in data.items():
    user_profiles.append({
        "username": username,
        "gender": profile["demographics"].get("gender", "N/A"),
        "country": profile["demographics"].get("country", "N/A"),
        "age": profile["demographics"].get("age", "N/A"),
        "ethnicity": profile["demographics"].get("ethnicity", "N/A"),
        "tags": profile.get("tags", ["N/A"]),
    })

df = pd.DataFrame(user_profiles)
# df.head()

users_with_no_tags = df[df["tags"].apply(lambda x: x == ["N/A"])]

print(f"Total users: {len(df)}")
print(f"Users with no tags (tags == ['N/A']): {len(users_with_no_tags)}")

Total users: 11518
Users with no tags (tags == ['N/A']): 2964


# Remove users with no tags (since cannot use them for analysis)

In [2]:
# Filter out users with ["N/A"] tags
df = df[df["tags"].apply(lambda x: x != ["N/A"])]
print(f"Remaining users after filtering: {len(df)}")

Remaining users after filtering: 8554


# General Statistics

In [3]:
# 1. Gender distribution
gender_counts = df["gender"].value_counts()

# 2. Demographics
demographic_stats = df[["gender", "country", "age", "ethnicity"]].describe(include="all")

# 3. Unique tags
all_tags = [tag for tag_list in df["tags"] for tag in tag_list if tag != 'N/A']
unique_tags = set(all_tags)
num_unique_tags = len(unique_tags)

# 4. Users without any tags
users_without_tags = [tags for tags in df["tags"] if tags == ['N/A']]
num_users_without_tags = len(users_without_tags)

# 5. Average / Median number of tags per user
df["num_tags"] = df["tags"].apply(lambda tags: len(tags) if tags != ['N/A'] else 0)
avg_tags_per_user = df["num_tags"].mean()
median_tags_per_user = df["num_tags"].median()

results = {
    "Gender Distribution": gender_counts.to_dict(),
    "Total Users": len(df),
    "Demographics Overview": demographic_stats.to_dict(),
    "Total Unique Tags": num_unique_tags,
    "Users without Tags": num_users_without_tags,
    "Average Tags per User": avg_tags_per_user,
    "Median Tags per User": median_tags_per_user,
}

pd.set_option("display.max_colwidth", None)  # Show full content in each column
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.expand_frame_repr", False)  # Prevent line wrapping-
pd.set_option("display.max_rows", None)  # Show all rows if needed

df_results = pd.DataFrame(list(results.items()), columns=["Metric", "Value"])

df_results

Unnamed: 0,Metric,Value
0,Gender Distribution,"{'Female': 3334, 'N/A': 2057, 'Male': 1489, 'Woman': 1106, 'Man': 516, 'Other': 24, 'Prefer not to say': 19, 'Non-binary': 6, 'My pronouns are she/her. Beyond that, I'm just me': 1, 'She/They/He': 1, 'She/they': 1}"
1,Total Users,8554
2,Demographics Overview,"{'gender': {'count': 8554, 'unique': 11, 'top': 'Female', 'freq': 3334}, 'country': {'count': 8554, 'unique': 91, 'top': 'United Kingdom', 'freq': 3584}, 'age': {'count': 8554, 'unique': 76, 'top': 'N/A', 'freq': 4805}, 'ethnicity': {'count': 8554, 'unique': 17, 'top': 'White', 'freq': 3637}}"
3,Total Unique Tags,2140
4,Users without Tags,0
5,Average Tags per User,4.610942
6,Median Tags per User,3.0


# Demographics

In [4]:
# Extract demographics for better overview
demographics_df = pd.DataFrame([df_results[df_results["Metric"] == "Demographics Overview"]["Value"].values[0]])
demographics_df

Unnamed: 0,gender,country,age,ethnicity
0,"{'count': 8554, 'unique': 11, 'top': 'Female', 'freq': 3334}","{'count': 8554, 'unique': 91, 'top': 'United Kingdom', 'freq': 3584}","{'count': 8554, 'unique': 76, 'top': 'N/A', 'freq': 4805}","{'count': 8554, 'unique': 17, 'top': 'White', 'freq': 3637}"


## Country

In [5]:
# Occurrences of each country
country_counts = df["country"].value_counts()

# Percentage of each country
country_percentage = (country_counts / len(df)) * 100

# Filter countries with percentage > 5%
# countries_above_5_percent = country_percentage[country_percentage > 5]

# Convert to DataFrame
country_percentage = pd.DataFrame({
    "Country": country_percentage.index,
    "Count": country_counts.loc[country_percentage.index].values,
    "Percentage": country_percentage.values
})

print(country_percentage)

                 Country  Count  Percentage
0         United Kingdom   3584   41.898527
1                    N/A   2273   26.572364
2          United States   1886   22.048165
3                 Canada    176    2.057517
4              Australia    113    1.321019
5                  India     73    0.853402
6                Ireland     68    0.794950
7            New Zealand     39    0.455927
8                 France     31    0.362404
9           South Africa     22    0.257190
10                 Spain     20    0.233809
11               Germany     19    0.222118
12               Belgium     13    0.151976
13                 Italy     12    0.140285
14           Netherlands     11    0.128595
15               Nigeria     10    0.116904
16  United Arab Emirates      8    0.093523
17                Greece      8    0.093523
18                Sweden      8    0.093523
19                Turkey      7    0.081833
20           Switzerland      6    0.070143
21              Guernsey      6 

## Age

In [6]:
# Define age groups
age_bins = [0, 18, 30, 40, 50, 60, 70, float("inf")]
age_labels = ["Under 18", "18-30", "30-40", "40-50", "50-60", "60-70", "Over 70"]

# Convert age column to numeric, replacing "N/A" with NaN
df["age"] = pd.to_numeric(df["age"], errors="coerce")

# Categorize users into age groups
df["age_group"] = pd.cut(df["age"], bins=age_bins, labels=age_labels, right=False)

# Count occurrences of each age group
age_group_counts = df["age_group"].value_counts(dropna=False)

# Calculate percentage of each age group
age_group_percentage = (age_group_counts / len(df)) * 100

# Count "N/A" values and calculate their percentage
na_count = df["age"].isna().sum()
na_percentage = (na_count / len(df)) * 100

age_stats_df = pd.DataFrame({
    "Age Group": age_group_counts.index.astype(str),
    "Count": age_group_counts.values,
    "Percentage": age_group_percentage.values
})

print(age_stats_df)

  Age Group  Count  Percentage
0       nan   4805   56.172551
1   Over 70    845    9.878419
2     60-70    815    9.527706
3     50-60    744    8.697685
4     30-40    526    6.149170
5     40-50    477    5.576339
6     18-30    342    3.998130
7  Under 18      0    0.000000


## Gender

In [7]:
# GENDER DISTRIBUTION (Original Groups)

# Occurrences of each gender
gender_counts = df["gender"].value_counts()

# Percentage of each gender
gender_percentage = (gender_counts / len(df)) * 100

gender_stats_df = pd.DataFrame({
    "Gender": gender_counts.index,
    "Count": gender_counts.values,
    "Percentage": gender_percentage.values
})

print(gender_stats_df)

                                                Gender  Count  Percentage
0                                               Female   3334   38.975918
1                                                  N/A   2057   24.047229
2                                                 Male   1489   17.407061
3                                                Woman   1106   12.929624
4                                                  Man    516    6.032266
5                                                Other     24    0.280570
6                                    Prefer not to say     19    0.222118
7                                           Non-binary      6    0.070143
8   My pronouns are she/her.  Beyond that, I'm just me      1    0.011690
9                                          She/They/He      1    0.011690
10                                            She/they      1    0.011690


In [8]:
# GENDER DISTRIBUTION (4 Groups: N/A, Others, Male, Female)

# Define gender mapping
final_gender_mapping = {
    "N/A": "N/A",
    "Prefer not to say": "N/A",
    "Female": "Female",
    "Woman": "Female",
    "Male": "Male",
    "Man": "Male",
    "Others" : "Others",
    "Non-binary" : "Others",
    "My pronouns are she/her.  Beyond that, I'm just me" : "Others", 
    "She/They/He": "Others",
    "She/they" : "Others",
}

# Apply mapping, "Others" for any unspecified value
df["gender"] = df["gender"].apply(lambda x: final_gender_mapping.get(x, "Others"))

# Gender distribution
final_gender_counts = df["gender"].value_counts()
final_gender_percentage = (final_gender_counts / len(df)) * 100

final_gender_stats_df = pd.DataFrame({
    "Gender": final_gender_counts.index,
    "Count": final_gender_counts.values,
    "Percentage": final_gender_percentage.values
})

print(final_gender_stats_df)

   Gender  Count  Percentage
0  Female   4440   51.905541
1     N/A   2076   24.269348
2    Male   2005   23.439327
3  Others     33    0.385784


## Ethnicity

In [9]:
# Occurrences of each ethnicity
ethnicity_counts = df["ethnicity"].value_counts()

# Percentage of each ethnicity
ethnicity_percentage = (ethnicity_counts / len(df)) * 100

ethnicity_percentage = pd.DataFrame({
    "Ethnicity": ethnicity_counts.index,
    "Count": ethnicity_counts.values,
    "Percentage": ethnicity_percentage.values
})

# Filter ethnicities with percentage above 5%
# ethnicities_above_5_percent = ethnicity_stats_df[ethnicity_stats_df["Percentage"] > 5]

print(ethnicity_percentage)

                                      Ethnicity  Count  Percentage
0                                         White   3637   42.518120
1                                           N/A   3408   39.841010
2                             White / Caucasian    816    9.539397
3                                   South Asian    133    1.554828
4                             Prefer not to say    116    1.356091
5                Mixed / Multiple ethnic groups    100    1.169044
6                      Black / African American     66    0.771569
7                   Hispanic / Latino / Spanish     63    0.736498
8                            Other ethnic group     43    0.502689
9                                    East Asian     38    0.444237
10                  Black / African / Caribbean     33    0.385784
11  Black / African / Caribbean / Black British     31    0.362404
12                            Latino / Hispanic     30    0.350713
13                               Middle Eastern     17    0.19

In [13]:
# Define ethnicity mapping
ethnicity_mapping = {
    "N/A": "N/A",
    "Prefer not to say": "N/A",
    
    "Latino": "Latino",
    "Hispanic / Latino / Spanish": "Latino",
    "Latino / Hispanic": "Latino",
    
    "Black": "Black",
    "Black / African American": "Black",
    "Black / African / Caribbean": "Black",
    "Black / African / Caribbean / Black British": "Black",
    
    
    "White": "White",
    "White / Caucasian": "White",
    
    "Asian": "Asian",
    "South Asian": "Asian",
    "East Asian": "Asian",
    "Asian / Asian British": "Asian",

    "Middle Eastern": "Middle Eastern",
    "Middle Eastern / North African": "Middle Eastern",
    
    "Other ethnic group" : "Other ethnic group"
}

# Apply mapping
df["ethnicity_mapped"] = df["ethnicity"].apply(lambda x: ethnicity_mapping.get(x, x))

# Ethnicity distribution
ethnicity_counts = df["ethnicity_mapped"].value_counts()
ethnicity_percentage = (ethnicity_counts / len(df)) * 100

# Identify groups above 5% significance
# significant_ethnicities = ethnicity_percentage[ethnicity_percentage > 5].index.tolist()
significant_ethnicities = ethnicity_percentage

# Assign all other groups to "Others"
df["ethnicity_final"] = df["ethnicity_mapped"].apply(lambda x: x if x in significant_ethnicities else "Others")

# Final ethnicity distribution
final_ethnicity_counts = df["ethnicity_final"].value_counts()
final_ethnicity_percentage = (final_ethnicity_counts / len(df)) * 100

final_ethnicity_stats_df = pd.DataFrame({
    "Ethnicity": final_ethnicity_counts.index,
    "Count": final_ethnicity_counts.values,
    "Percentage": final_ethnicity_percentage.values
})

print(final_ethnicity_stats_df)

                        Ethnicity  Count  Percentage
0                           White   4453   52.057517
1                             N/A   3524   41.197101
2                           Asian    177    2.069207
3                           Black    130    1.519757
4  Mixed / Multiple ethnic groups    100    1.169044
5                          Latino     93    1.087211
6              Other ethnic group     43    0.502689
7                  Middle Eastern     21    0.245499
8           Asian / Asian British     13    0.151976
