In [1]:
import os
import yaml
import pandas as pd

# NOTE(review): machine-specific absolute path; adjust it when running elsewhere.
directory_path = "/Users/mervekeskin/Desktop/Knowledge_Graphs_Summer2024/country_files/"

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of every table built from these files reproducible across runs and machines.
all_files = sorted(os.listdir(directory_path))

# Keep only the per-country concept files (names ending with 'Concepts.yml')
concepts_files = [f for f in all_files if f.endswith('Concepts.yml')]

##### Ethnic Group

In [2]:
# Parallel per-row lists: every concept whose key ends with 'EthnicGroup'
# appends exactly one entry to each of them.
ethnic_groups = []
string_tokens = []
wikidata_qnodes = []
languages_most_spoken = []
religions_most_practiced = []
lives_in = []
countries = []

# Country name = file name without the 'Concepts.yml' suffix; codes are
# 1-based positions in the alphabetically sorted list of names.
country_names = sorted(set(file_name.replace('Concepts.yml', '') for file_name in concepts_files))
country_map = {name: i+1 for i, name in enumerate(country_names)}

for file_name in concepts_files:
    file_path = os.path.join(directory_path, file_name)
    country_name = file_name.replace('Concepts.yml', '')
    try:
        # Be explicit about the encoding instead of relying on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = yaml.safe_load(file)
    except yaml.YAMLError as exc:
        # A malformed file is reported and skipped; the remaining files are still processed.
        print(f"Error parsing {file_name}: {exc}")
        continue

    # safe_load returns None for an empty document, and an empty 'concepts:'
    # entry is also None; treat both as "no concepts" instead of crashing.
    if not isinstance(data, dict):
        data = {}
    concepts = data.get('concepts') or {}

    for key, value in concepts.items():
        if not key.endswith('EthnicGroup'):
            continue
        # A concept declared without a body is loaded as None; use an empty
        # mapping so every attribute is recorded as missing (None).
        attributes = value if isinstance(value, dict) else {}
        ethnic_groups.append(key)
        string_tokens.append(attributes.get('stringTokens'))
        wikidata_qnodes.append(attributes.get('wikidataQnode'))
        languages_most_spoken.append(attributes.get('languagesMostSpoken'))
        religions_most_practiced.append(attributes.get('religionsMostPracticed'))
        lives_in.append(attributes.get('livesIn'))
        countries.append(country_name)

ethnic_groups_data = {
    "Country": countries,
    "Country Numeric Code": [country_map[country] for country in countries],
    "Ethnic Group": ethnic_groups,
    "String Tokens": string_tokens,
    "Wikidata Qnode": wikidata_qnodes,
    "Languages Most Spoken": languages_most_spoken,
    "Religions Most Practiced": religions_most_practiced,
    "Lives In": lives_in
}

ethnic_groups_df = pd.DataFrame(ethnic_groups_data)
# Normalise empty lists to [""]. DataFrame.applymap is deprecated since
# pandas 2.1 in favour of DataFrame.map; fall back on older versions.
_elementwise = ethnic_groups_df.map if hasattr(pd.DataFrame, "map") else ethnic_groups_df.applymap
ethnic_groups_df = _elementwise(lambda x: [""] if isinstance(x, list) and len(x) == 0 else x)

# Save
file_path_excel_ethnic = "/Users/mervekeskin/Desktop/EthnicGroups_Concepts_Compiled_Updated.xlsx"
ethnic_groups_df.to_excel(file_path_excel_ethnic, index=False)

Error parsing UgandaConcepts.yml: while parsing a flow sequence
  in "/Users/mervekeskin/Desktop/Knowledge_Graphs_Summer2024/country_files/UgandaConcepts.yml", line 146, column 13
expected ',' or ']', but got '<scalar>'
  in "/Users/mervekeskin/Desktop/Knowledge_Graphs_Summer2024/country_files/UgandaConcepts.yml", line 146, column 23


  ethnic_groups_df = ethnic_groups_df.applymap(lambda x: [""] if x == [] else x)


In [3]:
# Columns of the ethnic-group table whose missing-value rates are reported
columns_to_check = ["Wikidata Qnode", "Languages Most Spoken", "Religions Most Practiced", "Lives In"]

# Zero-initialised per-column tallies: missing values, explicit [""]
# placeholders, and the two combined
empty_counts = dict.fromkeys(columns_to_check, 0)
explicit_empty_counts = dict.fromkeys(columns_to_check, 0)
total_counts = dict.fromkeys(columns_to_check, 0)
total_rows = len(ethnic_groups_df)

# function to check if a value is explicitly [""] or ['']
def is_explicit_empty(value):
    """Return True when ``value`` is a list whose only element equals ""."""
    if not isinstance(value, list) or len(value) != 1:
        return False
    return value[0] == ""

# function to check if a value is empty
def is_empty(value):
    """Return True for a zero-length list; for any non-list, defer to ``pd.isna``."""
    return len(value) == 0 if isinstance(value, list) else pd.isna(value)

# count the empty and [""] values
for column in columns_to_check:
    for value in ethnic_groups_df[column]:
        # Evaluate each predicate once per value and reuse the result.
        missing = is_empty(value)
        placeholder = is_explicit_empty(value)
        if missing:
            empty_counts[column] += 1
        if placeholder:
            explicit_empty_counts[column] += 1
        if missing or placeholder:
            total_counts[column] += 1

# percentages; an empty table yields NaN instead of raising ZeroDivisionError
_denominator = total_rows if total_rows else float("nan")
empty_percentages = {column: (count / _denominator) * 100 for column, count in empty_counts.items()}
explicit_empty_percentages = {column: (count / _denominator) * 100 for column, count in explicit_empty_counts.items()}
total_percentages = {column: (count / _denominator) * 100 for column, count in total_counts.items()}

# One report section per tally, printed in a fixed order
for _heading, _percentages in (
    ("Percentage of empty values for each column:", empty_percentages),
    ("\nPercentage of [''] values for each column:", explicit_empty_percentages),
    ("\nTotal percentage of empty and [''] values for each column:", total_percentages),
):
    print(_heading)
    for column, percentage in _percentages.items():
        print(f"{column}: {percentage:.2f}%")

Percentage of empty values for each column:
Wikidata Qnode: 0.00%
Languages Most Spoken: 20.00%
Religions Most Practiced: 8.37%
Lives In: 8.60%

Percentage of [''] values for each column:
Wikidata Qnode: 0.00%
Languages Most Spoken: 1.63%
Religions Most Practiced: 2.79%
Lives In: 11.40%

Total percentage of empty and [''] values for each column:
Wikidata Qnode: 0.00%
Languages Most Spoken: 21.63%
Religions Most Practiced: 11.16%
Lives In: 20.00%


##### Linguistic Group

In [4]:
# Parallel per-row lists: every concept whose key ends with 'LinguisticGroup'
# appends exactly one entry to each of them.
linguistic_groups = []
string_tokens_linguistic = []
wikidata_qnodes_linguistic = []
lives_in_linguistic = []
associated_ethnic_group = []
countries_linguistic = []

# Country name = file name without the 'Concepts.yml' suffix; codes are
# 1-based positions in the alphabetically sorted list of names.
country_names = sorted(set(file_name.replace('Concepts.yml', '') for file_name in concepts_files))
country_map = {name: i+1 for i, name in enumerate(country_names)}

for file_name in concepts_files:
    file_path = os.path.join(directory_path, file_name)
    country_name = file_name.replace('Concepts.yml', '')
    try:
        # Be explicit about the encoding instead of relying on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = yaml.safe_load(file)
    except yaml.YAMLError as exc:
        # A malformed file is reported and skipped; the remaining files are still processed.
        print(f"Error parsing {file_name}: {exc}")
        continue

    # safe_load returns None for an empty document, and an empty 'concepts:'
    # entry is also None; treat both as "no concepts" instead of crashing.
    if not isinstance(data, dict):
        data = {}
    concepts = data.get('concepts') or {}

    for key, value in concepts.items():
        if not key.endswith('LinguisticGroup'):
            continue
        # A concept declared without a body is loaded as None; use an empty
        # mapping so every attribute is recorded as missing (None).
        attributes = value if isinstance(value, dict) else {}
        linguistic_groups.append(key)
        string_tokens_linguistic.append(attributes.get('stringTokens'))
        wikidata_qnodes_linguistic.append(attributes.get('wikidataQnode'))
        lives_in_linguistic.append(attributes.get('livesIn'))
        # NOTE(review): this key is all-lowercase here, unlike the
        # 'associatedEthnicGroup' spelling read for parties and armed groups;
        # confirm it matches the YAML schema for linguistic groups.
        associated_ethnic_group.append(attributes.get('associatedethnicgroup'))
        countries_linguistic.append(country_name)

linguistic_groups_data = {
    "Country": countries_linguistic,
    "Country Numeric Code": [country_map[country] for country in countries_linguistic],
    "Linguistic Group": linguistic_groups,
    "String Tokens": string_tokens_linguistic,
    "Wikidata Qnode": wikidata_qnodes_linguistic,
    "Lives In": lives_in_linguistic,
    "Associated Ethnic Group": associated_ethnic_group
}

linguistic_groups_df = pd.DataFrame(linguistic_groups_data)
# Normalise empty lists to [""]. DataFrame.applymap is deprecated since
# pandas 2.1 in favour of DataFrame.map; fall back on older versions.
_elementwise = linguistic_groups_df.map if hasattr(pd.DataFrame, "map") else linguistic_groups_df.applymap
linguistic_groups_df = _elementwise(lambda x: [""] if isinstance(x, list) and len(x) == 0 else x)

file_path_excel_linguistic = "/Users/mervekeskin/Desktop/LinguisticGroups_Concepts_Compiled_Updated.xlsx"
linguistic_groups_df.to_excel(file_path_excel_linguistic, index=False)

Error parsing UgandaConcepts.yml: while parsing a flow sequence
  in "/Users/mervekeskin/Desktop/Knowledge_Graphs_Summer2024/country_files/UgandaConcepts.yml", line 146, column 13
expected ',' or ']', but got '<scalar>'
  in "/Users/mervekeskin/Desktop/Knowledge_Graphs_Summer2024/country_files/UgandaConcepts.yml", line 146, column 23


  linguistic_groups_df = linguistic_groups_df.applymap(lambda x: [""] if x == [] else x)


In [5]:
# Columns of the linguistic-group table whose missing-value rates are reported
columns_to_check = ["String Tokens", "Wikidata Qnode", "Associated Ethnic Group", "Lives In"]

# Zero-initialised per-column tallies: missing values, explicit [""]
# placeholders, and the two combined
empty_counts = dict.fromkeys(columns_to_check, 0)
explicit_empty_counts = dict.fromkeys(columns_to_check, 0)
total_counts = dict.fromkeys(columns_to_check, 0)
total_rows = len(linguistic_groups_df)

def is_explicit_empty(value):
    """Return True when ``value`` is a list whose only element equals ""."""
    if not isinstance(value, list) or len(value) != 1:
        return False
    return value[0] == ""

def is_empty(value):
    """Return True for a zero-length list; for any non-list, defer to ``pd.isna``."""
    return len(value) == 0 if isinstance(value, list) else pd.isna(value)

# Tally missing values and [""] placeholders per column
for column in columns_to_check:
    for value in linguistic_groups_df[column]:
        # Evaluate each predicate once per value and reuse the result.
        missing = is_empty(value)
        placeholder = is_explicit_empty(value)
        if missing:
            empty_counts[column] += 1
        if placeholder:
            explicit_empty_counts[column] += 1
        if missing or placeholder:
            total_counts[column] += 1

# percentages; an empty table yields NaN instead of raising ZeroDivisionError
_denominator = total_rows if total_rows else float("nan")
empty_percentages = {column: (count / _denominator) * 100 for column, count in empty_counts.items()}
explicit_empty_percentages = {column: (count / _denominator) * 100 for column, count in explicit_empty_counts.items()}
total_percentages = {column: (count / _denominator) * 100 for column, count in total_counts.items()}

# One report section per tally, printed in a fixed order
for _heading, _percentages in (
    ("Percentage of empty values for each column:", empty_percentages),
    ("\nPercentage of [''] values for each column:", explicit_empty_percentages),
    ("\nTotal percentage of empty and [''] values for each column:", total_percentages),
):
    print(_heading)
    for column, percentage in _percentages.items():
        print(f"{column}: {percentage:.2f}%")

Percentage of empty values for each column:
String Tokens: 0.00%
Wikidata Qnode: 0.00%
Associated Ethnic Group: 12.13%
Lives In: 4.09%

Percentage of [''] values for each column:
String Tokens: 0.00%
Wikidata Qnode: 0.00%
Associated Ethnic Group: 25.88%
Lives In: 57.02%

Total percentage of empty and [''] values for each column:
String Tokens: 0.00%
Wikidata Qnode: 0.00%
Associated Ethnic Group: 38.01%
Lives In: 61.11%


##### Political Party

In [6]:
# Parallel per-row lists: every concept whose key ends with 'Party'
# appends exactly one entry to each of them.
political_parties = []
string_tokens_parties = []
wikidata_qnodes_parties = []
associated_ethnic_group_parties = []
years_active_as_party = []
ideological_orientation_parties = []
years_active_as_entity_parties = []
countries_parties = []

# Country name = file name without the 'Concepts.yml' suffix; codes are
# 1-based positions in the alphabetically sorted list of names.
country_names = sorted(set(file_name.replace('Concepts.yml', '') for file_name in concepts_files))
country_map = {name: i+1 for i, name in enumerate(country_names)}

for file_name in concepts_files:
    file_path = os.path.join(directory_path, file_name)
    country_name = file_name.replace('Concepts.yml', '')
    try:
        # Be explicit about the encoding instead of relying on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = yaml.safe_load(file)
    except yaml.YAMLError as exc:
        # A malformed file is reported and skipped; the remaining files are still processed.
        print(f"Error parsing {file_name}: {exc}")
        continue

    # safe_load returns None for an empty document, and an empty 'concepts:'
    # entry is also None; treat both as "no concepts" instead of crashing.
    if not isinstance(data, dict):
        data = {}
    concepts = data.get('concepts') or {}

    for key, value in concepts.items():
        # 'PoliticalParty' already ends with 'Party'; both suffixes are
        # listed only to keep the intent visible.
        if not key.endswith(('Party', 'PoliticalParty')):
            continue
        # A concept declared without a body is loaded as None; use an empty
        # mapping so every attribute is recorded as missing (None).
        attributes = value if isinstance(value, dict) else {}
        political_parties.append(key)
        string_tokens_parties.append(attributes.get('stringTokens'))
        wikidata_qnodes_parties.append(attributes.get('wikidataQnode'))
        associated_ethnic_group_parties.append(attributes.get('associatedEthnicGroup'))
        years_active_as_party.append(attributes.get('yearsActiveasParty'))
        ideological_orientation_parties.append(attributes.get('ideologicalOrientation'))
        years_active_as_entity_parties.append(attributes.get('yearsActiveasEntity'))
        countries_parties.append(country_name)

political_parties_data = {
    "Country": countries_parties,
    "Country Numeric Code": [country_map[country] for country in countries_parties],
    "Political Party": political_parties,
    "String Tokens": string_tokens_parties,
    "Wikidata Qnode": wikidata_qnodes_parties,
    "Associated Ethnic Group": associated_ethnic_group_parties,
    "Years Active as Party": years_active_as_party,
    "Ideological Orientation": ideological_orientation_parties,
    "Years Active as Entity": years_active_as_entity_parties
}

political_parties_df = pd.DataFrame(political_parties_data)
# Normalise empty lists to [""]. DataFrame.applymap is deprecated since
# pandas 2.1 in favour of DataFrame.map; fall back on older versions.
_elementwise = political_parties_df.map if hasattr(pd.DataFrame, "map") else political_parties_df.applymap
political_parties_df = _elementwise(lambda x: [""] if isinstance(x, list) and len(x) == 0 else x)

file_path_excel_parties = "/Users/mervekeskin/Desktop/PoliticalParties_Concepts_Compiled_Updated.xlsx"
political_parties_df.to_excel(file_path_excel_parties, index=False)

Error parsing UgandaConcepts.yml: while parsing a flow sequence
  in "/Users/mervekeskin/Desktop/Knowledge_Graphs_Summer2024/country_files/UgandaConcepts.yml", line 146, column 13
expected ',' or ']', but got '<scalar>'
  in "/Users/mervekeskin/Desktop/Knowledge_Graphs_Summer2024/country_files/UgandaConcepts.yml", line 146, column 23


  political_parties_df = political_parties_df.applymap(lambda x: [""] if x == [] else x)


In [7]:
# Columns of the political-party table whose missing-value rates are reported
columns_to_check = [
    "String Tokens", "Wikidata Qnode", "Associated Ethnic Group",
    "Years Active as Party", "Ideological Orientation", "Years Active as Entity",
]

# Zero-initialised per-column tallies: missing values, explicit [""]
# placeholders, and the two combined
empty_counts = dict.fromkeys(columns_to_check, 0)
explicit_empty_counts = dict.fromkeys(columns_to_check, 0)
total_counts = dict.fromkeys(columns_to_check, 0)
total_rows = len(political_parties_df)

def is_explicit_empty(value):
    """Return True when ``value`` is a list whose only element equals ""."""
    if not isinstance(value, list) or len(value) != 1:
        return False
    return value[0] == ""

def is_empty(value):
    """Return True for a zero-length list; for any non-list, defer to ``pd.isna``."""
    return len(value) == 0 if isinstance(value, list) else pd.isna(value)

# Tally missing values and [""] placeholders per column
for column in columns_to_check:
    for value in political_parties_df[column]:
        # Evaluate each predicate once per value and reuse the result.
        missing = is_empty(value)
        placeholder = is_explicit_empty(value)
        if missing:
            empty_counts[column] += 1
        if placeholder:
            explicit_empty_counts[column] += 1
        if missing or placeholder:
            total_counts[column] += 1

# percentages; an empty table yields NaN instead of raising ZeroDivisionError
_denominator = total_rows if total_rows else float("nan")
empty_percentages = {column: (count / _denominator) * 100 for column, count in empty_counts.items()}
explicit_empty_percentages = {column: (count / _denominator) * 100 for column, count in explicit_empty_counts.items()}
total_percentages = {column: (count / _denominator) * 100 for column, count in total_counts.items()}

# One report section per tally, printed in a fixed order
for _heading, _percentages in (
    ("Percentage of empty values for each column:", empty_percentages),
    ("\nPercentage of [''] values for each column:", explicit_empty_percentages),
    ("\nTotal percentage of empty and [''] values for each column:", total_percentages),
):
    print(_heading)
    for column, percentage in _percentages.items():
        print(f"{column}: {percentage:.2f}%")

Percentage of empty values for each column:
String Tokens: 0.00%
Wikidata Qnode: 7.72%
Associated Ethnic Group: 66.71%
Years Active as Party: 31.36%
Ideological Orientation: 70.45%
Years Active as Entity: 81.91%

Percentage of [''] values for each column:
String Tokens: 0.00%
Wikidata Qnode: 8.20%
Associated Ethnic Group: 20.75%
Years Active as Party: 6.63%
Ideological Orientation: 0.00%
Years Active as Entity: 0.72%

Total percentage of empty and [''] values for each column:
String Tokens: 0.00%
Wikidata Qnode: 15.92%
Associated Ethnic Group: 87.45%
Years Active as Party: 38.00%
Ideological Orientation: 70.45%
Years Active as Entity: 82.63%


##### Armed Groups

In [8]:
# Parallel per-row lists: every concept whose key ends with
# 'armedNonGovernmentalOrganizedGroup' appends exactly one entry to each of them.
armed_groups = []
string_tokens_armed = []
wikidata_qnodes_armed = []
associated_ethnic_group_armed = []
regions_most_active = []
years_active_as_armed = []
ideological_orientation_armed = []
countries_armed = []

# Country name = file name without the 'Concepts.yml' suffix; codes are
# 1-based positions in the alphabetically sorted list of names.
country_names = sorted(set(file_name.replace('Concepts.yml', '') for file_name in concepts_files))
country_map = {name: i+1 for i, name in enumerate(country_names)}

for file_name in concepts_files:
    file_path = os.path.join(directory_path, file_name)
    country_name = file_name.replace('Concepts.yml', '')
    try:
        # Be explicit about the encoding instead of relying on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = yaml.safe_load(file)
    except yaml.YAMLError as exc:
        # A malformed file is reported and skipped; the remaining files are still processed.
        print(f"Error parsing {file_name}: {exc}")
        continue

    # safe_load returns None for an empty document, and an empty 'concepts:'
    # entry is also None; treat both as "no concepts" instead of crashing.
    if not isinstance(data, dict):
        data = {}
    concepts = data.get('concepts') or {}

    for key, value in concepts.items():
        if not key.endswith('armedNonGovernmentalOrganizedGroup'):
            continue
        # A concept declared without a body is loaded as None; use an empty
        # mapping so every attribute is recorded as missing (None).
        attributes = value if isinstance(value, dict) else {}
        armed_groups.append(key)
        string_tokens_armed.append(attributes.get('stringTokens'))
        wikidata_qnodes_armed.append(attributes.get('wikidataQnode'))
        associated_ethnic_group_armed.append(attributes.get('associatedEthnicGroup'))
        regions_most_active.append(attributes.get('regionsMostActive'))
        years_active_as_armed.append(attributes.get('yearsActiveasArmed'))
        ideological_orientation_armed.append(attributes.get('ideologicalOrientation'))
        countries_armed.append(country_name)

armed_groups_data = {
    "Country": countries_armed,
    "Country Numeric Code": [country_map[country] for country in countries_armed],
    "Armed Group": armed_groups,
    "String Tokens": string_tokens_armed,
    "Wikidata Qnode": wikidata_qnodes_armed,
    "Associated Ethnic Group": associated_ethnic_group_armed,
    "Regions Most Active": regions_most_active,
    "Years Active as Armed Group": years_active_as_armed,
    "Ideological Orientation": ideological_orientation_armed
}

armed_groups_df = pd.DataFrame(armed_groups_data)
# Normalise empty lists to [""]. DataFrame.applymap is deprecated since
# pandas 2.1 in favour of DataFrame.map; fall back on older versions.
_elementwise = armed_groups_df.map if hasattr(pd.DataFrame, "map") else armed_groups_df.applymap
armed_groups_df = _elementwise(lambda x: [""] if isinstance(x, list) and len(x) == 0 else x)

file_path_excel_armed = "/Users/mervekeskin/Desktop/ArmedGroups_Concepts_Compiled_Updated.xlsx"
armed_groups_df.to_excel(file_path_excel_armed, index=False)

Error parsing UgandaConcepts.yml: while parsing a flow sequence
  in "/Users/mervekeskin/Desktop/Knowledge_Graphs_Summer2024/country_files/UgandaConcepts.yml", line 146, column 13
expected ',' or ']', but got '<scalar>'
  in "/Users/mervekeskin/Desktop/Knowledge_Graphs_Summer2024/country_files/UgandaConcepts.yml", line 146, column 23


  armed_groups_df = armed_groups_df.applymap(lambda x: [""] if x == [] else x)


In [9]:
# Columns of the armed-group table whose missing-value rates are reported
columns_to_check = [
    "String Tokens", "Wikidata Qnode", "Associated Ethnic Group",
    "Years Active as Armed Group", "Ideological Orientation", "Regions Most Active",
]

# Zero-initialised per-column tallies: missing values, explicit [""]
# placeholders, and the two combined
empty_counts = dict.fromkeys(columns_to_check, 0)
explicit_empty_counts = dict.fromkeys(columns_to_check, 0)
total_counts = dict.fromkeys(columns_to_check, 0)
total_rows = len(armed_groups_df)

def is_explicit_empty(value):
    """Return True when ``value`` is a list whose only element equals ""."""
    if not isinstance(value, list) or len(value) != 1:
        return False
    return value[0] == ""

def is_empty(value):
    """Return True for a zero-length list; for any non-list, defer to ``pd.isna``."""
    return len(value) == 0 if isinstance(value, list) else pd.isna(value)

# Tally missing values and [""] placeholders per column
for column in columns_to_check:
    for value in armed_groups_df[column]:
        # Evaluate each predicate once per value and reuse the result.
        missing = is_empty(value)
        placeholder = is_explicit_empty(value)
        if missing:
            empty_counts[column] += 1
        if placeholder:
            explicit_empty_counts[column] += 1
        if missing or placeholder:
            total_counts[column] += 1

# percentages; an empty table yields NaN instead of raising ZeroDivisionError
_denominator = total_rows if total_rows else float("nan")
empty_percentages = {column: (count / _denominator) * 100 for column, count in empty_counts.items()}
explicit_empty_percentages = {column: (count / _denominator) * 100 for column, count in explicit_empty_counts.items()}
total_percentages = {column: (count / _denominator) * 100 for column, count in total_counts.items()}

# One report section per tally, printed in a fixed order
for _heading, _percentages in (
    ("Percentage of empty values for each column:", empty_percentages),
    ("\nPercentage of [''] values for each column:", explicit_empty_percentages),
    ("\nTotal percentage of empty and [''] values for each column:", total_percentages),
):
    print(_heading)
    for column, percentage in _percentages.items():
        print(f"{column}: {percentage:.2f}%")

Percentage of empty values for each column:
String Tokens: 0.00%
Wikidata Qnode: 0.36%
Associated Ethnic Group: 20.71%
Years Active as Armed Group: 71.43%
Ideological Orientation: 39.29%
Regions Most Active: 63.93%

Percentage of [''] values for each column:
String Tokens: 0.00%
Wikidata Qnode: 0.36%
Associated Ethnic Group: 43.93%
Years Active as Armed Group: 1.43%
Ideological Orientation: 0.00%
Regions Most Active: 9.64%

Total percentage of empty and [''] values for each column:
String Tokens: 0.00%
Wikidata Qnode: 0.71%
Associated Ethnic Group: 64.64%
Years Active as Armed Group: 72.86%
Ideological Orientation: 39.29%
Regions Most Active: 73.57%
