# Initial Executable Notebook for COG260 Project

Data source: https://osf.io/hjvm5/

Paper: https://psyarxiv.com/efs4p/

Import relevant Python libraries.

In [39]:
import numpy as np
import pandas as pd
# the library below is useful for estimating the progress of a for loop
# the use of tqdm will be displayed in Demo 2
from tqdm import tqdm
from itertools import product

# Turn off pandas' chained-assignment check (no SettingWithCopyWarning is raised or shown)
pd.set_option("mode.chained_assignment", None)

In [40]:
# Load the raw data, lower-case every column name, and discard the columns
# that are not used in this notebook.
df = (
    pd.read_csv("df_all_raw.csv")
    .rename(columns=str.lower)
    .drop(
        columns=[
            'dataset_id',
            'form_id',
            'form',
            'gloss_in_source',
            'iso639p3code',
            'mrc_word',
            'kucera_francis_frequency',
        ]
    )
)

  df = pd.read_csv("df_all_raw.csv")


# Data Fields

**clics_form**: the form of the word in the language of interest

**concepticon_id**: unique numerical identifier of underlying concept

**concepticon_gloss**: the concept underlying the word form

**ontological_category**: broad category that the concept falls into

**semantic_field**: a set of words related in meaning

**variety**: the language

**glottocode**: unique alphanumeric identifier for the language variety

**macroarea**: the part of the world the language is common in

**family**: the language family of the current variety

**latitude**: rough latitude where the language variety can be found

**longitude**: rough longitude where the language variety can be found

**age_of_acquisition**: the age at which a concept is typically learned

**concreteness**: a numerical rating of how abstract or concrete a concept is, rated from (100-700)

**familiarity**: a numerical rating of how familiar a concept is to the average person, rated from (100-700)

**imagability**: a numerical rating of how well an average person can mentally visualize a concept, rated from (100-700)

# Concept categorization

First, we will visualize some of the concepticons that we will categorize as either "abstract" or "concrete" in tables.

In [19]:
# Rows with a concreteness rating below 400 are treated as abstract.
abstract_cpts = df[df['concreteness'] < 400]
# Keep one row per distinct (id, gloss, rating) combination.
# The original line read from the undefined name `abstract_concepts`, which
# raises NameError on a fresh kernel; the frame built above is the intended source.
abstract_cpts = abstract_cpts[["concepticon_id", "concepticon_gloss", "concreteness"]].drop_duplicates()
abstract_cpts

Unnamed: 0,concepticon_id,concepticon_gloss,concreteness
30,1035,GOOD,297.0
570,98,ALL,267.0
651,244,LEFT,341.0
786,1203,LONG,381.0
905,1198,MANY,276.0
...,...,...,...
1303158,240,ELECTION,343.0
1304727,1478,WITHOUT,267.0
1307706,1557,BEGINNING,318.0
1355363,2033,DELIVER,393.0


In [22]:
# Count the distinct concept IDs in the abstract table (missing IDs are not counted)
num_abstract = abstract_cpts['concepticon_id'].nunique(dropna=True)
print(f"Number of unique abstract concepts: {num_abstract}")

Number of unique abstract concepts: 356


In [20]:
# Rows with a concreteness rating of 400 or more are treated as concrete;
# select the three descriptive columns and keep one row per distinct combination.
concrete_cpts = df.loc[
    df['concreteness'] >= 400,
    ["concepticon_id", "concepticon_gloss", "concreteness"],
].drop_duplicates()
concrete_cpts

Unnamed: 0,concepticon_id,concepticon_gloss,concreteness
0,1369,GOLD,576.0
60,1425,GREEN,460.0
122,1481,HAMMER,605.0
152,1277,HAND,604.0
182,1256,HEAD,603.0
...,...,...,...
1305965,287,CONE,573.0
1351537,1015,SOIL,581.0
1351716,637,MOSS,575.0
1352029,2290,PUDDLE,604.0


In [25]:
# Count the distinct concept IDs in the concrete table.
num_concrete = concrete_cpts['concepticon_id'].nunique()
# The label previously said "abstract" (copy-paste slip); this count is for concrete concepts.
print(f"Number of unique concrete concepts: {num_concrete}")

Number of unique abstract concepts: 846


To simplify later analysis, we will create a dictionary that maps each concepticon_id to a boolean indicating whether or not the concept is abstract.

In [38]:
# Flag each row whose concreteness rating is under 400.
# A missing (NaN) rating compares as False, so such rows are flagged as not abstract.
df["is_abstract"] = df["concreteness"].lt(400)
# Reduce to the distinct (concept id, flag) combinations ...
filtered_df = df.loc[:, ["concepticon_id", "is_abstract"]].drop_duplicates()
# ... and turn them into a {concepticon_id: is_abstract} lookup.
id_to_abstractness = (
    filtered_df
    .set_index("concepticon_id")["is_abstract"]
    .to_dict()
)

# Helper functions

For now, these just include the helper + main functions from the demo notebook, copied here for later use:

In [41]:
def per_lang_colexification(curr_df):
    """
    Calculate the colexification frequency of pairs of concepts present in the current language.

    Parameters
    ----------
    curr_df : pandas.DataFrame
        Frame with a 'concepticon_gloss' column; each entry is an iterable of
        the concepts expressed by one word form.

    Returns
    -------
    pandas.DataFrame
        One row per unordered pair of distinct concepts that share at least one
        word form, with columns 'concept_1', 'concept_2' (sorted within the
        pair) and 'colexification_count' (number of rows of ``curr_df`` in
        which both concepts occur). If no pair is found, an empty frame with
        these three columns is returned.
    """
    result_columns = ['concept_1', 'concept_2', 'colexification_count']
    pair_counts = {}
    # Each entry holds the concepts associated with one specific word
    for glosses in curr_df['concepticon_gloss']:
        # Sorting the de-duplicated concepts once means every (i < j) pair below
        # is already alphabetically ordered and never pairs a concept with itself
        concepts = sorted(set(glosses))
        for position, first in enumerate(concepts):
            for second in concepts[position + 1:]:
                pair = (first, second)
                pair_counts[pair] = pair_counts.get(pair, 0) + 1

    # Build the result in one step so that an empty tally still yields a
    # well-formed (empty) frame instead of failing on a missing count column
    per_lang = pd.DataFrame(
        [(first, second, count) for (first, second), count in pair_counts.items()],
        columns=result_columns,
    )
    # Keep the count column integer-typed even when there are no rows
    per_lang['colexification_count'] = per_lang['colexification_count'].astype('int64')
    return per_lang

In [None]:
def main(data=None):
    """
    Count, across all language varieties, how often each pair of concepts is colexified.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with 'variety', 'clics_form' and 'concepticon_gloss' columns.
        Defaults to the notebook-level ``df`` when omitted, so ``main()``
        behaves as before.

    Returns
    -------
    pandas.DataFrame
        Columns 'concept_1', 'concept_2' and 'colexification_count', with one
        row per concept pair and the counts summed over every variety. An empty
        frame with these columns is returned when no variety has any
        colexified concepts.
    """
    source = df if data is None else data
    all_dfs = []
    # A single groupby pass replaces re-filtering the whole frame once per variety
    for _, sub in tqdm(source.groupby('variety', sort=False)):
        # Collect the concepts attached to each word form in this variety
        forms = sub.groupby("clics_form")[['concepticon_gloss']].agg(list)
        # `forms` is a freshly built frame (not a slice), so this assignment is safe
        forms['concepticon_gloss'] = forms['concepticon_gloss'].apply(lambda glosses: sorted(set(glosses)))
        # A form is colexified when it expresses more than one distinct concept
        colex = forms[forms['concepticon_gloss'].apply(len) > 1]
        # We skip any language where no concepts are colexified
        if colex.shape[0] == 0:
            continue
        all_dfs.append(per_lang_colexification(colex))

    # pd.concat raises on an empty list, so handle the "nothing colexified" case explicitly
    if not all_dfs:
        return pd.DataFrame(columns=['concept_1', 'concept_2', 'colexification_count'])

    mega = pd.concat(all_dfs)
    colex_counts = mega.groupby(["concept_1", "concept_2"]).sum().reset_index()
    return colex_counts