In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import ttest_ind

import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown 

from deep_translator import GoogleTranslator

Verb database: LISADA (Sdroc et al., 2025)

In [33]:
endeuhun_df = pd.read_csv('C:/Users/baran/Desktop/Introduction to biological data science/endeuhun 5.csv', sep=';')   

These are manually collected inanimate nouns, they work for the lower threshold mutuality scores

In [34]:
frequencies_df = pd.read_csv('C:/Users/baran/Desktop/Introduction to biological data science/frequencies.csv', sep=';')
inanimate_av_df = pd.read_csv('C:/Users/baran/Desktop/Introduction to biological data science/inanimate_arousal_valence.csv', sep=';')  

In [35]:
inanimate_av_df.rename(columns={'Word': 'en_inanimate'}, inplace=True) 
inanimate_df = pd.merge(frequencies_df, inanimate_av_df, on='en_inanimate', how='inner')  

As we have relative frequency, we need to transform them

In [36]:
inanimate_df['en_rel_freq'] = np.log10(inanimate_df['en_rel_freq'] + 1)
inanimate_df['de_rel_freq'] = np.log10(inanimate_df['de_rel_freq'] + 1)
inanimate_df['hu_rel_freq'] = np.log10(inanimate_df['hu_rel_freq'] + 1)

In [37]:
inanimate_df.rename(columns={
    'en_rel_freq': 'en_log_freq',
    'de_rel_freq': 'de_log_freq',
    'hu_rel_freq': 'hu_log_freq'
}, inplace=True)

In [38]:
inanimate_df['Valence'] = pd.to_numeric(inanimate_df['Valence'], errors='coerce')
inanimate_df['Arousal'] = pd.to_numeric(inanimate_df['Arousal'], errors='coerce')
endeuhun_df['A.Mean.Sum'] = pd.to_numeric(endeuhun_df['A.Mean.Sum'], errors='coerce')
endeuhun_df['V.Mean.Sum'] = pd.to_numeric(endeuhun_df['V.Mean.Sum'], errors='coerce')

In [39]:
inanimate_df.dropna(subset=['Valence', 'Arousal', 'en_log_freq', 'de_log_freq', 'hu_log_freq'], inplace=True)
endeuhun_df.dropna(subset=['A.Mean.Sum', 'V.Mean.Sum', 'en_log_freq', 'de_log_freq', 'hu_log_freq'], inplace=True)

The ultimately goal would be to increase this mutuality to 0.75. With 0.50, it works easily

In [40]:
social_verbs_pool = endeuhun_df[
    (endeuhun_df['EnglishMutuality'] > 0.5) & (endeuhun_df['EnglishJointness'] > 0.5)
    & (endeuhun_df['GermanMutuality'] > 0.5) & (endeuhun_df['GermanJointness'] > 0.5)
    & (endeuhun_df['HungarianMutuality'] > 0.5) & (endeuhun_df['HungarianJointness'] > 0.5)].copy()

In [41]:
print(social_verbs_pool['en_log_freq'].head()) 
# If you see numbers > 10, they are probably not logged

1    1.857847
4    1.099047
5    0.165873
8    0.153681
9    1.113157
Name: en_log_freq, dtype: float64


We only care about raising the social verb's mutuality scores

In [42]:
nonsocial_verbs_pool = endeuhun_df[
    (endeuhun_df['EnglishMutuality'] < -0.5) & (endeuhun_df['EnglishJointness'] < -0.5)
    & (endeuhun_df['GermanMutuality'] < -0.5) & (endeuhun_df['GermanJointness'] < -0.5)
    & (endeuhun_df['HungarianMutuality'] < -0.5) & (endeuhun_df['HungarianJointness'] < -0.5)].copy()

In [43]:
social_verbs_pool.rename(columns={'V.Mean.Sum': 'Valence', 'A.Mean.Sum': 'Arousal'}, inplace=True)
nonsocial_verbs_pool.rename(columns={'V.Mean.Sum': 'Valence', 'A.Mean.Sum': 'Arousal'}, inplace=True)

Unfortunately we cannot increase the verbs' pools, as the aim is to work with the existing dataset

In [44]:
print(f"All possible candidates: Social={len(social_verbs_pool)}, Non-social={len(nonsocial_verbs_pool)}, Inanimate={len(inanimate_df)}")

All possible candidates: Social=26, Non-social=36, Inanimate=45


These are the most problematic values

In [45]:
metrics = ['Arousal', 'Valence', 'en_log_freq']
print(f"{'Metric':<15} | {'Social Mean':<12} | {'Inanimate Mean':<15} | {'Difference'}")
print("-" * 60)

for m in metrics:
    soc_m = social_verbs_pool[m].mean()
    inan_m = inanimate_df[m].mean()
    print(f"{m:<15} | {soc_m:<12.3f} | {inan_m:<15.3f} | {abs(soc_m - inan_m):.3f}")

Metric          | Social Mean  | Inanimate Mean  | Difference
------------------------------------------------------------
Arousal         | 4.806        | 3.573           | 1.233
Valence         | 5.604        | 5.904           | 0.300
en_log_freq     | 1.016        | 1.613           | 0.597


In [46]:
print(inanimate_df[['Arousal', 'Valence', 'en_log_freq']].mean())

Arousal        3.573333
Valence        5.903778
en_log_freq    1.613138
dtype: float64


In [47]:
print(f"Unique words number: {inanimate_df['en_inanimate'].nunique()}")

Unique words number: 45


So, this is what works with the 0.5 mutuality. It is problematic with 0.75
We shouldn't decrease the size of the list either. 

In [30]:
def find_balanced_lists(social_pool, nonsocial_pool, inanimate_pool, list_size=15, max_iterations=20000):
    
    print(f"\nAttempting to find a balanced set of {list_size} verbs per list. This may take a moment as my computer is slow...")
    for i in range(max_iterations):
        
        social_sample = social_pool.sample(n=list_size, replace=False) # not the same twice
        nonsocial_sample = nonsocial_pool.sample(n=list_size, replace=False)
        inanimate_sample = inanimate_pool.sample(n=list_size, replace=False)

        # Store all p-values here
        p_values = {}

        # 1. Arousal and valence balancing (between the 3 categories)
        p_values['Arousal_Soc_vs_NonSoc'] = ttest_ind(social_sample['Arousal'], nonsocial_sample['Arousal']).pvalue
        p_values['Arousal_Soc_vs_Inan'] = ttest_ind(social_sample['Arousal'], inanimate_sample['Arousal']).pvalue
        p_values['Arousal_NonSoc_vs_Inan'] = ttest_ind(nonsocial_sample['Arousal'], inanimate_sample['Arousal']).pvalue
        p_values['Valence_Soc_vs_NonSoc'] = ttest_ind(social_sample['Valence'], nonsocial_sample['Valence']).pvalue
        p_values['Valence_Soc_vs_Inan'] = ttest_ind(social_sample['Valence'], inanimate_sample['Valence']).pvalue
        p_values['Valence_NonSoc_vs_Inan'] = ttest_ind(nonsocial_sample['Valence'], inanimate_sample['Valence']).pvalue

        # 2. Frequency balancing (WITHIN each language)
        for lang in ['en', 'de', 'hu']:
            freq_col = f'{lang}_log_freq'
            p_values[f'Freq_{lang}_Soc_vs_NonSoc'] = ttest_ind(social_sample[freq_col], nonsocial_sample[freq_col]).pvalue
            p_values[f'Freq_{lang}_Soc_vs_Inan'] = ttest_ind(social_sample[freq_col], inanimate_sample[freq_col]).pvalue
            p_values[f'Freq_{lang}_NonSoc_vs_Inan'] = ttest_ind(nonsocial_sample[freq_col], inanimate_sample[freq_col]).pvalue

        # Check if all p-values are non-significant (e.g., > 0.05)
        if all(p > 0.05 for p in p_values.values()):
            print(f"\nSUCCESS YEY! Found a fully balanced set of lists after {i+1} iterations.")
            return social_sample, nonsocial_sample, inanimate_sample, p_values

        # Print progress for fun
        if (i + 1) % 2000 == 0:
            print(f"  ...checked {i+1} combinations, still searching...")

    print(f"\nFAILED to find a fully balanced set after {max_iterations} iterations.")
    print("Consider trying again or cry.")
    return None, None, None, None


social_list, nonsocial_list, inanimate_list, final_p_values = find_balanced_lists(
    social_verbs_pool, nonsocial_verbs_pool, inanimate_df, list_size=15
)

if final_p_values:
     
    print("\n" + "="*70)
    print("        FINAL BALANCED VERB LISTS")
    print("="*70)

    # English lists
    print("\n--- ENGLISH ---")
    print("\nSocial (EN):")
    print(social_list[['English', 'Arousal', 'Valence', 'en_log_freq']])
    print("\nNon-Social (EN):")
    print(nonsocial_list[['English', 'Arousal', 'Valence', 'en_log_freq']])
    print("\nInanimate (EN):")
    print(inanimate_list[['en_inanimate', 'Arousal', 'Valence', 'en_log_freq']].rename(columns={'en_inanimate': 'English'}))

    # German lists
    print("\n\n--- GERMAN ---")
    print("\nSocial (DE):")
    print(social_list[['German', 'Arousal', 'Valence', 'de_log_freq']])
    print("\nNon-Social (DE):")
    print(nonsocial_list[['German', 'Arousal', 'Valence', 'de_log_freq']])
    print("\nInanimate (DE):")
    print(inanimate_list[['de_inanimate', 'Arousal', 'Valence', 'de_log_freq']].rename(columns={'de_inanimate': 'German'}))

    # Hungarian lists
    print("\n\n--- HUNGARIAN ---")
    print("\nSocial (HU):")
    print(social_list[['Hungarian', 'Arousal', 'Valence', 'hu_log_freq']])
    print("\nNon-Social (HU):")
    print(nonsocial_list[['Hungarian', 'Arousal', 'Valence', 'hu_log_freq']])
    print("\nInanimate (HU):")
    print(inanimate_list[['hu_inanimate', 'Arousal', 'Valence', 'hu_log_freq']].rename(columns={'hu_inanimate': 'Hungarian'}))

    # Display p-values 
    print("\n" + "="*70)
    print("        T-TEST P-VALUES FOR BALANCING")
    print("="*50)
    print("\n--- Arousal and Valence ---")
    for key, val in final_p_values.items():
        if 'Arousal' in key or 'Valence' in key:
            print(f"{key:<25}: {val:.4f}")

    print("\n--- Within-Language Frequencies ---")
    for key, val in final_p_values.items():
        if 'Freq_en' in key or 'Freq_de' in key or 'Freq_hu' in key:
            print(f"{key:<25}: {val:.4f}")


Attempting to find a balanced set of 15 verbs per list. This may take a moment as my computer is slow...
  ...checked 2000 combinations, still searching...
  ...checked 4000 combinations, still searching...
  ...checked 6000 combinations, still searching...
  ...checked 8000 combinations, still searching...
  ...checked 10000 combinations, still searching...
  ...checked 12000 combinations, still searching...
  ...checked 14000 combinations, still searching...
  ...checked 16000 combinations, still searching...
  ...checked 18000 combinations, still searching...
  ...checked 20000 combinations, still searching...

FAILED to find a fully balanced set after 20000 iterations.
Consider trying again, reducing the list_size, or relaxing the p-value threshold or cry.


For export, if successful

In [83]:
if final_p_values:
    print("\nPreparing dataframes for export...")

    social_df = social_list.copy()
    nonsocial_df = nonsocial_list.copy()
    inanimate_df = inanimate_list.copy()

    social_df["verb_cat"] = "Social"
    nonsocial_df["verb_cat"] = "Nonsocial"
    inanimate_df["verb_cat"] = "Inanimate"

    inanimate_df.rename(columns={
        'en_inanimate': 'English',
        'de_inanimate': 'German',
        'hu_inanimate': 'Hungarian'
    }, inplace=True)

    combined_df = pd.concat([social_df, nonsocial_df, inanimate_df], ignore_index=True)

    final_columns_order = [
        'verb_cat',
        'English', 'German', 'Hungarian',
        'Arousal', 'Valence',
        'en_log_freq', 'de_log_freq', 'hu_log_freq',
        'EnglishMutuality', 'EnglishJointness',
        'GermanMutuality', 'GermanJointness',
        'HungarianMutuality', 'HungarianJointness'
    ]
    

    final_columns_order_existing = [col for col in final_columns_order if col in combined_df.columns]
    final_export_df = combined_df[final_columns_order_existing]
    
    p_values_df = pd.DataFrame(final_p_values.items(), columns=['Comparison', 'P_Value'])

    output_filename = "balanced_stimuli_lists.xlsx"
    with pd.ExcelWriter(output_filename) as writer:
        final_export_df.to_excel(writer, sheet_name='Verb_Lists', index=False)
        p_values_df.to_excel(writer, sheet_name='Balancing_P_Values', index=False)

    print(f"\nSUCCESS: All data has been exported to '{output_filename}'")
    print("The file contains two sheets: 'Verb_Lists' and 'Balancing_P_Values'.")

else:
    print("\nData export skipped because no balanced lists were found.")


Preparing dataframes for export...

SUCCESS: All data has been exported to 'balanced_stimuli_lists.xlsx'
The file contains two sheets: 'Verb_Lists' and 'Balancing_P_Values'.


In [49]:
def debug_p_values(social_pool, nonsocial_pool, inanimate_pool, iterations=500):
    fail_counts = {}
    
    for _ in range(iterations):
        soc = social_pool.sample(15)
        non = nonsocial_pool.sample(15)
        inan = inanimate_pool.sample(15)
        
        for col in ['Arousal', 'Valence', 'en_log_freq', 'de_log_freq', 'hu_log_freq']:
            p_sn = ttest_ind(soc[col if col in soc else 'Arousal'], non[col if col in non else 'Arousal']).pvalue
            p_si = ttest_ind(soc[col if col in soc else 'Arousal'], inan[col if col in inan else 'en_log_freq']).pvalue
            
            if p_sn < 0.05: fail_counts[f"{col}_Soc_vs_Non"] = fail_counts.get(f"{col}_Soc_vs_Non", 0) + 1
            if p_si < 0.05: fail_counts[f"{col}_Soc_vs_Inan"] = fail_counts.get(f"{col}_Soc_vs_Inan", 0) + 1

    print("--- Problems in the columns (the % for p > .5 problems) ---")
    for k, v in sorted(fail_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"{k}: {(v/iterations)*100:.1f}%")

debug_p_values(social_verbs_pool, nonsocial_verbs_pool, inanimate_df)

--- Problems in the columns (the % for p > .5 problems) ---
hu_log_freq_Soc_vs_Inan: 100.0%
Arousal_Soc_vs_Inan: 98.4%
hu_log_freq_Soc_vs_Non: 96.4%
en_log_freq_Soc_vs_Inan: 79.2%
de_log_freq_Soc_vs_Inan: 79.2%
Arousal_Soc_vs_Non: 67.8%
de_log_freq_Soc_vs_Non: 58.0%
en_log_freq_Soc_vs_Non: 31.8%
Valence_Soc_vs_Non: 2.6%
Valence_Soc_vs_Inan: 1.2%


This just shows that this is impossible with these data only
It is either adding manually "okay" data for the manual data, or...

Trying to increase the inanimate nouns pool

In [50]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\baran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [51]:
pip install deep-translator




In [52]:
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\baran\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [53]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\baran\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [54]:
# Create the frequency data 
brown_news_tagged = brown.tagged_words(tagset='universal')
noun_freq = nltk.FreqDist(word.lower() for word, tag in brown_news_tagged if tag == 'NOUN')

def get_large_inanimate_pool(min_freq=5):
    # SAFE CATEGORIES
    safe_roots = [
        'artifact.n.01', 'tool.n.01', 'furniture.n.01', 'vehicle.n.01', 
        'container.n.01', 'garment.n.01', 'appliance.n.01', 'instrument.n.01'
    ]
    
    manual_blacklist = {
        'man', 'way', 'work', 'number', 'course', 'system', 'point', 'end', 
        'john', 'case', 'thing', 'area', 'form', 'god', 'service', 'line', 
        'action', 'body', 'hand', 'head', 'face', 'side', 'back', 'part', 'ways', 'rooms'
    }

    valid_nouns = set()
    for root_name in safe_roots:
        root = wn.synset(root_name)
        for synset in root.closure(lambda s: s.hyponyms()):
            for lemma in synset.lemmas():
                word = lemma.name().lower()
                
                # We use the global noun_freq variable here
                if (word.isalpha() and 3 <= len(word) <= 12 and 
                    word not in manual_blacklist and
                    word in noun_freq and 
                    noun_freq[word] >= min_freq):
                    valid_nouns.add(word)

    result = sorted(list(valid_nouns), key=lambda x: noun_freq[x], reverse=True)
    return result

# Run the function
noun_list = get_large_inanimate_pool(min_freq=5)

print(f"Length of the list: {len(noun_list)}")
print(noun_list[:100])

Length of the list: 1300
['house', 'home', 'school', 'water', 'room', 'church', 'door', 'car', 'field', 'college', 'light', 'office', 'street', 'job', 'board', 'court', 'university', 'center', 'means', 'study', 'art', 'type', 'book', 'road', 'table', 'level', 'control', 'surface', 'secretary', 'figure', 'space', 'union', 'fire', 'plan', 'stage', 'material', 'ground', 'view', 'equipment', 'defense', 'change', 'picture', 'floor', 'wall', 'paper', 'market', 'story', 'hall', 'earth', 'hair', 'production', 'stock', 'front', 'club', 'range', 'letter', 'bill', 'rest', 'future', 'issue', 'volume', 'series', 'top', 'movement', 'bed', 'piece', 'support', 'record', 'hotel', 'eye', 'game', 'spring', 'plant', 'arms', 'window', 'image', 'radio', 'steps', 'gun', 'horse', 'corner', 'plane', 'pattern', 'staff', 'ball', 'design', 'hospital', 'step', 'approach', 'pool', 'charge', 'square', 'scene', 'building', 'station', 'return', 'mouth', 'machine', 'press', 'race']


My computer cannot really do more

In [55]:
final_noun_pool = noun_list[:300]

Now, let's translate them

In [56]:
# translation
translator_hu = GoogleTranslator(source = 'en', target = 'hu')
translator_de = GoogleTranslator(source= 'en', target = 'de')

In [57]:
translated_nouns = []
total = len(final_noun_pool[:300]) 

print(f"Translation started: 0/{total}")

for i, word in enumerate(final_noun_pool[:300]):
    try:
        hu = translator_hu.translate(word).lower()
        de = translator_de.translate(word).lower()
        translated_nouns.append({'English': word, 'Hungarian': hu, 'German': de})
        
        if (i + 1) % 10 == 0:
            print(f"Progress: {i + 1}/{total} word done...")
            
    except Exception as e:
        print(f"Problem with'{word}': {e}")
        continue

print("Translation done")

Translation started: 0/300
Progress: 10/300 word done...
Progress: 20/300 word done...
Progress: 30/300 word done...
Progress: 40/300 word done...
Progress: 50/300 word done...
Progress: 60/300 word done...
Progress: 70/300 word done...
Progress: 80/300 word done...
Problem with'pattern': HTTPSConnectionPool(host='translate.google.com', port=443): Max retries exceeded with url: /m?tl=de&sl=en&q=pattern (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x00000164516CF250>, 'Connection to translate.google.com timed out. (connect timeout=None)'))
Progress: 90/300 word done...
Progress: 100/300 word done...
Progress: 110/300 word done...
Progress: 120/300 word done...
Progress: 130/300 word done...
Progress: 140/300 word done...
Progress: 150/300 word done...
Progress: 160/300 word done...
Progress: 170/300 word done...
Progress: 180/300 word done...
Progress: 190/300 word done...
Progress: 200/300 word done...
Progress: 210/300 word done...
Progress: 220/300 wor

In [58]:
translated_nouns

[{'English': 'house', 'Hungarian': 'ház', 'German': 'haus'},
 {'English': 'home', 'Hungarian': 'otthon', 'German': 'heim'},
 {'English': 'school', 'Hungarian': 'iskola', 'German': 'schule'},
 {'English': 'water', 'Hungarian': 'víz', 'German': 'wasser'},
 {'English': 'room', 'Hungarian': 'szoba', 'German': 'zimmer'},
 {'English': 'church', 'Hungarian': 'templom', 'German': 'kirche'},
 {'English': 'door', 'Hungarian': 'ajtó', 'German': 'tür'},
 {'English': 'car', 'Hungarian': 'autó', 'German': 'auto'},
 {'English': 'field', 'Hungarian': 'mező', 'German': 'feld'},
 {'English': 'college', 'Hungarian': 'főiskola', 'German': 'hochschule'},
 {'English': 'light', 'Hungarian': 'fény', 'German': 'licht'},
 {'English': 'office', 'Hungarian': 'hivatal', 'German': 'büro'},
 {'English': 'street', 'Hungarian': 'utca', 'German': 'straße'},
 {'English': 'job', 'Hungarian': 'munka', 'German': 'arbeit'},
 {'English': 'board', 'Hungarian': 'bizottság', 'German': 'planke'},
 {'English': 'court', 'Hungarian

In [59]:
df_inanimate_nouns = pd.DataFrame(translated_nouns)
df_inanimate_nouns.to_csv('translated_inanimate_nouns.csv', index = False)

print(f"Yey! {len(translated_nouns)} inanimate nouns saved.")

Yey! 299 inanimate nouns saved.


Of course, I needed to manually extract the valence and arousal ratings from EnglishLexicon (https://elexicon.wustl.edu/ -> it is a safe website, eventhough it says the opposite). We use the same ratings for all three languages. For frequencies, I used SketchEngine (enTenTen21, huTenTen23, deTenTen23 - not the up-to-date version, but the obsolete to match the LISADA database). 

In [62]:
path = r'C:/Users/baran/Desktop/Introduction to biological data science/nltk_inanimate_arousal_valence.xlsx'
df_v_a = pd.read_excel(path)

In [63]:
df_freq_hu = pd.read_csv('C:/Users/baran/Desktop/Introduction to biological data science/nltk_inanimate_nouns_frequency_hu_se.csv', sep=';')
df_freq_en = pd.read_csv('C:/Users/baran/Desktop/Introduction to biological data science/nltk_inanimate_nouns_frequency_en_se.csv', sep=';')
df_freq_de = pd.read_csv('C:/Users/baran/Desktop/Introduction to biological data science/nltk_inanimate_nouns_frequency_de_se.csv', sep=';')

Big merge for inanimate nouns

In [64]:
# base df
if isinstance(translated_nouns, list):
    df_final = pd.DataFrame(translated_nouns)
else:
    df_final = translated_nouns.copy()

# columns standardization
df_v_a.columns = ['Word', 'Valence', 'Arousal']
# freqs 
df_freq_en.columns = ['Word', 'en_rel_freq']
df_freq_hu.columns = ['Word', 'hu_rel_freq']
df_freq_de.columns = ['Word', 'de_rel_freq']

# low, clear
for df in [df_v_a, df_freq_en, df_freq_hu, df_freq_de]:
    df['Word'] = df['Word'].astype(str).str.lower().str.strip()

for col in ['English', 'Hungarian', 'German']:
    df_final[col] = df_final[col].astype(str).str.lower().str.strip()

# Merge
# Arousal és valence based on EN
df_final = pd.merge(df_final, df_v_a, left_on='English', right_on='Word', how='left').drop(columns=['Word'])

# EN freq based on EN 
df_final = pd.merge(df_final, df_freq_en, left_on='English', right_on='Word', how='left').drop(columns=['Word'])

# HU freq
df_final = pd.merge(df_final, df_freq_hu, left_on='Hungarian', right_on='Word', how='left').drop(columns=['Word'])

# DE freq
df_final = pd.merge(df_final, df_freq_de, left_on='German', right_on='Word', how='left').drop(columns=['Word'])

# Dropping
df_final = df_final.dropna()

print("Nice merge! Columns: ", df_final.columns.tolist())
print(df_final.head())

Nice merge! Columns:  ['English', 'Hungarian', 'German', 'Valence', 'Arousal', 'en_rel_freq', 'hu_rel_freq', 'de_rel_freq']
  English Hungarian  German Valence Arousal  en_rel_freq  hu_rel_freq  \
0   house       ház    haus    7.19    3.95    330.28630    320.02261   
1    home    otthon    heim    7.48    3.78    492.10373    136.20335   
2  school    iskola  schule    5.41    4.57    517.43270    270.05781   
3   water       víz  wasser       7    3.71    371.39904    338.04666   
4    room     szoba  zimmer    5.55     3.1    247.01542    130.39167   

   de_rel_freq  
0    313.64839  
1     10.94912  
2    177.14626  
3    198.99282  
4     64.48913  


In [65]:
# log transform
df_final['en_log_freq'] = np.log10(df_final['en_rel_freq'] + 1)
df_final['hu_log_freq'] = np.log10(df_final['hu_rel_freq'] + 1)
df_final['de_log_freq'] = np.log10(df_final['de_rel_freq'] + 1)

# check
print(df_final[['en_log_freq', 'hu_log_freq', 'de_log_freq']].head())

   en_log_freq  hu_log_freq  de_log_freq
0     2.520203     2.506536     2.497826
1     2.692938     2.137365     1.077336
2     2.714692     2.433062     2.250777
3     2.571009     2.530259     2.301014
4     2.394479     2.118568     1.816169


It is a foreshadow here, that the words have a very high frequency compared to what we have for the social verbs. 

In [75]:
df_final['en_log_freq'].mean()

np.float64(1.9239021009349186)

In [74]:
endeuhun_df['en_log_freq'].mean()

np.float64(1.2421537781725662)

In [76]:
output_path = r'C:\Users\baran\Desktop\Introduction to biological data science\df_inanimate_merged_final.csv'
df_final.to_csv(output_path, index=False, encoding='utf-8-sig')
print(f"Succesfully saved here: {output_path}")

Succesfully saved here: C:\Users\baran\Desktop\Introduction to biological data science\df_inanimate_merged_final.csv


In [77]:
print(f"Pool sizes: Social={len(social_verbs_pool)}, Non-social={len(nonsocial_verbs_pool)}, Inanimate={len(df_final)}")

Pool sizes: Social=26, Non-social=36, Inanimate=263


In [78]:
def clean_types(df):
    cols = ['Arousal', 'Valence', 'en_log_freq', 'hu_log_freq', 'de_log_freq']
    for col in cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df.dropna(subset=cols)

social_verbs_pool = clean_types(social_verbs_pool)
nonsocial_verbs_pool = clean_types(nonsocial_verbs_pool)
df_final = clean_types(df_final)

Filtering for frequencies and arousal (my current idea is to merge these nouns with my manual data and see if that changes anything. Maybe the bigger pool would do better)

In [80]:
# verb pools log_freq average < 1.5, taking out the top of the df_final
df_final_filtered = df_final[
    (df_final['en_log_freq'] < 1.9) & 
    (df_final['hu_log_freq'] < 1.9)
]

# social arousal average 4.8, inanimate only 3.8, taking out the very low arousal nouns
df_final_filtered = df_final_filtered[df_final_filtered['Arousal'] > 3.0]

print(f"New inanimate nouns' number: {len(df_final_filtered)} word")

New inanimate nouns' number: 78 word


Kind of the same balancing algorithm, but with the nltk nouns, including some cost funcion. 

In [82]:
def find_balanced_lists(social_pool, nonsocial_pool, df_final_filtered, list_size=15, max_iterations=10000):
    
    best_cost = float('inf')
    best_samples = (None, None, None, None)

    print(f"\nSearching for balanced lists using df_final (max {max_iterations} iterations)...")
    
    for i in range(max_iterations):
        # sampling from pools
        social_sample = social_pool.sample(n=list_size)
        nonsocial_sample = nonsocial_pool.sample(n=list_size)
        inanimate_sample = df_final.sample(n=list_size)

        p_values = {}
        cost = 0

        # what to check columns 
        metrics = [('Arousal', None), ('Valence', None)] + [('log_freq', lang) for lang in ['en', 'de', 'hu']]
        
        for feat, lang in metrics:
            col = f"{lang}_{feat}" if lang else feat
            s1, s2, s3 = social_sample[col], nonsocial_sample[col], inanimate_sample[col]
            
            # p-values
            p_values[f'{col}_Soc_Non'] = ttest_ind(s1, s2).pvalue
            p_values[f'{col}_Soc_Inan'] = ttest_ind(s1, s3).pvalue
            p_values[f'{col}_Non_Inan'] = ttest_ind(s2, s3).pvalue
            
            # cost calculation: the sum of the abs diff bw averages 
            means = [s1.mean(), s2.mean(), s3.mean()]
            cost += abs(means[0]-means[1]) + abs(means[1]-means[2]) + abs(means[0]-means[2])

        # if every p > .5, we stop 
        if all(p > 0.05 for p in p_values.values()):
            print(f"PERFECT MATCH found at iteration {i+1}!")
            return social_sample, nonsocial_sample, inanimate_sample, p_values

        # best result save (cost function)
        if cost < best_cost:
            best_cost = cost
            best_samples = (social_sample, nonsocial_sample, inanimate_sample, p_values)

        if (i + 1) % 1000 == 0: 
            print(f"  ...{i+1} done (best cost so far: {best_cost:.4f})")

    print("\nNo perfect match. Returning the best possible balance found. Sadness.")
    return best_samples


social_list, nonsocial_list, inanimate_list, final_p_values = find_balanced_lists(
    social_verbs_pool, nonsocial_verbs_pool, df_final, list_size=15
)

# 2. Check
if social_list is not None:
    print("\n" + "="*70)
    print("        SUCCESS: THE BEST LIST")
    print("="*70)

    # Lists display
    for name, df_list in [("SOCIAL", social_list), ("NON-SOCIAL", nonsocial_list), ("INANIMATE", inanimate_list)]:
        print(f"\n--- {name} list ---")
        # check whether the cols exist 
        cols = ['English', 'German', 'Hungarian', 'Arousal', 'Valence', 'en_log_freq']
        available_cols = [c for c in cols if c in df_list.columns]
        print(df_list[available_cols])

    # p-values 
    print("\n" + "="*70)
    print("        SIGNIFICANCE (P-VALUES)")
    print("        (Goal: p > 0.05 everywhere)")
    print("="*70)
    for key, val in final_p_values.items():
        mark = "✅" if val > 0.05 else "❌"
        print(f"{mark} {key:<25}: {val:.4f}")
else:
    print("Problem!")


Searching for balanced lists using df_final (max 10000 iterations)...
  ...1000 done (best cost so far: 5.5900)
  ...2000 done (best cost so far: 5.5900)
  ...3000 done (best cost so far: 4.6093)
  ...4000 done (best cost so far: 4.6093)
  ...5000 done (best cost so far: 4.6093)
  ...6000 done (best cost so far: 4.6093)
  ...7000 done (best cost so far: 4.6093)
  ...8000 done (best cost so far: 4.6093)
  ...9000 done (best cost so far: 4.6093)
  ...10000 done (best cost so far: 4.6093)

No perfect match. Returning the best possible balance found. Sadness.

        SUCCESS: THE BEST LIST

--- SOCIAL list ---
           English            German      Hungarian  Arousal  Valence  \
51     to squabble            zoffen       balhézni     4.06     3.60   
11     to chitchat         quatschen       csevegni     3.88     6.00   
39         to meet           treffen     találkozni     3.71     6.09   
34       to gossip         tratschen     pletykálni     3.91     3.50   
15      to compete 

# References

Srdoc, T., Marx, E., Safrany, A. V., Balla, A., & Wittenberg, E. (2025). Linguistic Impact on Social Action Construal Database (LISADA). https://osf.io/xzdqc/

Sdroc, T., Marx, E., & Wittenberg, E. (2025). Event construal through social verbs in English and German: The LISADA corpus. Processings of the Cognitive Science Society Annual Meeting (Vol. 47). 