In [1]:
import dhlab as dh
import pandas as pd

In [34]:
tot = dh.totals(200000)

In [40]:
paradigms = dh.WordParadigm([x for x in tot.index]).frame

In [36]:
forms = dh.WordForm([x for x in tot.index]).frame 

In [43]:
paradigms.columns=  ["word","pos", "paradigm"]

In [79]:
forms.columns = ["word", "form"]

In [63]:
paradigms[paradigms.word.isin(paradigms[paradigms.word=="fisker"].iloc[0].paradigm)]

Unnamed: 0,word,pos,paradigm
17201,fiska,adj,"[fiska, fiskede, fiskende, fisket, fiskete]"
17202,fiska,subs,"[fiska, fiske, fiskene, fisker, fisket]"
17203,fiska,verb,"[fisk, fiska, fiske, fisker, fiskes, fisket]"
17278,fisket,adj,"[fiska, fiskede, fiskende, fisket, fiskete]"
17279,fisket,subs,"[fiska, fiske, fiskene, fisker, fisket]"
17280,fisket,verb,"[fisk, fiska, fiske, fisker, fiskes, fisket]"


In [58]:
[x for x in paradigms[paradigms.word=="fisker"].iloc[2].paradigm if x in paradigms.index]

[]

In [68]:
paradigms[paradigms.word=="fisker"].iloc[2]

word                                              fisker
pos                                                 verb
paradigm    [fisk, fiska, fiske, fisker, fiskes, fisket]
Name: 17244, dtype: object

In [69]:
def analyze_paradigm_ambiguity(paradigms_df):
    # Group by word to find words with multiple paradigms
    ambiguous = paradigms_df.groupby('word').agg({
        'pos': lambda x: list(x),
        'paradigm': lambda x: list(x)
    }).reset_index()
    
    # Filter for words that appear in multiple paradigms
    ambiguous = ambiguous[ambiguous['pos'].str.len() > 1]
    return ambiguous

# Then for analyzing form distributions:
def analyze_form_distributions(paradigms_df):
    # Create distributions of forms within each POS
    pos_form_distributions = {}
    for pos in paradigms_df['pos'].unique():
        pos_paradigms = paradigms_df[paradigms_df['pos'] == pos]
        # Flatten all paradigms for this POS
        all_forms = [form for paradigm in pos_paradigms['paradigm'] for form in paradigm]
        # Calculate frequency distribution
        forms_freq = pd.Series(all_forms).value_counts(normalize=True)
        pos_form_distributions[pos] = forms_freq
    return pos_form_distributions

In [70]:
def estimate_paradigm_probability(word_form, form_distributions):
    probs = {}
    for pos, distribution in form_distributions.items():
        if word_form in distribution:
            probs[pos] = distribution[word_form]
    # Normalize probabilities
    total = sum(probs.values())
    return {pos: prob/total for pos, prob in probs.items()}

In [71]:
ambig = analyze_paradigm_ambiguity(paradigms)

In [72]:
ambig

Unnamed: 0,word,pos,paradigm
0,A,"[adv, subs]","[[A], [A, A'en, A-en, A-ene, A-er]]"
5,A/S,"[fork, henv]","[[A.S, A/S, AS, a.s., a/s, as], [A/S]]"
49,CD,"[fork, subs]","[[CD], [CD, CDen, CDene, CDer]]"
86,E,"[fork, subs]","[[E], [E, E'en, E-en, E-ene, E-er]]"
142,I,"[pron, subs, symb]","[[Eder, I, eder], [I, I'en, I-en, I-ene, I-er]..."
...,...,...,...
60152,øyer,"[adj, subs, verb]","[[øyd, øyde, øyende], [øy, øya, øyen, øyene, ø..."
60159,øyne,"[adj, subs, verb]","[[øyna, øynede, øynende, øynet, øynete], [øya,..."
60161,øyner,"[adj, verb]","[[øyna, øynede, øynende, øynet, øynete], [øyn,..."
60162,øynes,"[adj, verb]","[[øyna, øynede, øynende, øynet, øynete], [øyn,..."


In [80]:
# First, get clean forms for all words in paradigms
paradigm_words = set([word for paradigm_list in paradigms['paradigm'] for word in paradigm_list])
word_forms = dh.WordForm(list(paradigm_words)).frame


In [118]:
def clean_word_forms(word_forms_df):
    def extract_basic_form(form_str):
        parts = form_str.split()
        base_pos = parts[0]  # First part is usually the POS
        
        # Handle adverbs - any form string starting with 'adv' should be marked as adverb
        if base_pos == 'adv':
            return 'adverb'
            
        elif base_pos == 'verb':
            if 'inf' in parts:
                return 'infinitive'
            elif 'perf-part' in parts:
                return 'perfect-participle'
            elif 'pres-part' in parts:
                return 'present-participle'
            elif 'pret' in parts:
                return 'preterite'
            elif 'pres' in parts:
                return 'present'
            elif 'imp' in parts:
                return 'imperative'
            return 'unknown-verb'
                
        elif base_pos == 'subst':
            form = []
            if 'ent' in parts:
                form.append('singular')
            elif 'fl' in parts:
                form.append('plural')
            if 'be' in parts:
                form.append('definite')
            elif 'ub' in parts:
                form.append('indefinite')
            return '-'.join(form) if form else 'unknown-noun'
            
        elif base_pos == 'adj':
            if '<perf-part>' in form_str:
                return 'participle-adj'
            elif 'pos' in parts:
                return 'positive'
            elif 'komp' in parts:
                return 'comparative'
            elif 'sup' in parts:
                return 'superlative'
            return 'unknown-adj'
            
        return f'unknown-{base_pos}'

    word_forms_df['clean_form'] = word_forms_df['form'].apply(extract_basic_form)
    return word_forms_df

In [105]:
# Then analyze distributions
def analyze_paradigm_distributions(paradigms_df, word_forms_df):
    # Group word forms by their base form to get form types
    form_types = {}
    for _, row in word_forms_df.iterrows():
        form_types[row['word']] = row['clean_form']  # Assuming 'form' is the column with form type
    
    # Analyze distribution within each POS type
    pos_distributions = {}
    for pos in paradigms_df['pos'].unique():
        pos_forms = []
        # Get all forms from paradigms of this POS
        pos_paradigms = paradigms_df[paradigms_df['pos'] == pos]
        for paradigm in pos_paradigms['paradigm']:
            forms = [form_types.get(word, 'unknown') for word in paradigm]
            pos_forms.extend(forms)
        
        # Calculate distribution
        distribution = pd.Series(pos_forms).value_counts(normalize=True)
        pos_distributions[pos] = distribution
    
    return pos_distributions

In [85]:
word_forms.columns = ["word","form"]

In [123]:
word_forms[word_forms.form.str.startswith("subst")]

Unnamed: 0,word,form,clean_form
0,lønnen,subst mask appell ent be normert,singular-definite
4,karrierene,subst fem appell fl be unormert,plural-definite
5,karrierene,subst mask appell fl be normert,plural-definite
7,verdensrya,subst nøyt appell fl be normert,plural-definite
8,overdekkene,subst nøyt appell fl be normert,plural-definite
...,...,...,...
247057,farmødrene,subst fem appell fl be normert,plural-definite
247058,farmødrene,subst mask appell fl be normert,plural-definite
247059,bybefolkninger,subst fem appell fl ub normert,plural-indefinite
247060,bybefolkninger,subst mask appell fl ub normert,plural-indefinite


In [124]:
tot

Unnamed: 0,freq
.,7655423257
",",5052171514
i,2531262027
og,2520268056
-,1314451583
...,...
enterprises,17126
Isaiah,17126
velprøvd,17125
udløb,17125


In [119]:
# Example usage:
cleaned_forms = clean_word_forms(word_forms)

In [120]:
pos = analyze_paradigm_distributions(paradigms, cleaned_forms)

In [121]:
pos

{'adv': adverb                 0.750594
 imperative             0.097387
 singular-indefinite    0.057007
 plural-indefinite      0.042755
 preterite              0.009501
 perfect-participle     0.007126
 unknown-sbu            0.004751
 positive               0.004751
 unknown-noun           0.004751
 singular-definite      0.004751
 unknown-interj         0.004751
 unknown-symb           0.002375
 unknown-prep           0.002375
 unknown-det            0.002375
 infinitive             0.002375
 unknown-konj           0.002375
 Name: proportion, dtype: float64,
 'subs': plural-definite        0.271342
 singular-definite      0.268347
 plural-indefinite      0.228297
 singular-indefinite    0.161466
 imperative             0.027228
 preterite              0.020067
 present                0.012794
 positive               0.004110
 infinitive             0.002312
 unknown-noun           0.002245
 perfect-participle     0.000638
 participle-adj         0.000252
 adverb                 0.

In [97]:
word_forms[word_forms['form'].str.contains('adj')].iloc[4]

word                                             tvinnede
form    adj <perf-part> fl <trans1> <refl5> <trans9> n...
Name: 10, dtype: object

In [103]:
verb_dist = pos['verb']
print("Verb form distributions:")
print(f"Past tense: {verb_dist.get('past', 0)*100:.1f}%")
print(f"Present: {verb_dist.get('present', 0)*100:.1f}%")
print(f"Infinitive: {verb_dist.get('infinitive', 0)*100:.1f}%")
print(f"Participle: {verb_dist.get('participle', 0)*100:.1f}%")

Verb form distributions:
Past tense: 0.0%
Present: 0.0%
Infinitive: 0.0%
Participle: 0.0%


In [128]:
from collections import defaultdict


In [None]:
from collections import defaultdict
import pandas as pd

def analyze_form_frequencies(paradigms_df, word_forms_df, tot_df):
    # Create lookup for clean forms
    form_lookup = dict(zip(word_forms_df['word'], word_forms_df['clean_form']))
    
    # Dictionary to store frequencies by POS and form
    pos_frequencies = {}
    
    for pos in paradigms_df['pos'].unique():
        form_freqs = defaultdict(int)
        pos_paradigms = paradigms_df[paradigms_df['pos'] == pos]
        
        for _, row in pos_paradigms.iterrows():
            # For each word in the paradigm
            for word in row['paradigm']:
                if word in tot_df.index:
                    # Get the actual frequency value
                    freq = tot_df.loc[word].iloc[0]  # or .values[0] would also work
                    form = form_lookup.get(word, 'unknown')
                    form_freqs[form] += freq
        
        # Convert to proportions
        total = sum(form_freqs.values())
        if total > 0:
            proportions = {form: freq/total for form, freq in form_freqs.items()}
            pos_frequencies[pos] = pd.Series(proportions)
    
    return pos_frequencies

In [132]:
# Usage:
freq_distributions = analyze_form_frequencies(paradigms, word_forms, tot)

In [None]:
pos_metadata = {
    'total_occurrences': sum of all frequencies,
    'unique_forms': number of distinct forms,
    'top_forms': most frequent forms,
    'ambiguity_score': measure of form distribution entropy
}

In [133]:
def compare_form_distributions(raw_dist, freq_dist):
    """Compare raw form distributions with frequency-weighted ones"""
    all_forms = set(raw_dist.index) | set(freq_dist.index)
    
    comparison = pd.DataFrame({
        'raw': [raw_dist.get(form, 0) for form in all_forms],
        'frequency_weighted': [freq_dist.get(form, 0) for form in all_forms]
    }, index=all_forms)
    
    comparison['difference'] = comparison['frequency_weighted'] - comparison['raw']
    return comparison.sort_values('difference', ascending=False)

In [134]:
freq_distributions

{'adv': adverb                 0.264277
 unknown-symb           0.028148
 plural-indefinite      0.108983
 imperative             0.337237
 unknown-sbu            0.155197
 singular-indefinite    0.045803
 unknown-det            0.006257
 unknown-prep           0.014774
 infinitive             0.000338
 preterite              0.002121
 unknown-interj         0.015119
 positive               0.000257
 perfect-participle     0.010236
 unknown-konj           0.004186
 singular-definite      0.000010
 unknown-noun           0.007058
 dtype: float64,
 'subs': adverb                 0.004101
 singular-indefinite    0.325045
 plural-indefinite      0.204370
 singular-definite      0.106573
 unknown-noun           0.012608
 unknown-fork           0.006269
 plural-definite        0.029714
 unknown-symb           0.017080
 imperative             0.171191
 preterite              0.056831
 present                0.028890
 unknown-prep           0.000113
 infinitive             0.013975
 perfect-pa

In [138]:
tot.loc[[x.strip() for x in "bok, boka, boken, bøker, bøkene".split(',')]].to_csv()

',freq\nbok,8044500\nboka,3299703\nboken,3623884\nbøker,4804941\nbøkene,1412713\n'

In [137]:
tot.loc[[x.strip() for x in "kvinne, kvinnen, kvinner, kvinnene".split(',')]]

Unnamed: 0,freq
kvinne,6769333
kvinnen,3103342
kvinner,14345018
kvinnene,2676898
