In [1]:
import pandas as pd
from lark import Lark, Transformer, v_args
import dhlab as dh
import tools_imag as ti

In [2]:
# Define the grammar for expressions
grammar = """
    ?start: expr

    ?expr: expr "+" term   -> add
         | expr "-" term   -> sub
         | term

    ?term: term "*" factor -> mul
         | term "/" factor -> div
         | factor

    ?factor: WORD           -> lookup
           | "(" expr ")"

    %import common.WORD
    %import common.WS_INLINE
    %ignore WS_INLINE
"""

# Parser using the defined grammar
parser = Lark(grammar, start='start', parser='lalr')

In [53]:
@v_args(inline=True)
class Calculate(Transformer):
    def __init__(self, current_corpus):
        self.current_corpus = current_corpus  # Store current_corpus in the instance

    def add(self, a, b):
        #print("Adding:", a, "and", b)  # Debug: print the two components being added
        if isinstance(a, pd.DataFrame) and isinstance(b, pd.DataFrame):
            # Add the two DataFrames column-wise, summing the aligned indices
            result = a.iloc[:,0].add(b.iloc[:,0]).to_frame()#, fill_value=0).sum(axis=1).to_frame(name='sum')
            #print("Result of add:", result)  # Debug: print the result of the addition
            return result
        return a + b

    def sub(self, a, b):
        if isinstance(a, pd.DataFrame) and isinstance(b, pd.DataFrame):
            return a.iloc[:, 0].subtract(b.iloc[:,0]).to_frame()#, fill_value=0).sum(axis=1).to_frame(name='diff')
        return a - b

    def mul(self, a, b):
        if isinstance(a, pd.DataFrame) and isinstance(b, pd.DataFrame):
            return a.iloc[:,0].multiply(b.iloc[:, 0]).to_frame()#, fill_value=1).sum(axis=1).to_frame(name='mul')
        return a * b

    def div(self, a, b):
        if isinstance(a, pd.DataFrame) and isinstance(b, pd.DataFrame):
            #return a.divide(b, fill_value=1).sum(axis=1).to_frame(name='div')
            return a.iloc[:, 0].divide(b.iloc[:,0]).to_frame()#, fill_value=1)#.to_frame(name='div')
        return a / b

    def lookup(self, word):
        #print(f"Looking up word: {word}")  # Debug: print the word being looked up
        # Return the trendline (DataFrame) for the word or a default DataFrame if not found
        result = ti.corpus_ngram(self.current_corpus, word)
        return result
        # if isinstance(result, pd.DataFrame):
        #     print(f"Found DataFrame for '{word}':\n{result}")  # Debug: print the result of lookup
        #     return result
        # else:
        #     raise ValueError(f"Expected DataFrame for '{word}', got {type(result)}")

In [54]:
def evaluate_expression(parser, expression, current_corpus):
    tree = parser.parse(expression)
    calc = Calculate(current_corpus)
    final_result = calc.transform(tree)  
    return final_result  # Just return the DataFrame


In [35]:
# Example 

corpus = ti.get_imag_corpus()

c = corpus[(corpus.authors.str.contains("Skram"))]

In [60]:
# Example usage
expression = "(hun/han)"
result = evaluate_expression(parser, expression, corpus)
result.columns = [expression]

In [61]:
result

Unnamed: 0_level_0,(hun/han)
year,Unnamed: 1_level_1
1814,0.05503
1815,0.087496
1816,0.109841
1817,0.102389
1818,0.097813
1819,0.116091
1820,0.301038
1821,0.152234
1822,0.133193
1823,0.096057


In [59]:
result.to_frame(name=expression).plot()

AttributeError: 'DataFrame' object has no attribute 'to_frame'

In [47]:
result.iloc[:,0]

year
1814    3.326539
1815    2.418094
1816    2.162034
1817    1.732963
1818    1.538582
1819    1.850603
1820    2.494632
1821    1.864876
1822    2.122783
1823    1.662861
1824    2.275468
1825    1.760429
1826    2.122321
1827    1.818484
1828    1.611475
1829    1.912228
1830    1.786902
1831    2.693324
1832    1.818792
1833    1.925200
1834    1.585563
1835    2.221803
1836    1.612742
1837    1.832088
1838    1.861686
1839    2.018412
1840    2.696080
1841    2.211891
1842    2.046084
1843    1.425994
1844    2.071494
1845    1.937272
1846    1.979560
1847    1.756148
1848    2.466812
1849    1.819010
1850    1.709929
1851    2.273247
1852    1.524381
1853    1.701695
1854    1.460828
1855    1.766110
1856    1.541611
1857    1.557144
1858    1.361391
1859    2.146939
1860    1.874834
1861    1.635178
1862    1.445098
1863    1.705255
1864    1.919583
1865    1.813380
1866    1.667614
1867    1.874120
1868    1.734365
1869    1.574086
1870    1.755905
1871    1.834863
1872    1

In [11]:
# Example usage
expression = "(og + i)"
result = evaluate_expression(expression, current_corpus)
#print(result)

In [12]:
result

Unnamed: 0_level_0,sum
year,Unnamed: 1_level_1
1877,3.889077
1878,4.159884
1889,4.799588
1890,4.70679
1892,4.021555
1893,3.858314
1894,3.953254
1895,2.290433
1896,2.594097
1897,4.738213


In [8]:
corpus = ti.get_imag_corpus()

In [9]:
c = corpus[corpus.authors.str.contains("Hamsun")]
current_corpus = c

In [13]:
ti.corpus_ngram_old(corpus[(corpus.authors.str.contains("Skram"))],words=["og","i"])

Unnamed: 0_level_0,i,og
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1827,,
1830,0.017094,0.0
1882,0.021923,0.025827
1885,0.015685,0.029035
1887,0.022065,0.03514
1888,0.01441,0.031698
1889,0.009882,0.015852
1890,0.017305,0.029724
1891,0.015094,0.03155
1892,0.015053,0.034051


In [14]:
ti.corpus_ngram(corpus[(corpus.authors.str.contains("Skram"))],words="og i")

Unnamed: 0_level_0,i,og
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1827,0.0,0.0
1830,1.709402,0.0
1882,2.1923,2.582671
1885,1.568469,2.903516
1887,2.206483,3.514029
1888,1.440982,3.169814
1889,0.988234,1.585232
1890,1.682118,2.986499
1891,1.509409,3.155039
1892,1.5053,3.405127
