In [115]:
import pandas as pd
import sqlite3
from math import log

In [2]:
def query(db, sql, params=()):
    """Run ``sql`` against the SQLite file ``db`` and return the result as a DataFrame.

    Parameters
    ----------
    db : str
        Path of the SQLite database file, handed to ``sqlite3.connect``.
    sql : str
        SQL statement; may contain placeholders for bound values.
    params : sequence, optional
        Values bound to the placeholders in ``sql`` (default: none).

    Returns
    -------
    pandas.DataFrame
        The result set as produced by ``pd.read_sql_query``.
    """
    con = sqlite3.connect(db)
    try:
        # ``with con`` only commits or rolls back the transaction; it does not
        # close the connection, so the handle is closed explicitly below.
        with con:
            res = pd.read_sql_query(sql, con, params=params)
    finally:
        con.close()
    return res

def query_tuple(db, sql, params=()):
    """Run ``sql`` against the SQLite file ``db`` and return the rows as a list of tuples.

    Parameters
    ----------
    db : str
        Path of the SQLite database file, handed to ``sqlite3.connect``.
    sql : str
        SQL statement; may contain placeholders for bound values.
    params : sequence, optional
        Values bound to the placeholders in ``sql`` (default: none).

    Returns
    -------
    list of tuple
        All rows returned by the statement, as given by ``cursor.fetchall()``.
    """
    con = sqlite3.connect(db)
    try:
        # ``with con`` handles commit/rollback only; the rows are fetched while
        # the connection is still open and the handle is closed afterwards.
        with con:
            cur = con.cursor()
            rows = cur.execute(sql, params).fetchall()
    finally:
        con.close()
    return rows

In [64]:
# Path to the SQLite unigram database built from newspapers ("avis");
# the cells below query it with lang = False.
uni_avis = "/mnt/disk1/NB-ngram-assoc/avis-unigram-one-row.db"

In [69]:
# Path to the SQLite unigram database built from books ("bok");
# the cells below query it with the lang = 'nob' filter enabled.
uni_bok = "/mnt/disk1/NB-ngram-assoc/unigram-one-row.db"

In [4]:
# Show the tables and indexes of the unigram database. The original cell used
# `uni`, which no cell defines any more; `uni_bok` is used here because its
# schema has the `lang` column shown in the output (confirm this is the
# database that was meant).
query(uni_bok, "select * from sqlite_master").style

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,unigram,unigram,2,"CREATE TABLE unigram (freq int, lang varchar, first varchar, json text)"
1,index,_lff_,unigram,1393414,"CREATE INDEX _lff_ on unigram(lang,first,freq)"
2,index,_lf_,unigram,1511081,"CREATE INDEX _lf_ on unigram(lang,freq)"
3,index,_flf_,unigram,1576805,"CREATE INDEX _flf_ on unigram (freq, lang, first)"


In [119]:
def diff(x, y):
    """Return the ratio between ``x`` and ``y`` with the smaller value on top.

    When ``x`` is strictly smaller than ``y`` the result is ``x / y``,
    otherwise it is ``y / x``; for two positive numbers the ratio therefore
    never exceeds 1, and equals 1 when they are the same.
    """
    return x / y if x < y else y / x

def get_word_structure(word, db, lang = True):
    """Rank every two-way split of ``word`` by the unigram frequencies of its parts.

    Each split ``(pre, post)`` of ``word`` is looked up in the ``unigram``
    table of ``db``. A split only yields rows when ``pre``, ``post`` and
    ``word`` itself are all present in the table.

    Parameters
    ----------
    word : str
        The word to split.
    db : str
        Path of the SQLite unigram database, handed to ``query``.
    lang : bool, default True
        When true, restrict all three lookups to rows with ``lang = 'nob'``
        (use for the unigrams from books); pass ``False`` for the unigrams
        from newspapers.

    Returns
    -------
    pandas.DataFrame
        Columns ``pre``, ``post``, ``freqf`` (frequency of ``pre``), ``freqs``
        (frequency of ``post``), ``freqw`` (frequency of ``word``), ``diff``
        (``diff(freqf, freqs)``) and ``eval``
        (``log(freqf * freqs * diff)``), sorted by ``eval`` in descending
        order. The frame is empty when ``word`` has fewer than two characters
        or when no split is found in the database.
    """
    columns = ['pre', 'post', 'freqf', 'freqs', 'freqw', 'diff', 'eval']
    condition = [
        "a.first = ?",
        "b.first = ?",
        "c.first = ?"
    ]
    if lang:
        condition += [
            "a.lang = 'nob' ",
            "b.lang = 'nob'",
            "c.lang = 'nob'"
        ]
    conditions = ' and '.join(condition)
    # The statement is identical for every split, so it is built once.
    sql = f"""select 
            a.first as pre, 
            b.first as post, 
            a.freq as freqf, 
            b.freq as freqs, 
            c.freq as freqw
        from 
            unigram as a, 
            unigram as b, 
            unigram as c
        where 
            {conditions}"""

    pairs = [(word[:i], word[i:]) for i in range(1, len(word))]
    frames = [query(db, sql, p + (word,)) for p in pairs]
    # Splits that matched nothing carry no rows; dropping them keeps the
    # numeric dtypes of the matching splits intact when concatenating.
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        # Too short to split, or nothing found: pd.concat would raise on an
        # empty list, so return an empty frame with the usual columns instead.
        return pd.DataFrame(columns=columns)
    res = pd.concat(frames)

    res['diff'] = res.apply(lambda x: diff(x.freqf, x.freqs), axis = 1)
    res['eval'] = res.apply(lambda x: log(x.freqf*x.freqs*x['diff']), axis = 1)
    return res.sort_values(by = 'eval', ascending = False)

def get_word_hyp(word, db, lang = True):
    """Rank every two-way split of ``word`` by the product of its parts' frequencies.

    Each split ``(pre, post)`` of ``word`` is looked up in the ``unigram``
    table of ``db``; ``word`` itself does not have to be in the table. A split
    only yields rows when both ``pre`` and ``post`` are present.

    Parameters
    ----------
    word : str
        The word to split.
    db : str
        Path of the SQLite unigram database, handed to ``query``.
    lang : bool, default True
        When true, restrict both lookups to rows with ``lang = 'nob'``;
        pass ``False`` to look up the parts without a language filter.

    Returns
    -------
    pandas.DataFrame
        Columns ``pre``, ``post``, ``freqf`` (frequency of ``pre``), ``freqs``
        (frequency of ``post``) and ``eval`` (``freqf * freqs``), sorted by
        ``eval`` in descending order. The frame is empty when ``word`` has
        fewer than two characters or when no split is found in the database.
    """
    columns = ['pre', 'post', 'freqf', 'freqs', 'eval']
    condition = [
        "a.first = ?",
        "b.first = ?"
    ]
    if lang:
        condition += [
            "a.lang = 'nob'",
            "b.lang = 'nob'"
        ]
    conditions = ' and '.join(condition)
    # The statement is identical for every split, so it is built once.
    sql = f"""select 
            a.first as pre, 
            b.first as post, 
            a.freq as freqf, 
            b.freq as freqs
        from 
            unigram as a, 
            unigram as b
        where 
            {conditions}"""

    pairs = [(word[:i], word[i:]) for i in range(1, len(word))]
    # Query the database passed in as `db`; the earlier version ignored the
    # parameter and read from a notebook-global `uni` instead.
    frames = [query(db, sql, p) for p in pairs]
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        # pd.concat raises on an empty list (word shorter than two characters);
        # return an empty frame with the usual columns instead.
        return pd.DataFrame(columns=columns)
    res = pd.concat(frames)
    res['eval'] = res.freqf*res.freqs
    return res.sort_values(by = 'eval', ascending = False)

In [120]:
# Split candidates for 'askefast' from the newspaper unigrams (no language filter).
get_word_structure(word='askefast', db=uni_avis, lang=False)

Unnamed: 0,pre,post,freqf,freqs,freqw,diff,eval
0,aske,fast,44043,3450062,859,0.012766,21.385843
0,askef,ast,42,30955,859,0.001357,7.475339


In [121]:
# Split candidates for 'tyttebær' from the book unigrams, restricted to lang = 'nob'.
get_word_structure(word='tyttebær', db=uni_bok, lang=True)

Unnamed: 0,pre,post,freqf,freqs,freqw,diff,eval
0,tytte,bær,1806,115329,15235,0.01566,14.997739
0,tytt,ebær,1024,66,15235,0.064453,8.379309
0,tytteb,ær,27,36640,15235,0.000737,6.591674
0,tyt,tebær,3316,11,15235,0.003317,4.795791


In [124]:
# Split candidates for 'blåbær' from the book unigrams, restricted to lang = 'nob'.
get_word_structure(word='blåbær', db=uni_bok, lang=True)

Unnamed: 0,pre,post,freqf,freqs,freqw,diff,eval
0,blå,bær,477912,115329,21367,0.241318,23.311088
0,bl,åbær,727160,38,21367,5.2e-05,7.275172


In [128]:
# Split candidates for 'nasjonalbiblioteket' from the newspaper unigrams (no language filter).
get_word_structure(word='nasjonalbiblioteket', db=uni_avis, lang=False)

Unnamed: 0,pre,post,freqf,freqs,freqw,diff,eval
0,nasjonal,biblioteket,275002,189312,1414,0.688402,24.302303
0,nasjonalbibliotek,et,1654,64324485,1414,2.6e-05,14.821904
0,nasjo,nalbiblioteket,3361,82,1414,0.024398,8.813438
0,nasj,onalbiblioteket,7974,30,1414,0.003762,6.802395
0,nasjona,lbiblioteket,2243,21,1414,0.009362,6.089045
0,na,sjonalbiblioteket,1932254,16,1414,8e-06,5.545177


In [137]:
# Split candidates for 'forfatter' from the book unigrams; lang keeps its default (True).
get_word_structure(word='forfatter', db=uni_bok)

Unnamed: 0,pre,post,freqf,freqs,freqw,diff,eval
0,for,fatter,95899199,62362,230427,0.00065,22.081423
0,forfatte,r,10677,5237240,230427,0.002039,18.551694
0,forf,atter,7913,534033,230427,0.014817,17.952525
0,forfatt,er,792,123343894,230427,6e-06,13.349123
0,forfat,ter,750,242747,230427,0.00309,13.240146
0,forfa,tter,367,24473,230427,0.014996,11.810724
0,f,orfatter,10815503,228,230427,2.1e-05,10.858691
0,fo,rfatter,69893,15,230427,0.000215,5.4161
