In [30]:
import pandas as pd
import sqlite3
import dhlab.text.nbtokenizer as tk

In [59]:
def query(db, sql, params=()):
    with sqlite3.connect(db) as con:
        res = pd.read_sql_query(sql, con, params=params)
    return res

def query_tuple(db, sql, params=()):
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        res = cur.execute(sql, params)
    return res.fetchall()

In [11]:
bigrams = "/mnt/disk1/NB-ngram-assoc/avis-bigram-one-row.db"

In [12]:
query(bigrams, "select * from sqlite_master").style

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,bigram,bigram,2,"CREATE TABLE bigram (freq int, first varchar, second varchar, json text, assoc float, pmi float)"
1,index,_fsf_,bigram,9066782,"CREATE INDEX _fsf_ on bigram(first,second,freq)"
2,index,_sf_,bigram,10029784,"CREATE INDEX _sf_ on bigram(second,freq)"
3,index,_f_,bigram,10726936,CREATE INDEX _f_ on bigram(freq)
4,table,sqlite_stat1,sqlite_stat1,11160619,"CREATE TABLE sqlite_stat1(tbl,idx,stat)"
5,index,_ff_,bigram,11160620,"CREATE INDEX _ff_ on bigram(first,freq)"
6,index,_fsa_,bigram,12673829,"CREATE INDEX _fsa_ on bigram (first, second, assoc)"
7,index,_a_,bigram,13911155,CREATE INDEX _a_ on bigram ( assoc)
8,index,_sa_,bigram,14615017,"CREATE INDEX _sa_ on bigram ( second,assoc)"
9,index,_fa_,bigram,15584061,"CREATE INDEX _fa_ on bigram ( first,assoc)"


In [23]:
query(bigrams, 
      "select first, second, freq, assoc"
      " from bigram where first = 'stekt'"
      " and assoc > 50 order by assoc desc limit 100")

Unnamed: 0,first,second,freq,assoc
0,stekt,flesk,1368,328.129925
1,stekt,bacon,869,270.386213
2,stekt,makrell,947,267.023029
3,stekt,mark,1308,252.36516
4,stekt,småtorsk,469,242.285061
5,stekt,fisk,1341,241.619117
6,stekt,løk,837,241.218668
7,stekt,kjøtt,798,202.528317
8,stekt,kylling,605,200.880788
9,stekt,ris,683,190.258629


In [65]:
def parse(s, db):
    words = tk.tokenize(s)
    bigrams = [(words[i], words[j]) for i in range(len(words) - 1) for j in range(i, len(words))]
    ev = evaluate(bigrams, db)
    return pd.DataFrame(ev, columns = ['f','s','freq', 'assoc']).sort_values(by = 'assoc', ascending=False)

def evaluate(bigrams, db):
    return [(x, y) + check(x, y, db) for (x, y) in bigrams]
    
def check(x,y, db):
    res = query_tuple(db, f"select freq, assoc from bigram where first = '{x}' and second= '{y}'")
    if res != []:
        a,b = res[0]
    else:
        a,b = (0,0)
    return (a,b)

In [117]:
parse("en mann vi snakket med bygde et hus.", bigrams)

Unnamed: 0,f,s,freq,assoc
32,med,et,2470001,2699.621705
1,en,mann,643018,2373.111422
25,snakket,med,209876,1784.070037
40,et,hus,126027,945.316907
12,mann,med,229965,899.136572
18,vi,snakket,27353,515.163326
43,hus,.,461637,512.267003
37,bygde,hus,3933,323.894755
16,mann,.,533439,296.765032
21,vi,et,214962,247.808198


In [73]:
tk.tokenize("det stod en dr. der. ve lars.johnsen@gmail.com at ftp://www.nb.no")

['det',
 'stod',
 'en',
 'dr.',
 'der',
 '.',
 've',
 'lars.johnsen@gmail.com',
 'at',
 'ftp',
 ':',
 '/',
 '/',
 'www.nb.no']