# Make word graphs from csv

Build a word graph from a CSV file with at least three columns: two containing words and one containing a frequency.

In [1]:
import sqlite3
import pandas as pd

# Functions 

In [2]:
def query(db, sql, param=()):
    """Run one SQL statement against the SQLite database `db` and return all rows.

    Parameters
    ----------
    db : path of the SQLite database file.
    sql : the SQL statement, optionally with `?` placeholders.
    param : sequence of values bound to the placeholders.

    Returns
    -------
    list of tuples, one per result row (empty for statements that return no rows).
    """
    con = sqlite3.connect(db)
    try:
        # Using the connection as a context manager commits on success and
        # rolls back on error, but it does not close the connection.
        with con:
            rows = con.execute(sql, param).fetchall()
    finally:
        # Close explicitly so repeated calls do not accumulate open handles.
        con.close()
    return rows

In [3]:
def create_database_from_df(db, df):
    """Store word-pair frequencies from `df` in the SQLite database `db`.

    Parameters
    ----------
    db : name of an empty database file.
    df : dataframe with columns named first, second, freq (other columns
         are ignored).

    Creates
    -------
    ngram      : the (first, second, freq) rows of `df`, indexed on
                 (first, second) and (second, first).
    firstfreq  : sum of freq per distinct `first` word.
    secondfreq : sum of freq per distinct `second` word.
    """
    print('creating database from dataframe')
    con = sqlite3.connect(db)
    try:
        # `with con` only commits or rolls back; the connection is closed below.
        with con:
            df[['first','second','freq']].to_sql('ngram', con)
    finally:
        con.close()
    print('indexing main table')
    query(db, "create index '_ft_' on ngram (first, second)")
    query(db, "create index '_tf_' on ngram (second, first)")
    # Marginal frequencies: one row per word in each position.
    query(db, "create table firstfreq (first varchar, freq int)")
    query(db, "create table secondfreq (second varchar, freq int)")
    query(db, "insert into firstfreq select first, sum(freq) from ngram group by first")
    query(db, "insert into secondfreq select second, sum(freq) from ngram group by second")
    print('indexing secondary tables')
    query(db, "create index _ff_ on firstfreq (first, freq)")
    query(db, "create index _ftf_ on secondfreq (second, freq)")

In [4]:
def make_word_graph(db):
    """Build and index the `word_graph` table in the SQLite database `db`.

    Joins `ngram` with `firstfreq` and `secondfreq` and stores, per
    (first, second) pair, the summed frequency and an association score
    in the column `pmi`. Two indexes, (first, pmi) and (second, pmi), are
    created afterwards.
    """
    create_sql = "create table word_graph (first varchar, second varchar, freq int, pmi float)"
    # The association score is the expression on its own line below.
    # NOTE(review): it uses the bare a.freq while the freq column is
    # sum(a.freq); the two agree only when ngram holds a single row per
    # (first, second) pair - confirm the input is pre-aggregated.
    fill_sql = (
        "insert into word_graph select  a.first, a.second, sum(a.freq) as f, "
        "pow(a.freq,2)*1.0/ (f.freq * t.freq)"
        " as pmi from ngram as a, firstfreq as f, secondfreq as t "
        "where a.first = f.first and a.second = t.second "
        "group by a.first, a.second"
    )
    index_sqls = (
        "create index _gftp_ on word_graph (first, pmi)",
        "create index _gtfp_ on word_graph (second, pmi)",
    )

    print('create graph of word pairs')
    for statement in (create_sql, fill_sql):
        query(db, statement)

    print('indexing graph')
    for statement in index_sqls:
        query(db, statement)

In [5]:
def check_graph_first(x, top = 20):
    """Return up to `top` rows of word_graph whose `first` word is `x`, highest pmi first.

    The database is taken from the notebook variable `word_pair_database`.
    """
    sql = "select * from word_graph where first = ? order by pmi desc limit ?"
    bindings = (x, top)
    return query(word_pair_database, sql, bindings)
def check_graph_second(x, top = 20):
    """Return up to `top` rows of word_graph whose `second` word is `x`, highest pmi first.

    The database is taken from the notebook variable `word_pair_database`.
    """
    sql = "select * from word_graph where second = ? order by pmi desc limit ?"
    bindings = (x, top)
    return query(word_pair_database, sql, bindings)

## File to store word graph - change name as appropriate

In [25]:
word_pair_database = "news_word_pairs.db"

## CSV-file structure


Specify the name of the file, and a mapping from the columns to "first", "second" and "freq". The graph is built as a weighted graph with edges `(first, second)`, where the frequency and computations based on frequencies are the weights.

If the CSV file has for example five columns, and the `first` element is in 2nd position with the `second` in 4th, and frequency is in 5th, just indicate the mapping with a schematic description of the columns so that `first`, `second` and `freq` are matched up with the appropriate columns, for example like this: 

```
['p0', 'first', 'p2', 'second', 'freq']
```
The names for unused columns are arbitrary. Only the words in the `first` and `second` columns with the weights in `freq` are used in constructing the graph. 

The code will aggregate the rows, summing `freq` over the extraneous columns for each `(first, second)` pair.

$$\textrm{freq}(\textrm{first}, \textrm{second}) = \sum_{x \in \textrm{p0},\; y \in \textrm{p2}} \textrm{freq}(x, \textrm{first}, y, \textrm{second})$$

The aggregation produces a table with aggregated frequencies with only three columns:

```
['first', 'second', 'freq']
```


### Specify CSV data  

csv_data = {
    'file':"../../ngram_2021_coordination/coord-bok.csv",
    'columns': ['lang', 'first', 'coord', 'second','freq']
}

In [26]:
csv_data = {
    'file':"../../ngram_2021_coordination/coord-avis.csv",
    'columns': [ 'first', 'coord', 'second','freq']
}

# Read CSV and aggregate the frequencies

If csv has a header, set `header = 0`, and if there is a separate (initial) index column, set `index_col = 0`. If there are errors, just add cells and inspect the variable `data`.

In [31]:
# keep_default_na = False: the word columns are data, so tokens such as "NA",
# "null" or "nan" must stay strings instead of being parsed as missing values.
data = pd.read_csv(csv_data['file'], header = None, index_col = None, keep_default_na = False)

In [32]:
data

Unnamed: 0,0,1,2,3
0,!,og,08.55,5
1,!,og,160,5
2,!,og,19.00,5
3,!,og,A-laget,5
4,!,og,Andre,5
...,...,...,...,...
12547198,",",og,de,6290096
12547199,",",og,vi,6518966
12547200,",",og,i,6773165
12547201,",",og,at,8850710


In [33]:
data.columns = csv_data['columns']

In [34]:
if len(data.columns) > 3:
    # Sum `freq` per (first, second) pair; the extraneous columns are dropped.
    # The column is selected explicitly because the first positional parameter
    # of GroupBy.sum is `numeric_only`, not a column name.
    data = data.groupby(['first', 'second'], as_index=False)['freq'].sum()

# Create basic word pairs

Store the dataframe in an sqlite-database with frequencies together with derived frequencies - table is called `ngram`. Words with frequencies from the first and second columns are extracted and added up into separate tables dubbed `firstfreq` and `secondfreq`.

In [35]:
create_database_from_df(word_pair_database, data)

creating database from dataframe
indexing main table
indexing secondary tables


# Make table of word pairs

The table of word pairs is computed using the ngram-table and named `word_graph`. Indexed and ready to use.

The columns of the `word_graph` table are `first, second, freq, pmi` where freq contains the original frequencies, and pmi is the computed association value, using the following formula:

$$\frac{\textrm{freq}(x,y)^2}{\textrm{freq}(x)*\textrm{freq}(y)}$$

The frequency is squared to give the co-occurrence frequency a higher weight. The marginals `freq(x)` and `freq(y)` are aggregated per word position: each word $w$ is associated with two frequencies, one from its occurrences in the first column and one from its occurrences in the second column.

The actual computation can be changed in the definition of the function `make_word_graph`. Locate the part of the expression `pow(a.freq,2)*1.0/ (f.freq * t.freq)` in one of the sql-queries, and substitute it with a preferred computation.

In [36]:
make_word_graph(word_pair_database)

create graph of word pairs
indexing graph


# Querying word graphs

In [37]:
check_graph_first('Ibsen')

[('Ibsen', 'Bjørnson', 9292, 0.09418937783791119),
 ('Ibsen', 'Hamsun', 2508, 0.020234145931904427),
 ('Ibsen', 'Strindberg', 1413, 0.008817534030960994),
 ('Ibsen', 'Bjornson', 489, 0.006173023560988957),
 ('Ibsen', 'Shakespeare', 709, 0.003965907308327859),
 ('Ibsen', 'Bjørnstjerne', 1160, 0.0025674813806368575),
 ('Ibsen', 'skandinavismen', 159, 0.0017095133698876127),
 ('Ibsen', 'Bjsrnson', 134, 0.0014743776387262414),
 ('Ibsen', 'Bjørnsons', 562, 0.0014472037908524673),
 ('Ibsen', 'Grieg', 1166, 0.0014451388035318631),
 ('Ibsen', 'Munch', 565, 0.0014288586230320971),
 ('Ibsen', 'Kielland', 524, 0.0014035709526468451),
 ('Ibsen', 'moderniteten', 57, 0.0009620150617981421),
 ('Ibsen', 'Sigval', 163, 0.0008087536925698582),
 ('Ibsen', 'Egner', 144, 0.0006720224287485595),
 ('Ibsen', 'Wenche', 1501, 0.0005798994271320755),
 ('Ibsen', 'Hamsundagene', 141, 0.0004961952003615553),
 ('Ibsen', 'Holberg', 213, 0.00048396040709125043),
 ('Ibsen', 'Hamsun-prisen', 32, 0.0004548030578399342),


In [38]:
check_graph_second('Ibsen')

[('Bjørnson', 'Ibsen', 6021, 0.06442220940913733),
 ('Shakespeare', 'Ibsen', 842, 0.008708065816422318),
 ('Bjornson', 'Ibsen', 399, 0.008259061046113493),
 ('Bjsrnson', 'Ibsen', 167, 0.003645904485381543),
 ('Hamsun', 'Ibsen', 697, 0.0020866237781364707),
 ('Holberg', 'Ibsen', 445, 0.001616635066665736),
 ('Strindberg', 'Ibsen', 330, 0.0015834916703830494),
 ('Munch', 'Ibsen', 635, 0.0011622409371508467),
 ('digtning', 'Ibsen', 77, 0.0009411843143355594),
 ('Bjprnson', 'Ibsen', 10, 0.0006889424733034792),
 ('Vjsrnson', 'Ibsen', 20, 0.0006408767193520736),
 ('Grimstadtid', 'Ibsen', 7, 0.00048225973131243543),
 ('Wergeland', 'Ibsen', 345, 0.0004600874032707547),
 ('Vjornson', 'Ibsen', 15, 0.0004189515040358995),
 ('Bjørnsom', 'Ibsen', 6, 0.0004133654839820875),
 ('Tsjekhov', 'Ibsen', 25, 0.00037770968931111795),
 ('Hebbel', 'Ibsen', 5, 0.0003444712366517396),
 ('Bjomson', 'Ibsen', 5, 0.0003444712366517396),
 ('Grieg', 'Ibsen', 435, 0.0002572166989145292),
 ('Kierkegaard', 'Ibsen', 71, 0