### Fetching data from the database

In [2]:
import psycopg2

In [3]:
con = psycopg2.connect(user = "postgres",
                       password = "",
                       host = "127.0.0.1",
                       port = "5432",
                       database = "adj_noun_pairs")

try:
    cursor = con.cursor();
except (Exception, psycopg2.Error) as error:
    print("Error in Connection",error)

In [46]:
try:
    cursor.execute("SELECT * FROM pair_counts_10000 WHERE noun IN (SELECT noun FROM pair_counts_10000 GROUP BY noun HAVING COUNT(*) >= 5)")
    pairs = cursor.fetchall() 
except (Exception, psycopg2.Error) as error:
    con.rollback()
    print("Error while inserting data into the table:", error)

In [47]:
pairs[0]

('viimane', 'aasta', 1493)

In [48]:
len(pairs)

284340

In [52]:
adjs = list(set([pair[0] for pair in pairs]))
nouns = list(set([pair[1] for pair in pairs]))

In [54]:
print(len(nouns), len(adjs))

10521 20266


### Data to dataframe

In [55]:
import pandas as pd
import numpy as np

In [57]:
df = pd.DataFrame(0, index=nouns, columns=adjs)
df.head()

Unnamed: 0,produtseeriv,koolitatud,viiesaja-aastane,vikerkaarevärviline,novembriöine,maoistlik,helesinine,piinarikas,jutukas,keerelnud,...,kondenseeruv,valikuline,kõrvalepandud,spetsiaal,tõusiklik-anarhistlik,heterotroofne,ekstaatiline,10palliline,sõnalõpuline,kalmusetaoline
ümberjagamine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
palm,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
rahatrahv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tavakool,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
post-RISC-arhitektuur,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
for tup in pairs:
    df.loc[tup[1], tup[0]] = tup[2]

In [60]:
df.head()

Unnamed: 0,produtseeriv,koolitatud,viiesaja-aastane,vikerkaarevärviline,novembriöine,maoistlik,helesinine,piinarikas,jutukas,keerelnud,...,kondenseeruv,valikuline,kõrvalepandud,spetsiaal,tõusiklik-anarhistlik,heterotroofne,ekstaatiline,10palliline,sõnalõpuline,kalmusetaoline
ümberjagamine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
palm,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
rahatrahv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tavakool,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
post-RISC-arhitektuur,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Model creation and training

In [61]:
from sklearn.decomposition import LatentDirichletAllocation

In [62]:
lda = LatentDirichletAllocation(n_components=500, random_state=1)
lda.fit_transform(df)

array([[2.22222222e-04, 2.22222222e-04, 2.22222222e-04, ...,
        2.22222222e-04, 2.22222222e-04, 2.22222222e-04],
       [1.81818182e-04, 1.81818182e-04, 1.81818182e-04, ...,
        1.81818182e-04, 1.81818182e-04, 1.81818182e-04],
       [5.40540541e-05, 5.40540541e-05, 5.40540541e-05, ...,
        5.40540541e-05, 5.40540541e-05, 5.40540541e-05],
       ...,
       [3.33333333e-04, 3.33333333e-04, 3.33333333e-04, ...,
        3.33333333e-04, 3.33333333e-04, 3.33333333e-04],
       [3.44827586e-05, 3.44827586e-05, 3.44827586e-05, ...,
        3.44827586e-05, 3.44827586e-05, 3.44827586e-05],
       [6.89655172e-05, 6.89655172e-05, 6.89655172e-05, ...,
        6.89655172e-05, 6.89655172e-05, 6.89655172e-05]])

In [64]:
topics_test = {}
for i in range(500):
    topics_test[i] = []
    
topic_probs_test = lda.transform(df)
for i, prob in enumerate(topic_probs_test):
    topics_test[np.argmax(prob)].append(df.index[i])

In [65]:
print([len(topic) for topic in topics_test.values()])

[18, 19, 19, 9, 24, 9, 7, 23, 14, 13, 13, 22, 9, 44, 20, 16, 86, 35, 140, 19, 20, 15, 9, 17, 10, 10, 45, 18, 19, 9, 13, 16, 18, 14, 16, 26, 12, 16, 21, 24, 15, 13, 51, 15, 28, 17, 34, 21, 12, 12, 26, 14, 18, 22, 43, 10, 34, 11, 16, 13, 16, 5, 16, 34, 11, 6, 17, 4, 36, 17, 39, 14, 14, 10, 20, 24, 15, 36, 28, 6, 9, 17, 20, 15, 23, 7, 14, 32, 12, 13, 21, 27, 22, 8, 14, 13, 10, 13, 79, 18, 7, 11, 12, 25, 16, 27, 31, 8, 11, 13, 21, 16, 9, 14, 10, 26, 14, 10, 19, 19, 38, 33, 11, 24, 8, 22, 21, 26, 16, 29, 27, 11, 32, 19, 48, 7, 12, 19, 11, 50, 15, 89, 15, 75, 42, 13, 8, 9, 14, 13, 9, 14, 26, 19, 13, 17, 14, 13, 16, 10, 19, 10, 6, 19, 20, 13, 20, 27, 11, 4, 23, 27, 19, 167, 17, 11, 12, 18, 32, 13, 15, 16, 30, 21, 20, 21, 14, 21, 32, 22, 24, 18, 14, 17, 39, 32, 21, 6, 18, 17, 61, 26, 24, 11, 25, 18, 43, 113, 19, 16, 18, 35, 6, 25, 62, 37, 13, 13, 10, 9, 10, 13, 21, 33, 18, 36, 13, 11, 31, 29, 62, 11, 16, 25, 19, 17, 82, 15, 10, 14, 13, 20, 23, 13, 16, 22, 12, 13, 12, 16, 19, 9, 8, 29, 19, 15, 

In [66]:
for i in range(500):
    if len(topics_test[i]) > 5 and len(topics_test[i]) < 20:
        print(topics_test[i])
        print("---------------------------------")

['jaapanlane', 'välistuum', 'muutja', 'elukvaliteet', 'konsortsium', 'magma', 'meelemürk', 'viski', 'mälujälg', 'vaik', 'heroiin', 'refleksiivsus', 'liustik', 'ülakiht', 'õppeprotsess', 'ligipääs', 'kahetsus', 'arvutitarkvara']
---------------------------------
['paavian', 'läbimüük', 'muretsemine', 'raudtee', 'sõiduk', 'prestiiž', 'ausammas', 'lahing', 'koguhulk', 'hoidla', 'infoühiskond', 'pühapäev', 'arsenal', 'tramm', 'sajandialgus', 'äriinimene', 'sümbolipaar', 'kriminaalasi', 'soomusauto']
---------------------------------
['läige', 'ehedus', 'dokumentaal', 'kinkimine', 'esitus', 'riigikaitse', 'zhürii', 'sissekandmisluba', 'domineerimine', 'noot', 'olulisus', 'tahteavaldus', 'kinnistusamet', 'keskaeg', 'reklaamiandja', 'list', 'deklaratsioon', 'äripäev', 'keskkonnaamet']
---------------------------------
['tähistaevas', 'organism', 'pensionireform', 'abiline', 'meditsiinipraktika', 'elevant', 'aatom', 'meditsiin', 'välispoliitika']
---------------------------------
['miin', 'täp

---

### New one

In [122]:
try:
    cursor.execute("SELECT * FROM pair_counts_10000 WHERE noun IN (SELECT noun FROM pair_counts_10000 GROUP BY noun HAVING COUNT(*) >= 5) AND count >= 3;")
    pairs2 = cursor.fetchall() 
except (Exception, psycopg2.Error) as error:
    con.rollback()
    print("Error while inserting data into the table:", error)

In [123]:
len(pairs2)

29112

In [124]:
adjs2 = list(set([pair[0] for pair in pairs2]))
nouns2 = list(set([pair[1] for pair in pairs2]))

In [125]:
print(len(adjs2), len(nouns2))

3152 4373


In [126]:
df2 = pd.DataFrame(0, index=nouns2, columns=adjs2)
df2.head()

Unnamed: 0,märgatav,inimesesarnane,energeetiline,moraalne,bürokraatlik,helesinine,nukleotiidne,kujuline,piinarikas,mehaaniline,...,toimiv,põhiline,000kroonine,värvitu,valikuline,paras,transformaalne,asümmeetriline,tänulik,sõnalõpuline
asundus,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
järgnevus,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sõber,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hiilgus,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
rahatrahv,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [127]:
for tup in pairs2:
    df2.loc[tup[1], tup[0]] = tup[2]

In [128]:
import time
from datetime import timedelta, datetime

In [135]:
start = time.time()
print("Start:", str(timedelta(seconds=start+7200)).split(", ")[1])

lda2 = LatentDirichletAllocation(n_components=1000)
lda2.fit_transform(df2)

Start: 2:24:26.154722


array([[2.00000000e-04, 2.00000000e-04, 2.00000000e-04, ...,
        2.00000000e-04, 2.00000000e-04, 2.00000000e-04],
       [1.25000000e-04, 1.25000000e-04, 1.25000000e-04, ...,
        1.25000000e-04, 1.25000000e-04, 1.25000000e-04],
       [2.80112045e-06, 2.80112045e-06, 2.80112045e-06, ...,
        2.80112045e-06, 2.80112045e-06, 2.80112045e-06],
       ...,
       [5.55555556e-05, 5.55555556e-05, 5.55555556e-05, ...,
        5.55555556e-05, 5.55555556e-05, 5.55555556e-05],
       [1.66666667e-04, 1.66666667e-04, 1.66666667e-04, ...,
        1.66666667e-04, 1.66666667e-04, 1.66666667e-04],
       [6.66666667e-05, 6.66666667e-05, 6.66666667e-05, ...,
        6.66666667e-05, 6.66666667e-05, 6.66666667e-05]])

In [136]:
start = time.time()
print("End:", str(timedelta(seconds=start+7200)).split(", ")[1])

End: 2:25:40.340720


In [137]:
topics_test2 = {}
for i in range(1000):
    topics_test2[i] = []
    
topic_probs_test2 = lda2.transform(df2)
for i, prob in enumerate(topic_probs_test2):
    topics_test2[np.argmax(prob)].append(df2.index[i])

In [138]:
print([len(topic) for topic in topics_test2.values()])

[103, 1, 2, 9, 4, 3, 2, 3, 6, 7, 2, 1, 3, 3, 3, 3, 0, 1, 0, 5, 5, 3, 5, 2, 10, 0, 1, 1, 2, 3, 1, 0, 4, 3, 6, 0, 0, 4, 2, 0, 3, 2, 5, 5, 36, 0, 0, 0, 1, 2, 0, 4, 0, 3, 1, 0, 4, 1, 38, 6, 18, 2, 7, 4, 4, 4, 5, 3, 6, 1, 4, 3, 1, 2, 1, 0, 1, 3, 0, 1, 4, 1, 0, 85, 4, 6, 3, 1, 4, 21, 8, 0, 0, 2, 2, 2, 0, 3, 2, 0, 0, 4, 3, 0, 0, 6, 1, 5, 1, 5, 8, 3, 2, 1, 4, 1, 0, 7, 0, 4, 4, 2, 3, 1, 0, 4, 1, 0, 8, 40, 0, 0, 0, 2, 2, 0, 12, 0, 3, 1, 3, 2, 12, 3, 2, 0, 2, 3, 5, 3, 2, 6, 2, 2, 4, 2, 2, 7, 7, 0, 0, 2, 1, 0, 0, 0, 1, 2, 2, 1, 19, 1, 0, 9, 6, 1, 12, 0, 3, 1, 1, 0, 0, 0, 7, 4, 2, 2, 0, 3, 0, 0, 0, 1, 1, 1, 3, 28, 1, 4, 2, 1, 3, 2, 0, 0, 2, 5, 1, 1, 1, 4, 3, 2, 6, 1, 0, 4, 0, 14, 1, 2, 32, 19, 1, 6, 0, 14, 0, 164, 0, 81, 1, 0, 0, 1, 4, 3, 4, 1, 0, 3, 11, 0, 3, 3, 0, 29, 2, 0, 8, 2, 1, 3, 1, 0, 2, 5, 4, 6, 8, 0, 4, 1, 0, 1, 0, 0, 0, 4, 2, 0, 3, 1, 4, 0, 4, 5, 1, 0, 1, 9, 2, 0, 6, 11, 7, 3, 8, 1, 14, 3, 9, 5, 0, 2, 1, 5, 0, 4, 0, 0, 1, 5, 2, 4, 2, 3, 1, 1, 1, 0, 1, 1, 1, 9, 3, 3, 0, 9, 0, 11, 5, 0, 3

In [140]:
for i in range(1000):
    if len(topics_test2[i]) > 5 and len(topics_test2[i]) < 20:
        print(topics_test2[i])
        print("---------------------------------")
    
    '''print(topics_test2[i])
    print("---------------------------------")'''

['sekretär', 'toimetaja', 'järjestus', 'teek', 'väljaandja', 'rekonfigureerimine', 'järjestamine', 'konfiguratsioon', 'andmestruktuur']
---------------------------------
['sisaldus', 'eelis', 'stabiilsus', 'pakt', 'niiskus', 'olulisus']
---------------------------------
['vabadus', 'riigikord', 'sotsialism', 'õigusriik', 'juhtimisstiil', 'legitiimsus', 'riigikorraldus']
---------------------------------
['korrasolek', 'dokumentatsioon', 'ülevaatus', 'progress', 'varustatus', 'rike', 'üksikasi', 'arsenal', 'komitee', 'abijõud']
---------------------------------
['koolilõpetaja', 'suurprojekt', 'linnaeelarve', 'finalist', 'eurolaul', 'autonäitus']
---------------------------------
['kasutajaliides', 'disain', 'disainimuster', 'liides', 'programmeerimine', 'leht']
---------------------------------
['rajoon', 'vein', 'täpp', 'terror', 'vaip', 'roos', 'kurat', 'niit', 'tellis', 'kollokatsioon', 'härg', 'moon', 'kilp', 'latern', 'lill', 'kukk', 'tuli', 'väljak']
-----------------------------

In [121]:
for i in range(100):
    if len(topics_test2[i]) < 10:
        print(topics_test2[i])
        print("---------------------------------")

['äraelamine', 'valimistulemus', 'olemus', 'leib', 'oht', 'praktika', 'tarbeese', 'elu', 'oks']
---------------------------------
['soojus', 'ülekanne', 'fakt', 'laul', 'kütus', 'julgeolekuteenistus', 'olemine', 'näide', 'tulevik']
---------------------------------
['spetsialist', 'kartul', 'abort', 'õigusakt', 'tugipunkt', 'pensionär', 'elusündmus', 'seadusandlus']
---------------------------------
['periood', 'perekond', 'tee', 'töö', 'keerukus', 'järjekindlus']
---------------------------------
['kaamera', 'info', 'keskkond', 'problemaatika', 'probleem', 'muutmine', 'andmed', 'kiri']
---------------------------------
['võim', 'liider', 'riigivõim', 'süsteem', 'teenus', 'ühiskond', 'leiutis', 'konveier']
---------------------------------
['aeg', 'personal', 'politseinik', 'õppevorm']
---------------------------------
['igavik', 'soojenemine', 'mõistus', 'pere', 'varandus', 'diskrimineerimine', 'rida', 'genitiiv', 'iseärasus']
---------------------------------
['pähkel', 'dokument', '