In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [15]:
from gensim import corpora, models

In [3]:
import spacy
nlp = spacy.load('it_core_news_sm')

In [4]:
def read(fname, pos=False):
    sent = []
    for line in open(fname):
        line = line.strip().split()
        if not line:
            if sent: yield sent
            sent = []
        else:
            if pos:
                w,p = line
                sent.append((w,p))
            else:
                w, _ = line
                sent.append((w))

In [5]:
# tokenized
# documents = [' '.join(d) for d in read("project/data/li_data_dict_ma/train/ita-train.tt")]
documents = list(read("project/data/li_data_dict_ma/train/ita-train.tt"))+list(read("project/data/li_data_dict_ma/test/ita.tt"))

In [6]:
len(documents)

3359

In [7]:
documents[:2]

[['Per',
  'due',
  'anni',
  'gli',
  'uomini',
  'di',
  'Fiorini',
  'si',
  'sono',
  'rifiutati',
  'di',
  'fornire',
  'informazioni',
  'su',
  'un',
  'conto',
  'battezzato',
  'Reprival',
  '.'],
 ["All'",
  'inizio',
  'il',
  'colpo',
  'venne',
  'attribuito',
  'agli',
  'hezbollah',
  'filo',
  'iraniani',
  ',',
  'più',
  'tardi',
  'fu',
  'ipotizzata',
  'una',
  'partecipazione',
  'del',
  'gruppo',
  'di',
  'Abu',
  'Nidal',
  ',',
  'il',
  'più',
  'terribile',
  'terrorista',
  'palestinese',
  '.']]

In [8]:
' '.join(documents[4])

'Il Papa condanna la " sistematica emarginazione " dell\' altra metà del cielo in campo culturale , chiede " pari opportunità " per tutte , cita la pedagoga Montessori quale ideale portabandiera , e soprattutto esalta " l\' ingresso sempre più qualificato delle donne non soltanto come fruitrici , ma anche come protagoniste " in campo intellettuale .'

In [9]:
with open('project/data/stopwords-it.txt', 'r') as f:
    stoplist = set(f.read().splitlines()).union(('.','*', ';', ':', '?', '!', ',', '"',"'"))
    
def preprocess(document):
    return [w.lemma_.lower() for w in nlp(' '.join(document)) if w.text.lower() not in stoplist]

In [10]:
# create dictionary
texts = [preprocess(d) for d in documents]
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/ailct.dict')  # store the dictionary, for future reference
print(dictionary)

2018-04-23 11:55:27,722 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-04-23 11:55:27,824 : INFO : built Dictionary(9915 unique tokens: ['battezzare', 'contare', 'fiorini', 'fornire', 'informazione']...) from 3359 documents (total 31910 corpus positions)
2018-04-23 11:55:27,825 : INFO : saving Dictionary object under /tmp/ailct.dict, separately None
2018-04-23 11:55:27,829 : INFO : saved /tmp/ailct.dict


Dictionary(9915 unique tokens: ['battezzare', 'contare', 'fiorini', 'fornire', 'informazione']...)


In [12]:
texts[:2]

[['uomini',
  'fiorini',
  'rifiutato',
  'fornire',
  'informazione',
  'contare',
  'battezzare',
  'reprival'],
 ['iniziare',
  'colpo',
  'venire',
  'attribuire',
  'hezbollah',
  'filare',
  'iraniano',
  'tardo',
  'ipotizzare',
  'partecipazione',
  'abu',
  'nidal',
  'terribile',
  'terrorista',
  'palestinese']]

In [13]:
# convert corpus
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/ailct.mm', corpus)  # store to disk, for later use

2018-04-23 11:56:26,999 : INFO : storing corpus in Matrix Market format to /tmp/ailct.mm
2018-04-23 11:56:27,001 : INFO : saving sparse matrix to /tmp/ailct.mm
2018-04-23 11:56:27,002 : INFO : PROGRESS: saving document #0
2018-04-23 11:56:27,022 : INFO : PROGRESS: saving document #1000
2018-04-23 11:56:27,041 : INFO : PROGRESS: saving document #2000
2018-04-23 11:56:27,060 : INFO : PROGRESS: saving document #3000
2018-04-23 11:56:27,070 : INFO : saved 3359x9915 matrix, density=0.094% (31243/33304485)
2018-04-23 11:56:27,071 : INFO : saving MmCorpus index to /tmp/ailct.mm.index


In [75]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=50)

2018-04-23 18:16:57,948 : INFO : using serial LSI version on this node
2018-04-23 18:16:57,973 : INFO : updating model with new documents
2018-04-23 18:16:57,974 : INFO : preparing a new chunk of documents
2018-04-23 18:16:58,014 : INFO : using 100 extra samples and 2 power iterations
2018-04-23 18:16:58,016 : INFO : 1st phase: constructing (9915, 150) action matrix
2018-04-23 18:16:58,071 : INFO : orthonormalizing (9915, 150) action matrix
2018-04-23 18:16:58,982 : INFO : 2nd phase: running dense svd on (150, 3359) matrix
2018-04-23 18:16:59,190 : INFO : computing the final decomposition
2018-04-23 18:16:59,207 : INFO : keeping 50 factors (discarding 40.949% of energy spectrum)
2018-04-23 18:16:59,231 : INFO : processed documents up to #3359
2018-04-23 18:16:59,243 : INFO : topic #0(29.873): 0.658*")" + 0.628*"(" + 0.255*"vendere" + 0.151*"­" + 0.055*"milano" + 0.048*"aa" + 0.037*"lira" + 0.036*"vincono" + 0.036*"italia" + 0.034*"roma"
2018-04-23 18:16:59,248 : INFO : topic #1(23.318)

In [76]:
lsi.print_topics(3)

2018-04-23 18:17:05,212 : INFO : topic #0(29.873): 0.658*")" + 0.628*"(" + 0.255*"vendere" + 0.151*"­" + 0.055*"milano" + 0.048*"aa" + 0.037*"lira" + 0.036*"vincono" + 0.036*"italia" + 0.034*"roma"
2018-04-23 18:17:05,217 : INFO : topic #1(23.318): 0.736*"­" + -0.577*"vendere" + -0.109*"aa" + -0.082*"vincono" + -0.055*"premio" + -0.055*"brescia" + -0.055*"ab" + -0.055*"z" + -0.051*"s" + 0.037*"italiano"
2018-04-23 18:17:05,218 : INFO : topic #2(22.602): 0.631*"vendere" + 0.611*"­" + -0.235*")" + -0.217*"(" + 0.120*"aa" + 0.090*"vincono" + 0.080*"roma" + 0.067*"milano" + 0.065*"s" + 0.060*"premio"


[(0,
  '0.658*")" + 0.628*"(" + 0.255*"vendere" + 0.151*"\xad" + 0.055*"milano" + 0.048*"aa" + 0.037*"lira" + 0.036*"vincono" + 0.036*"italia" + 0.034*"roma"'),
 (1,
  '0.736*"\xad" + -0.577*"vendere" + -0.109*"aa" + -0.082*"vincono" + -0.055*"premio" + -0.055*"brescia" + -0.055*"ab" + -0.055*"z" + -0.051*"s" + 0.037*"italiano"'),
 (2,
  '0.631*"vendere" + 0.611*"\xad" + -0.235*")" + -0.217*"(" + 0.120*"aa" + 0.090*"vincono" + 0.080*"roma" + 0.067*"milano" + 0.065*"s" + 0.060*"premio"')]

In [77]:
lsi[corpus[0]]

[(0, 0.011844287631525039),
 (1, 0.014090507347567859),
 (2, 0.004057747366479785),
 (3, 0.1626190738562196),
 (4, 0.05505545361032539),
 (5, 0.015091807802303285),
 (6, -0.00409723736441963),
 (7, -0.027208925802335058),
 (8, 0.04210765639408623),
 (9, 0.06149589560729715),
 (10, -0.026426509291045605),
 (11, -0.011233020348017642),
 (12, 0.022549640486397605),
 (13, -0.044596948996156516),
 (14, -0.03205768027395545),
 (15, 0.0018758299089645719),
 (16, 0.025336527716189678),
 (17, -0.02251279802264066),
 (18, 0.01053343725715876),
 (19, -0.005572405255825803),
 (20, -0.039577427039776764),
 (21, 0.04095568785235507),
 (22, -0.011661881304324934),
 (23, -0.010452110627907272),
 (24, -0.04255108506342817),
 (25, 0.038174715556902994),
 (26, -0.06145220470087308),
 (27, -0.013114278096092473),
 (28, -0.005257142365920267),
 (29, 0.03439589205167867),
 (30, 0.0310364664565234),
 (31, -0.016519188487458827),
 (32, -0.029797128811958087),
 (33, 0.026904065916599936),
 (34, -0.027101048237

In [72]:
# save topics to a file for dynet
with open("topic.train", 'w') as f:
    for i, topic in enumerate(lda[corpus][:3110]):
        f.write(str(topic[0][0])+'\n')