In [1]:
import pandas as pd
import swifter
import numpy as np
import string
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

from spacy.lemmatizer import Lemmatizer
from spacy.lang.it.lemmatizer import LOOKUP
from spacy.lang.it.stop_words import STOP_WORDS

from scipy.sparse import save_npz, load_npz

from sklearn.cluster import KMeans, SpectralClustering
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

import spacy

from scipy.spatial.distance import cdist

import pickle

from gensim import corpora, models
%matplotlib inline

In [2]:
# Both workbooks are read with the same options, so they are declared once.
excel_options = dict(
    dtype={'Partita IVA': str, 'Codice fiscale': str},  # identifiers are read as strings
    sheet_name='Risultati',
    na_values=['n.d.', 'n.s.'],  # these placeholders are parsed as missing values
)

agricole = pd.read_excel('agricole.xlsx', **excel_options)

multi = pd.read_excel('multi.xlsx', **excel_options)

In [3]:
# Stack the rows of `agricole` under `multi` and renumber the index 0..n-1.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent.
# NOTE: this cell is not idempotent - re-running it stacks `agricole` again.
multi = pd.concat([multi, agricole], ignore_index=True, sort=False)

In [4]:
# Replace line breaks inside column headers with a space so that the
# single-line keys of the `rename` mapping can match them.
single_line_headers = multi.columns.str.replace('\n', ' ')
multi.columns = single_line_headers

In [5]:
# Maps the workbook's column headers to the short names used in the rest of
# the notebook. Insertion order matters: the values of this dict are later
# used to select (and therefore order) the columns that are kept.
rename = {
# --- financial indicators ("Ultimo anno disp." headers) ---
'Altri ricavi EUR Ultimo anno disp.': 'other_sales',
'PFN/EBITDA Ultimo anno disp.': 'leva',
'PFN EUR Ultimo anno disp.': 'pfn',
'Durata media dei crediti al lordo IVA (gg) Ultimo anno disp.': 'crediti',
'Durata media dei debiti al lordo IVA (gg) Ultimo anno disp.': 'debiti',
'EBITDA EUR Ultimo anno disp.': 'ebitda',
'Capitale sociale EUR Ultimo anno disp.': 'capitale',
'Giac. media delle scorte (gg) Ultimo anno disp.': 'magazzino',
# --- scores ---
'MORE Probabilità di default - attuale  %': 'more',
'CRIF score - attuale': 'crif',
# --- company identification and description ---
'Descrizione di consolidamento': 'cons',
'Ragione sociale' : 'nome', 
'Partita IVA': 'piva',
'Codice fiscale': 'cf',
'Ricavi vendite e prestazioni EUR Ultimo anno disp.': 'sales',
'Oggetto sociale': 'oggetto', 
'ATECO 2007 codice': 'ateco', 
}

In [6]:
# Apply the header -> short-name mapping to the column axis.
multi = multi.rename(rename, axis='columns')

In [7]:
# Keep only the renamed columns, in the order they appear in `rename`.
kept_columns = list(rename.values())
multi = multi[kept_columns]

In [8]:
# Store the ATECO code as text, left-padded with zeros to at least six characters.
ateco_as_text = multi['ateco'].astype(str)
multi['ateco'] = ateco_as_text.str.zfill(6)

In [9]:
def split_ateco(str):
    """Cut a string into consecutive two-character pieces.

    The last piece has a single character when the input length is odd;
    an empty string yields an empty list.

    NOTE: the parameter name shadows the built-in ``str`` inside this
    function; it is kept unchanged so existing callers are unaffected.
    """
    pieces = []
    offset = 0
    while offset < len(str):
        pieces.append(str[offset:offset + 2])
        offset += 2
    return pieces

In [10]:
# One list of two-character chunks per company, built from the padded ATECO code.
ateco = multi['ateco'].swifter.apply(split_ateco)

  from pandas import Panel


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=9069, style=ProgressStyle(description_widt…




In [11]:
# Split every chunked ATECO code into four columns in a single pass:
# first chunk, the two characters of the second chunk, and the third chunk.
ateco_levels = {'ateco_1': [], 'ateco_2': [], 'ateco_3': [], 'ateco_4': []}
for ateco_chunks in ateco:
    ateco_levels['ateco_1'].append(ateco_chunks[0])
    ateco_levels['ateco_2'].append(ateco_chunks[1][0])
    ateco_levels['ateco_3'].append(ateco_chunks[1][1])
    ateco_levels['ateco_4'].append(ateco_chunks[2])

for level_name, level_values in ateco_levels.items():
    multi[level_name] = level_values

In [12]:
# Number of companies per value of the first ATECO chunk.
multi['ateco_1'].value_counts()

10    5639
01    3270
03     160
Name: ateco_1, dtype: int64

In [13]:
# Keep only the companies that have an 'oggetto' text to analyse.
# .copy() makes the result an independent frame: a later cell assigns a new
# column to it, which on a boolean-filtered slice would otherwise trigger
# pandas' SettingWithCopyWarning.
multi = multi[multi.oggetto.notnull()].copy()

In [14]:
# Translation table that turns every ASCII punctuation character into a space.
stable = str.maketrans(dict.fromkeys(string.punctuation, ' '))

In [15]:
def clean_str_simple(s):
    """Tokenize an Italian string with NLTK and drop stop words.

    Punctuation is first replaced by spaces (via the ``stable`` translation
    table), then the text is tokenized and every token whose lower-cased form
    is an NLTK Italian stop word, or the single letter 'd', is removed.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    list of str
        Remaining tokens, in their original order and casing.
    """
    # Build the stop-word set once per call; previously it was rebuilt for
    # every single token inside the filter.
    stop_words = set(nltk.corpus.stopwords.words('italian') + ['d'])
    tokens = nltk.word_tokenize(s.translate(stable), language='italian')
    return [token for token in tokens if token.lower() not in stop_words]

In [16]:
def print_topics(model, vectorizer, n_top_words):
    """Print the highest-weighted words of every topic of a fitted model.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_`` (one row of word weights
        per topic).
    vectorizer : object
        Fitted vectorizer whose vocabulary matches the columns of
        ``components_``.
    n_top_words : int
        Number of words printed for each topic.
    """
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); use whichever the installed version provides.
    names_getter = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    words = names_getter()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending, so walk it backwards to get the largest weights first.
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [17]:
# Italian spaCy pipeline; clean_simple() uses it for tokenisation, POS tags
# and the stop-word / punctuation flags.
nlp = spacy.load("it_core_news_sm")

In [18]:
# Enumeration markers and abbreviations to be treated as stop words.
custom = ['n.', "nell'", "1)", "2)", "3)", "4)", "5)",
          "6)", "7)", "8)", "9)", "a)", "b)", "c)",
          "d)", "e)", "f)", "g)", "h)", "i)", "l)",
          "1.", "2.", "3.", "4.", "5.", "6.", "7."]

# Words to be treated as stop words in addition to spaCy's Italian list.
common = ['attività', 'società', 'oggetto', 'esercizio', 'scopi', 'articolo', 'legge',
          'organizzazione', 'obiettivi', 'sensi', 'genere']

# For each common word, also register a variant with the accented vowel removed.
# NOTE(review): the accented vowel is deleted, not replaced by its plain
# counterpart ('attività' -> 'attivit', not 'attivita') - confirm this is intended.
accented_vowels = ['à', 'è', 'ù', 'ò', 'ì']
common_l = []
for common_word in common:
    common_l.extend(common_word.replace(vowel, '')
                    for vowel in accented_vowels if vowel in common_word)
    common_l.append(common_word)

STOP_WORDS.update(custom)
STOP_WORDS.update(common_l)

# Flag every stop word (built-in and custom) in the pipeline vocabulary.
for stop_word in STOP_WORDS:
    nlp.vocab[stop_word].is_stop = True

In [19]:
# Plain vowel -> grave-accented vowel, used to rewrite "vowel + apostrophe" endings.
accenti = dict(zip("aeuoi", "àèùòì"))

In [27]:
def apostrofo_to_accento(s):
    """Lower-case a text and turn trailing "vowel + apostrophe" into an accented vowel.

    The text is lower-cased, every "- " sequence is removed and the result is
    split on whitespace. A word longer than two characters that ends with a
    vowel followed by an apostrophe gets that ending replaced by the accented
    vowel from ``accenti`` (e.g. "attivita'" -> "attività"). Words are joined
    back with single spaces.
    """
    converted = []
    for word in s.lower().replace('- ', '').split():
        if len(word) > 2 and word.endswith("'") and word[-2] in 'aeiou':
            word = word[:-2] + accenti[word[-2]]
        converted.append(word)
    return ' '.join(converted)

def clean_simple(s):
    """Return the informative nouns found in a text.

    The text is normalised with ``apostrofo_to_accento`` and parsed with the
    spaCy pipeline ``nlp``. A token is kept when it has at least three
    characters, is tagged as a NOUN, and is neither a stop word, punctuation
    nor made only of digits.

    Returns
    -------
    list of str
        Text of the retained tokens, in document order.
    """
    parsed = nlp(apostrofo_to_accento(s))
    nouns = []
    for token in parsed:
        if len(token) < 3 or token.pos_ != 'NOUN':
            continue
        if token.is_stop or token.is_punct or token.text.isdigit():
            continue
        nouns.append(token.text)
    return nouns

def count(ser, strategy):
    """Build a bag-of-words matrix from a Series of texts.

    Parameters
    ----------
    ser : pandas.Series
        Raw texts.
    strategy : callable
        Maps one text to a list of tokens; applied to every element of ``ser``.

    Returns
    -------
    tuple
        ``(matrix, feature_names, vectorizer)``: the document-term count
        matrix, the vocabulary as a list, and the fitted ``CountVectorizer``.
    """
    docs = [' '.join(tokens) for tokens in ser.swifter.apply(strategy)]
    vectorizer = CountVectorizer()
    matrix = vectorizer.fit_transform(docs)
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); use whichever exists and always return a list.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    return matrix, list(names_getter()), vectorizer

def tfidf_count(ser, strategy):
    """Build a TF-IDF matrix of bigrams from a Series of texts.

    Parameters
    ----------
    ser : pandas.Series
        Raw texts.
    strategy : callable
        Maps one text to a list of tokens; applied to every element of ``ser``.

    Returns
    -------
    tuple
        ``(matrix, feature_names, vectorizer)``: the TF-IDF matrix, the
        vocabulary as a list, and the fitted ``TfidfVectorizer``.
    """
    docs = [' '.join(tokens) for tokens in ser.swifter.apply(strategy)]
    # Bigrams only, accents stripped; terms present in more than 50% or fewer
    # than 5% of the documents are discarded.
    vectorizer = TfidfVectorizer(strip_accents='unicode', max_df=.5, min_df=.05, ngram_range=(2, 2))
    matrix = vectorizer.fit_transform(docs)
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); use whichever exists and always return a list.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    return matrix, list(names_getter()), vectorizer

In [28]:
%%time
# Bag-of-words matrix, vocabulary and fitted vectorizer for the 'oggetto' texts.
c, c_names, counter = count(ser=multi['oggetto'], strategy=clean_simple)

  from pandas import Panel


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=8954, style=ProgressStyle(description_widt…


Wall time: 7min


In [29]:
%%time
# TF-IDF matrix, vocabulary and fitted vectorizer for the 'oggetto' texts.
# Note that this applies clean_simple to every text again.
tfidf, tfidf_names, tfidf_counter = tfidf_count(ser=multi['oggetto'], strategy=clean_simple)

  from pandas import Panel


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=8954, style=ProgressStyle(description_widt…


Wall time: 7min


In [30]:
# Persist both sparse document-term matrices so they can be reloaded with
# load_npz instead of being recomputed.
save_npz('c.npz', c)
save_npz('tfidf.npz', tfidf)

In [31]:
def pickle_it(obj, path):
    """Serialize ``obj`` with pickle into the file at ``path``.

    The file is opened in binary write mode, so existing content is replaced.

    Returns
    -------
    bool
        Always ``True`` once the object has been written.
    """
    with open(path, mode="wb") as handle:
        pickle.Pickler(handle).dump(obj)
    return True

In [32]:
# Each pickle holds a [feature names, fitted vectorizer] pair.
count_artifacts = [c_names, counter]
tfidf_artifacts = [tfidf_names, tfidf_counter]

pickle_it(count_artifacts, "c_names.pkl")
pickle_it(tfidf_artifacts, "tfidf_names.pkl")

True

In [33]:
def unpickle_it(path):
    """Load and return the object pickled in the file at ``path``.

    NOTE: unpickling can execute arbitrary code; only use this on files
    produced by this notebook or another trusted source.
    """
    with open(path, mode="rb") as handle:
        return pickle.load(handle)

def read_models(name):
    """Reload the artifacts saved under the prefix ``name``.

    Reads the sparse matrix from ``<name>.npz`` and the pickled object from
    ``<name>_names.pkl``.

    Returns
    -------
    tuple
        ``(matrix, pickled_object)``.
    """
    matrix_path = name + '.npz'
    names_path = name + '_names.pkl'
    return load_npz(matrix_path), unpickle_it(names_path)

names = ["c", "tfidf"]

# Reload the matrices and their [feature names, vectorizer] pairs from disk.
c, [c_names, counter] = read_models(names[0])
# Bind the reloaded TF-IDF matrix to `tfidf`, the name that the model-fitting
# and pyLDAvis cells read, so they also work when the matrix comes from disk.
tfidf, [tfidf_names, tfidf_counter] = read_models(names[1])
# Kept for backward compatibility: a later cell fits on `tfidf_count`.
# NOTE: this rebinding replaces the tfidf_count() function defined earlier
# with the matrix, so the function can no longer be called after this cell.
tfidf_count = tfidf

In [85]:
%%time
# 30-topic LDA on the raw count matrix; perplexity is evaluated every 20
# iterations and fitting may stop early once it changes by less than perp_tol.
lda_count = LatentDirichletAllocation(
    n_components=30,
    max_iter=500,
    evaluate_every=20,
    perp_tol=.1,
    n_jobs=6,
    verbose=1,
    random_state=12346,
)
lda_count.fit(c)

iteration: 1 of max_iter: 500
iteration: 2 of max_iter: 500
iteration: 3 of max_iter: 500
iteration: 4 of max_iter: 500
iteration: 5 of max_iter: 500
iteration: 6 of max_iter: 500
iteration: 7 of max_iter: 500
iteration: 8 of max_iter: 500
iteration: 9 of max_iter: 500
iteration: 10 of max_iter: 500
iteration: 11 of max_iter: 500
iteration: 12 of max_iter: 500
iteration: 13 of max_iter: 500
iteration: 14 of max_iter: 500
iteration: 15 of max_iter: 500
iteration: 16 of max_iter: 500
iteration: 17 of max_iter: 500
iteration: 18 of max_iter: 500
iteration: 19 of max_iter: 500
iteration: 20 of max_iter: 500, perplexity: 445.8542
iteration: 21 of max_iter: 500
iteration: 22 of max_iter: 500
iteration: 23 of max_iter: 500
iteration: 24 of max_iter: 500
iteration: 25 of max_iter: 500
iteration: 26 of max_iter: 500
iteration: 27 of max_iter: 500
iteration: 28 of max_iter: 500
iteration: 29 of max_iter: 500
iteration: 30 of max_iter: 500
iteration: 31 of max_iter: 500
iteration: 32 of max_iter:

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=20, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=500,
                          mean_change_tol=0.001, n_components=30, n_jobs=6,
                          perp_tol=0.1, random_state=12346,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=1)

In [92]:
%%time
# Fit one LDA model per candidate number of topics on the TF-IDF matrix.
summary = {}  # number of topics -> fitted model
# Perplexities in the same order as the candidates. The list is created once,
# before the loop: it used to be re-created on every iteration, so it only
# ever held the last value.
perps = []

for i in [30, 50, 70, 100]:
    print(i)
    lda = LatentDirichletAllocation(i, max_iter=300, n_jobs=6, 
                                    verbose=0, evaluate_every=30, 
                                    random_state=1234)
    lda.fit(tfidf)
    # Perplexity is measured on the data the model was fitted on.
    p = lda.perplexity(tfidf)
    perps.append(p)
    print('Perplexity: ', p)
    print()
    summary[i] = lda

30
30 6412.404923567844

50
50 11068.878800450226

70
70 15578.722968283673

100
100 25654.734355974502

Wall time: 8min 55s


In [34]:
%%time
# 50-topic LDA on the TF-IDF matrix; perplexity is evaluated every 20 iterations.
# Here `tfidf_count` is the reloaded matrix, not the function of the same name.
lda_tfidf = LatentDirichletAllocation(
    n_components=50,
    max_iter=500,
    evaluate_every=20,
    n_jobs=6,
    verbose=1,
    random_state=12346,
)
lda_tfidf.fit(tfidf_count)

iteration: 1 of max_iter: 500
iteration: 2 of max_iter: 500
iteration: 3 of max_iter: 500
iteration: 4 of max_iter: 500
iteration: 5 of max_iter: 500
iteration: 6 of max_iter: 500
iteration: 7 of max_iter: 500
iteration: 8 of max_iter: 500
iteration: 9 of max_iter: 500
iteration: 10 of max_iter: 500
iteration: 11 of max_iter: 500
iteration: 12 of max_iter: 500
iteration: 13 of max_iter: 500
iteration: 14 of max_iter: 500
iteration: 15 of max_iter: 500
iteration: 16 of max_iter: 500
iteration: 17 of max_iter: 500
iteration: 18 of max_iter: 500
iteration: 19 of max_iter: 500
iteration: 20 of max_iter: 500, perplexity: 437.3053
iteration: 21 of max_iter: 500
iteration: 22 of max_iter: 500
iteration: 23 of max_iter: 500
iteration: 24 of max_iter: 500
iteration: 25 of max_iter: 500
iteration: 26 of max_iter: 500
iteration: 27 of max_iter: 500
iteration: 28 of max_iter: 500
iteration: 29 of max_iter: 500
iteration: 30 of max_iter: 500
iteration: 31 of max_iter: 500
iteration: 32 of max_iter:

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=20, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=500,
                          mean_change_tol=0.001, n_components=50, n_jobs=6,
                          perp_tol=0.1, random_state=12346,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=1)

In [35]:
# Persist both fitted LDA models. Both `lda_count` and `lda_tfidf` must exist
# in the current kernel session, otherwise this cell raises NameError.
pickle_it(lda_count, 'lda_count.pkl')
pickle_it(lda_tfidf, 'lda_tfidf.pkl')

NameError: name 'lda_count' is not defined

In [None]:
# Reload previously fitted LDA models from disk instead of refitting them.
# Only load pickle files that come from a trusted source.
lda_count = unpickle_it('lda_count.pkl')
lda_tfidf = unpickle_it('lda_tfidf.pkl')

In [25]:
def display_topics(model, feature_names, no_top_words):
    """Print, for each topic of ``model``, its highest-weighted features.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_`` (one row of weights per topic).
    feature_names : sequence of str
        Vocabulary aligned with the columns of ``components_``.
    no_top_words : int
        Number of features printed per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Indices sorted by decreasing weight, truncated to the requested count.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print("Topic %d:" % (topic_idx))
        print(" ".join(top_terms))

# Number of terms shown per topic; reused by the following display cell.
no_top_words = 5
display_topics(model=lda_count, feature_names=c_names, no_top_words=no_top_words)

NameError: name 'lda_count' is not defined

In [36]:
# Top terms of every topic of the TF-IDF based model.
display_topics(model=lda_tfidf, feature_names=tfidf_names, no_top_words=no_top_words)

Topic 0:
tecniche produzione vendita prodotti fini speculazione valorizzazione prodotti dell attivita
Topic 1:
prodotti soci prodotti sottoprodotti commercializzazione prodotti trasformazione prodotti lavorazione trasformazione
Topic 2:
allevamento animali coltivazione fondo dell attivita fideiussioni garanzie prodotti prodotti
Topic 3:
partecipazioni interessenze interessenze imprese operazioni mobiliari fideiussioni garanzie operazioni finanziarie
Topic 4:
all articolo prodotti soci soci cooperativa requisiti interessi prodotti sottoprodotti
Topic 5:
import export vendita prodotti conto conto interessenze partecipazioni interessenze imprese
Topic 6:
commercio prodotti prodotti prodotti interessenze partecipazioni partecipazioni imprese dettaglio prodotti
Topic 7:
prodotti produzione commercializzazione prodotti vendita prodotti fidejussioni avalli dell attivita
Topic 8:
produzione vendita vendita prodotti partecipazioni imprese interessenze partecipazioni operazioni finanziarie
Topic

In [100]:
# Print the index of every topic of the 100-topic model whose 20 strongest
# terms, joined into one string, contain the substring 'vino'.
for topic_idx, topic in enumerate(summary[100].components_):
    strongest = topic.argsort()[:-20 - 1:-1]
    joined_terms = ' '.join(tfidf_names[term_id] for term_id in strongest)
    if 'vino' in joined_terms:
        print(topic_idx)

28
68


In [26]:
# pyLDAvis is imported at its first point of use; enable_notebook() sets it up
# so that prepared visualisations are displayed inside the notebook.
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [27]:
# Interactive view of the count-based model: fitted model, the document-term
# matrix and the vectorizer that produced that matrix.
pyLDAvis.sklearn.prepare(lda_count, c, counter)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [50]:
# Interactive view of the TF-IDF based model.
# NOTE(review): `lda_tfidf` is fitted on `tfidf_count` in this notebook while
# this call passes `tfidf` - confirm both names hold the same matrix.
pyLDAvis.sklearn.prepare(lda_tfidf, tfidf, tfidf_counter)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [29]:
# Topic distribution of every document under the count-based model
# (one row per document, one column per topic).
topics_count = lda_count.transform(c)

In [30]:
# Index of the highest-weighted topic for each document.
multi['lda_count'] = topics_count.argmax(axis=1)

In [38]:
def sample(obj, n=5, random_state=None):
    """Return up to ``n`` distinct rows of ``obj`` in random order.

    Parameters
    ----------
    obj : pandas.DataFrame
        Frame to sample from (for example one group of a ``groupby``).
    n : int, default 5
        Maximum number of rows returned; when ``obj`` has fewer rows, all of
        them are returned, shuffled.
    random_state : int or None, default None
        Seed for a reproducible draw. With ``None`` the global NumPy random
        state is used.

    Returns
    -------
    pandas.DataFrame
        The selected rows with all columns.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    size = min(n, obj.shape[0])
    # Draw row positions rather than index labels, so duplicated labels in
    # the index cannot return extra rows.
    positions = rng.choice(obj.shape[0], size, False)
    return obj.iloc[positions, :]

# Apply sample() to each dominant-topic group, giving a random handful of
# companies per topic.
topics_multi = multi.groupby('lda_count', as_index=False).apply(sample)

In [41]:
# Write the sampled companies with their topic to an Excel sheet, without the index.
export_columns = ['lda_count', 'nome', 'piva', 'cf', 'ateco', 'oggetto']
topics_multi[export_columns].to_excel('with_topics.xlsx', index=False)

In [44]:
# Number of companies assigned to each dominant topic.
multi['lda_count'].value_counts()

36    1533
17     599
16     587
44     568
8      388
12     364
10     363
24     352
37     313
22     306
43     240
32     240
13     227
18     198
7      179
29     174
33     173
4      152
5      150
42     131
9      116
48     113
23     110
30     108
41     100
11      97
2       97
47      86
40      85
27      83
14      77
39      75
34      73
0       70
46      69
3       62
49      61
1       55
19      35
38      35
25      31
35      21
45      20
26      12
6       10
31       8
15       4
20       3
21       1
Name: lda_count, dtype: int64

In [46]:
# Names of the companies whose dominant topic is 21.
multi.loc[multi['lda_count'] == 21, 'nome']

128    MARICOLTURA DI ROSIGNANO SOLVAY S.R.L.
Name: nome, dtype: object

In [123]:
# TfidfVectorizer keyword sets to compare: document-frequency bounds, plus one
# variant with sub-linear term-frequency scaling.
pars = [
    {'max_df': .5, 'min_df': .05},
    {'max_df': .5, 'min_df': .1},
    {'max_df': .3, 'min_df': .05},
    {'max_df': .3, 'min_df': .01},
    {'max_df': .5, 'min_df': .05, 'sublinear_tf': True},
]

def tfidf_comp(ser, strategy, params_dict):
    """Build a TF-IDF matrix with a caller-supplied vectorizer configuration.

    Parameters
    ----------
    ser : pandas.Series
        Raw texts.
    strategy : callable
        Maps one text to a list of tokens; applied to every element of ``ser``.
    params_dict : dict
        Keyword arguments forwarded to ``TfidfVectorizer``; accents are always
        stripped (``strip_accents='unicode'``).

    Returns
    -------
    tuple
        ``(matrix, feature_names, vectorizer)``: the TF-IDF matrix, the
        vocabulary as a list, and the fitted ``TfidfVectorizer``.
    """
    docs = [' '.join(tokens) for tokens in ser.swifter.apply(strategy)]
    vectorizer = TfidfVectorizer(**params_dict, strip_accents='unicode')
    matrix = vectorizer.fit_transform(docs)
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); use whichever exists and always return a list.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    return matrix, list(names_getter()), vectorizer

In [125]:
def collect_topics(model, feature_names, no_top_words):
    """Return the top features of every topic as space-joined strings.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_`` (one row of weights per topic).
    feature_names : sequence of str
        Vocabulary aligned with the columns of ``components_``.
    no_top_words : int
        Number of features kept per topic.

    Returns
    -------
    dict
        Topic index -> the topic's features, by decreasing weight, joined
        with single spaces.
    """
    return {
        topic_idx: " ".join(feature_names[i]
                            for i in weights.argsort()[:-no_top_words - 1:-1])
        for topic_idx, weights in enumerate(model.components_)
    }

In [135]:
# Fit one 50-topic LDA model per TF-IDF parameter set and keep everything
# needed to compare them.
results = []

# Token cleaning does not depend on the vectorizer parameters, so the slow
# spaCy pass runs once here instead of once per parameter set.
oggetto_tokens = multi.oggetto.swifter.apply(clean_simple)

for p in pars:
    # The texts are already tokenized, so the per-document strategy is the identity.
    tfidf, tfidf_names, tfidf_counter = tfidf_comp(oggetto_tokens, lambda tokens: tokens, p)
    lda = LatentDirichletAllocation(50, max_iter=500, n_jobs=6, 
                                    verbose=0, evaluate_every=30, 
                                    random_state=12346)
    lda.fit(tfidf)
    results.append({'counts': [tfidf, tfidf_names, tfidf_counter],
                    'params': p,
                    'model': lda,
                    'topics': collect_topics(lda, tfidf_names, 10)})

  from pandas import Panel


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=8954, style=ProgressStyle(description_widt…




  from pandas import Panel


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=8954, style=ProgressStyle(description_widt…




  from pandas import Panel


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=8954, style=ProgressStyle(description_widt…




  from pandas import Panel


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=8954, style=ProgressStyle(description_widt…




  from pandas import Panel


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=8954, style=ProgressStyle(description_widt…




In [142]:
import pprint

pp = pprint.PrettyPrinter()

# For each parameter set, show the parameters followed by the first five
# terms of every topic.
for result in results:
    pp.pprint(result['params'])
    leading_terms = {}
    for topic_id, terms in result['topics'].items():
        leading_terms[topic_id] = ' '.join(terms.split()[:5])
    pp.pprint(leading_terms)
    print()

{'max_df': 0.5, 'min_df': 0.05}
{0: 'mercati acque dettaglio attivita uso',
 1: 'conseguimento interessenze mobiliari imprese norme',
 2: 'mutui carattere assunzione confronti garanzie',
 3: 'vendita consorzi fidejussioni fideiussioni export',
 4: 'esclusione rilascio confronti raggiungimento integrazioni',
 5: 'energia impianti coltivazione realizzazione allevamento',
 6: 'campo ricerca alimentazione commercio vendita',
 7: 'settori settore ricerca commercializzazione conto',
 8: 'vendita consorzi fidejussioni fideiussioni export',
 9: 'stabilimenti commercializzazione lavorazione conservazione conto',
 10: 'importazione esportazione commercio materie rappresentanze',
 11: 'investimento collocamento avalli esclusione raggiungimento',
 12: 'salumi stagionatura formaggi carni commercio',
 13: 'risparmio pubblico raccolta imprese garanzie',
 14: 'fondi allevamento coltivazione bestiame terreni',
 15: 'locazione immobili costruzione vendita acquisto',
 16: 'soci cooperativa mutualita cond