In [1]:
import string
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer



In [49]:
# Three-line Italian toy corpus (mixed case and punctuation on purpose) used
# to exercise the stemming + TF-IDF pipeline.
corpus = [
    "Nel MeZZo del cammino, di nostra vita",
    "Mi ritrovai perso. Per un'ampia selva oscura",
    "Che la diritta via e la verità erano smarrita",
]

In [50]:
# Italian stop words from NLTK, plus each ASCII punctuation character as its
# own entry. `string.punctuation` contains no whitespace, so `.split()` would
# append the whole constant as ONE 32-character string; `list(...)` splits it
# into individual characters.
stop_words = nltk.corpus.stopwords.words('italian') + list(string.punctuation)

In [51]:
def fn_stemmer(x):
    """Stem every document in `x` with the Italian Snowball stemmer.

    Each document is tokenized with `word_tokenize`, every token is stemmed,
    and the stems are re-joined with single spaces.

    Parameters
    ----------
    x : array-like of str
        Documents to stem (anything `np.vectorize` can iterate over,
        including an empty sequence).

    Returns
    -------
    numpy.ndarray
        Array of stemmed documents, one string per input document.
    """
    stemmer = SnowballStemmer("italian")

    def stem_document(document):
        # NOTE(review): `word_tokenize` is called without a language argument
        # while the stemmer is Italian -- confirm the default tokenizer
        # language is what is wanted here.
        return " ".join(stemmer.stem(token) for token in word_tokenize(document))

    # Declaring the output type up front lets `np.vectorize` handle empty
    # input; without `otypes` it must probe the first element and raises
    # ValueError on size-0 input. Non-empty results are unchanged.
    return np.vectorize(stem_document, otypes=[str])(x)

In [52]:
# Two-stage text pipeline: stem each document, then build TF-IDF features.
# NOTE(review): stop words are matched against tokens that have already been
# stemmed by the first step, so unstemmed entries in `stop_words` may not
# match their stemmed forms -- confirm this is intended.
pipe = Pipeline([
    ("stemmer", FunctionTransformer(fn_stemmer)),
    ("tfidf_vectorizer", TfidfVectorizer(
        encoding = "utf-8",
        lowercase = True,
        stop_words =  stop_words,
        ngram_range = (1,1),
        # Must be the float 1.0 (proportion: keep terms present in up to 100%
        # of documents). The integer 1 is an absolute count and would drop
        # every term that occurs in more than one document.
        max_df = 1.0,
        max_features = None,
        use_idf = True,
    ))
])

In [53]:
# Fit the pipeline on the corpus and get the document-term TF-IDF matrix.
e = pipe.fit_transform(corpus)

# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` and fall back only on releases that predate it.
tfidf_vectorizer = pipe["tfidf_vectorizer"]
feature_names = (
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, "get_feature_names_out")
    else tfidf_vectorizer.get_feature_names()
)

# `toarray()` yields a plain ndarray (rather than the `np.matrix` returned by
# `todense()`); one row per document, one column per vocabulary term.
pd.DataFrame(data = e.toarray(), columns = feature_names)

['nel mezz del cammin , di nostr vit'
 "mi ritrova pers . per un'amp selv oscur"
 'che la diritt via e la verit eran smarr']


Unnamed: 0,amp,cammin,diritt,eran,mezz,nostr,oscur,pers,ritrova,selv,smarr,verit,via,vit
0,0.0,0.5,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
1,0.447214,0.0,0.0,0.0,0.0,0.0,0.447214,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.0
2,0.0,0.0,0.447214,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.447214,0.447214,0.0
