In [1]:
import re

In [2]:
import pandas as pd
import json

## Open a dataset

In [3]:
# Read the semicolon-delimited export, loading only the three columns used below.
df = pd.read_csv(
    '../data/df_mess_analysed.csv',
    sep=';',
    encoding='utf8',
    usecols=['chrn_gaz', 'stm_clean', 'offre'],
)

In [4]:
# Keep only the rows whose offer is DUAL or GAZ.
# .copy() makes the filtered frame independent of the one it was sliced from,
# so the later column assignment on it does not raise SettingWithCopyWarning.
df = df[df.offre.isin(['DUAL', 'GAZ'])].copy()

In [5]:
# Split each cleaned message into its list of word tokens.
# The pattern is a raw string: "\w" inside a plain literal is an invalid escape
# sequence (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
# NOTE: not idempotent - re-running this cell on the already-tokenised column
# raises TypeError, because re.findall expects a string, not a list.
df['stm_clean'] = df['stm_clean'].apply(lambda text: re.findall(r"\w+", text))

In [6]:
# Preview the first five rows to check the tokenised column.
df.head(5)

Unnamed: 0,offre,chrn_gaz,stm_clean
0,DUAL,0,"[mettr, jour, adress, palenci, bourg]"
1,DUAL,0,"[souhait, modifi, adress, mail, etre, contacte..."
2,DUAL,0,"[prochain, factur, disponibl, compt]"
3,DUAL,0,"[index, factur, aout, net, superieur, realit, ..."
4,DUAL,0,"[savoir, factur, annuel, pai]"


In [7]:
# Rebuild one space-separated string per message from its token list.
list_of_docs = list(map(' '.join, df.stm_clean))

In [8]:
# Show the first three rebuilt documents.
list_of_docs[0:3]

['mettr jour adress palenci bourg',
 'souhait modifi adress mail etre contacte servic cel nadiyaa545 gmail prendr cel compt dorenavent',
 'prochain factur disponibl compt']

## Compute TF-IDF

In [9]:
from nautilus_nlp.utils.text_vectorizer import Tfidf

In [10]:
# Configure the TF-IDF wrapper. The keyword arguments follow scikit-learn's
# TfidfVectorizer parameters.
tfidf = Tfidf(
    max_df=0.95,         # ignore terms whose document frequency is strictly higher than 95%
    min_df=0.01,         # ignore terms whose document frequency is strictly lower than 1%
    max_features=10000,  # upper bound on the vocabulary size
    encoding='utf-8',
    norm=None,           # no normalisation of the output vectors
    # Not enabled here: stop_words=SW, ngram_range=(1, 2)
)


In [11]:
# The wrapper uses scikit-learn's TfidfVectorizer, exposed here as `tfidf_vectorizer`.
# You can pass any argument supported by scikit-learn's TfidfVectorizer.
# Docs: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidf.tfidf_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=10000, min_df=0.01,
        ngram_range=(1, 1), norm=None, preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [12]:
# Compute the TF-IDF matrix for the corpus; the call returns a sparse matrix
# (displayed below as documents x vocabulary terms).
tfidf.compute_tfidf(list_of_docs)

<17744x434 sparse matrix of type '<class 'numpy.float64'>'
	with 300349 stored elements in Compressed Sparse Row format>

In [13]:
# Get the highest-weighted words over the whole corpus (returned as a word -> weight mapping).
tfidf.get_top_tfidf()

{'euro': 96.099,
 'rembours': 116.797,
 'adress': 115.531,
 'compt': 89.231,
 'coupur': 87.871,
 'chequ': 82.334,
 'mois': 81.935}

In [14]:
# Print, for each of the first ten documents, its highest-weighted words.
for doc in list_of_docs[:10]:
    top_terms = tfidf.get_top_tfidf_per_doc(doc)
    print(top_terms)

['mettr', 'adress', 'jour']
['cel', 'prendr', 'modifi', 'adress', 'mail', 'servic', 'etre', 'compt', 'souhait']
['disponibl', 'prochain', 'compt', 'factur']
['index', 'modif', 'modifi', 'lign', 'prochain', 'aout', 'electricit', 'pouv', 'prelev', 'factur']
['annuel', 'savoir', 'pai', 'factur']
['disponibl', 'ete', 'concern', 'relev', 'compteur', 'demand']
['acce', 'juin', 'plait', 'avanc', 'envoi', 'compt', 'mois', 'factur']
['conso', 'reel', 'communiqu', 'vient', 'met', 'relev', 'argent', 'conseiller', 'repondu', 'connaitr']
['nest', 'point', 'lappliqu', 'effect', 'sup', 'appliqu', 'vrai', 'toujour', 'arriv', 'souc']
['mensuel', 'pai', 'prelev']
