In [1]:
import os
import re
import pandas as pd
import scipy as sp

from spellchecker import SpellChecker
# from skmultilearn.problem_transform import ClassifierChain
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from tqdm.auto import tqdm

tqdm.pandas()

In [2]:
# Runs of digits are deleted outright (not replaced by a space).
_DIGITS_RE = re.compile(r"\d+")
# Special characters that get replaced by a space. Every character is listed
# explicitly and "-" is escaped: an unescaped "-" between two characters is a
# range, and a range such as  "-_  silently swallows the uppercase letters A-Z.
_SPECIAL_CHARS_RE = re.compile(r"""[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_|°—…«»]""")
# Tokens longer than this are treated as noise and dropped.
_MAX_WORD_LENGTH = 36
# Lazily-built default spell checker, shared by every call.
_SPELL_CHECKER_CACHE = {}


def _get_default_spell_checker():
    """Return the shared French SpellChecker, building it on first use only."""
    if "fr" not in _SPELL_CHECKER_CACHE:
        # distance=1 keeps run times short.
        _SPELL_CHECKER_CACHE["fr"] = SpellChecker(language="fr", distance=1)
    return _SPELL_CHECKER_CACHE["fr"]


def rm_digit_and_spe_char(text, stopwords, spell=None):
    """
        Prepare and clean text :
            - Remove digits
            - Replace special characters with a space
            - Normalize to lower case
            - Drop words longer than 36 characters
            - Correct typos with pyspellchecker; words that are unknown and
              have no different suggestion are dropped
            - Remove stopwords

    Cleaning is applied to the whole text before it is split into words, so a
    token such as "l'adresse" yields the two words "l" and "adresse".

    :param text: text to clean; anything that is not a str yields ""
    :param stopwords: iterable of words to remove from the text
    :param spell: optional object exposing ``unknown(words)`` and
        ``correction(word)``; when omitted, a French SpellChecker with
        distance=1 is created once and reused across calls

    :return: cleaned words joined by single spaces
    :rtype: str
    """
    if not isinstance(text, str):
        return ""

    if spell is None:
        spell = _get_default_spell_checker()

    # Set lookup instead of scanning a list once per word.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords or ())

    cleaned = _DIGITS_RE.sub("", text)
    cleaned = _SPECIAL_CHARS_RE.sub(" ", cleaned).lower()

    kept_words = []
    for word in cleaned.split():
        if len(word) > _MAX_WORD_LENGTH:
            continue

        if spell.unknown([word]):
            corrected = spell.correction(word)
            # correction() may hand back the word itself or None when it has
            # no suggestion; either way the word is not usable.
            if not corrected or corrected == word:
                continue
            word = corrected

        if word not in stopword_set:
            kept_words.append(word)

    return " ".join(kept_words)

In [3]:
def _load_json_directory(directory):
    """Read every file in *directory* as record-oriented JSON into one DataFrame.

    Files are read in sorted name order so the resulting row index is the same
    on every run; entries that are not regular files are skipped.

    :param directory: path of the folder holding the JSON files
    :return: all rows concatenated with a fresh 0..n-1 index, or an empty
        DataFrame when the folder contains no file
    :rtype: pandas.DataFrame
    """
    paths = [os.path.join(directory, name) for name in sorted(os.listdir(directory))]
    frames = [
        pd.read_json(path, orient="records")
        for path in paths
        if os.path.isfile(path)
    ]
    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the growing frame per file.
    return pd.concat(frames, ignore_index=True)


# Explicit encoding so accented stopwords decode identically on every platform
# (assumes stopwords.txt is UTF-8 -- TODO confirm); `with` closes the handle.
with open("stopwords.txt", "r", encoding="utf-8") as stopwords_file:
    stopwords = stopwords_file.read().split()

data_set = _load_json_directory("data_arthur")
data_outlier = _load_json_directory("data_test_arthur")

In [None]:
# Replace each document's raw text in data_set with its cleaned version,
# showing a tqdm progress bar while the rows are processed.
data_set["content"] = data_set["content"].progress_apply(
    lambda raw_text: rm_digit_and_spe_char(raw_text, stopwords)
)

  0%|          | 0/13257 [00:00<?, ?it/s]

In [26]:
# Apply the same text cleaning to the "content" column of data_outlier,
# showing a tqdm progress bar while the rows are processed.
data_outlier["content"] = data_outlier["content"].progress_apply(
    lambda raw_text: rm_digit_and_spe_char(raw_text, stopwords)
)

  0%|          | 0/1880 [00:00<?, ?it/s]

In [44]:
# Distinct values of the "category" column, in order of first appearance.
pd.unique(data_set["category"])

array(['avis_situation_declarative', 'avis_taxe_fonciere',
       'bulletin_de_paie', 'compromis_de_vente', 'contrat_bail_locatif',
       'justificatif_domicile', 'justificatif_domicile_taxe_habitation',
       'epargne', 'impot', 'releve_de_compte'], dtype=object)

In [48]:
# Keep only the rows whose category is "justificatif_domicile".
# NOTE(review): the variable is named `data_set_impot` although the filter
# selects "justificatif_domicile", not "impot" -- confirm which one is intended.
category_mask = data_set["category"].eq("justificatif_domicile")
data_set_impot = data_set.loc[category_mask]
data_set_impot

Unnamed: 0,category,content,type
4772,justificatif_domicile,chic numéro contrat lieu consommation rue be...,
4773,justificatif_domicile,free service abonné paris codex soucie delph...,
4774,justificatif_domicile,attestation valant justificatif domicile ene...,
4775,justificatif_domicile,my cardio document conserver ans page che da...,
4776,justificatif_domicile,mieux comprendre montant facture bonjour fac...,
...,...,...,...
4888,justificatif_domicile,document conserver ans page evolution consom...,
4889,justificatif_domicile,document conserver ans page détail facture c...,
4890,justificatif_domicile,mieux comprendre montant facture bonjour fac...,
4891,justificatif_domicile,contacter end client internet application mo...,


In [50]:
# Boolean flag per row of data_outlier: True when the row repeats an earlier one.
data_outlier.duplicated(keep="first")

0       False
1       False
2       False
3       False
4       False
        ...  
1875    False
1876    False
1877    False
1878    False
1879    False
Length: 1880, dtype: bool

In [56]:
# Debug probe: push one very long garbage token through the same spell-check
# steps as rm_digit_and_spe_char to see what comes out.
# The checker is built here because `spell` is otherwise only a local variable
# inside rm_digit_and_spe_char; without this line the cell relies on leftover
# kernel state.
spell = SpellChecker(language="fr", distance=1)

word = "rennnrrrrerennnnennerenensenneeereeeeseneesser"
if spell.unknown([word]):
    corrected = spell.correction(word)
    # No usable suggestion (the word itself, or None) -> drop the word.
    word = corrected if corrected and corrected != word else ""

# If the token is printed unchanged, unknown() did not flag it, so the
# spell-check step alone does not remove such tokens.
print(word)

rennnrrrrerennnnennerenensenneeereeeeseneesser


In [10]:
# Show the full cleaned text of the row labelled 2 (print avoids the truncated repr).
print(data_set.loc[2, "content"])

avis situation déclaratif impôt revenir suite avis information complementaire revenir fiscal référence dom rrrrrrrr information indiquer mémoire rcm déjà soumettre prélèvement social csg déductible plafond epargne retraite plafond disponible déduction cotisation verser déclaration revenir souscrir déclar plafond total ner rrnnncrerecnnnecennnneeerennneernnece cs plafond non utiliser revenir plafond non utiliser revenir plafond non utiliser revenir plafond calculer revenir plafond cotisation verser
