In [38]:
# Import des librairies

import os
import sys
import numpy as np
import pandas as pd
import re

from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import ClassifierChain
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.metrics import accuracy_score

In [2]:
# Récupération des jeux de données connus et inconnus

df_train = pd.read_csv("clean.csv", sep=",", header=0, index_col=0)

df_test = pd.read_csv("outlier.csv", sep=",", header=0, index_col=0)

print(df_train.head())
print(df_test.head())


                     category  \
0  avis_situation_declarative   
1  avis_situation_declarative   
2  avis_situation_declarative   
3  avis_situation_declarative   
4  avis_situation_declarative   

                                             content   type  
0    art direction général finance public avis si...  connu  
1    avis situation impôt revenir détail revenir ...  connu  
2    avis situation impôt revenir suite avis info...  connu  
3    direction general finance public avis situat...  connu  
4    avis situation impôt revenir résidence exclu...  connu  
               category                                            content  \
0  attestation_donation    jute  gus poster  ke  jule perdue routière t...   
1  attestation_donation    bons lo nés  lee ef ga amer ie jase nains uu...   
2  attestation_donation    mahdi re we taule lus za lo toulouse hide el...   
3  attestation_donation    parie ef ans robert  île fa those robert nic...   
4  attestation_donation    hu are re va

In [3]:
# CountVectorizer sur df_train_encoded
vectorizer = CountVectorizer(max_features=5000)
vectorizer.fit(df_train['content'])

X_known = vectorizer.transform(df_train["content"])
y_known = pd.get_dummies(df_train["category"])

X_outliers = vectorizer.transform(df_test["content"])
y_outlier = pd.get_dummies(df_test["category"])

# Train test split sur les données connues
X_train, X_test, y_train, y_test = train_test_split(X_known, y_known, test_size=0.2, random_state=42, shuffle=True)

In [39]:
# Classifier Chain avec SVM, Naive et regression
base_model =  SVC(kernel="linear") #MultinomialNB()
clf = ClassifierChain(
    base_model,
    order="random",
    random_state=42,
    cv = 4)

# Apprentissage et test sur les données d'apprentissage

clf.fit(X_train, y_train)

ClassifierChain(base_estimator=SVC(kernel='linear'), cv=4, order='random',
                random_state=42)

In [40]:
# Prédiction sur les données connues de train et de test
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Score de l'apprentissage
print("Score de l'apprentissage : ", accuracy_score(y_train, y_train_pred))
print("Score du test : ", accuracy_score(y_test, y_test_pred))

# RMSE de l'apprentissage
print("RMSE de l'apprentissage : ", np.sqrt(mean_squared_error(y_train, y_train_pred)))

Score de l'apprentissage :  0.9990570485619991
Score du test :  0.9641779788838613
RMSE de l'apprentissage :  0.011489699792428521


In [41]:
# Prédiction sur les outliers
y_outlier_pred = clf.predict(X_test)

# Formatage pour avoir le même que y_train
y_outlier = pd.get_dummies(df_test["category"])

# Pour avoir les mêmes colonnes, je retire les colonnes des outliers
for i in y_known.columns :
    if i not in(y_outlier.columns) :
        y_outlier[i] = 0
        
y_outlier = y_outlier.drop(list(set(y_outlier.columns).difference(y_known.columns)), axis=1)

In [42]:
# Prédiction sur les outliers
y_outlier_pred = clf.predict(X_outliers)

# Score des outliers
print("Score des outliers : ", accuracy_score(y_outlier, y_outlier_pred))

# RMSE des outliers
print("RMSE des outliers : ", np.sqrt(mean_squared_error(y_outlier, y_outlier_pred)))

Score des outliers :  0.3276595744680851
RMSE des outliers :  0.2886137078179509


In [43]:
# Aficchage des outliers dans un dataFrame
y_outlier_pred = pd.DataFrame(y_outlier_pred)

y_outlier_pred.columns = y_known.columns

y_outlier_pred

Unnamed: 0,avis_situation_declarative,avis_taxe_fonciere,bulletin_de_paie,compromis_de_vente,contrat_bail_locatif,epargne,impot,justificatif_domicile,justificatif_domicile_taxe_habitation,releve_de_compte
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1875,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1877,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1878,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [44]:
# Comparaisaon des résultats obtenus et des résultats réels
res = pd.DataFrame(y_outlier_pred.idxmax(axis=1),columns=["category_pred"])

res["category_true"] = df_test["category"]

res

Unnamed: 0,category_pred,category_true
0,avis_situation_declarative,attestation_donation
1,releve_de_compte,attestation_donation
2,epargne,attestation_donation
3,avis_situation_declarative,attestation_donation
4,epargne,attestation_donation
...,...,...
1875,epargne,offre_pret_immo
1876,avis_situation_declarative,offre_pret_immo
1877,epargne,offre_pret_immo
1878,epargne,offre_pret_immo


In [45]:
# Ajout d'un tfidf en plus du countVectorizer sur les 2 sets de données initiaux
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train)
X_train_tf = tf_transformer.transform(X_train)
X_test_tf = tf_transformer.transform(X_test)

X_outliers_tf = tf_transformer.transform(X_outliers)


In [46]:
# Je formate les résultats
y_train_pred = clf.predict(X_train_tf)
y_test_pred = clf.predict(X_test_tf)

# Score de l'apprentissage
print("Score de l'apprentissage : ", accuracy_score(y_train, y_train_pred))
print("Score du test : ", accuracy_score(y_test, y_test_pred))

# RMSE
print("RMSE de l'apprentissage : ", np.sqrt(mean_squared_error(y_train, y_train_pred)))

Score de l'apprentissage :  0.012352663837812352
Score du test :  0.011312217194570135
RMSE de l'apprentissage :  0.3142685692464628


In [47]:
# Prédiction sur les outliers
y_outlier_pred = clf.predict(X_test)

y_outlier = pd.get_dummies(df_test["category"])
# Pour avoir les mêmes dimmensions de résultats
for i in y_known.columns :
    if i not in(y_outlier.columns) :
        y_outlier[i] = 0
        
y_outlier = y_outlier.drop(list(set(y_outlier.columns).difference(y_known.columns)), axis=1)

In [48]:
# Prédiction sur les outliers
y_outlier_pred = clf.predict(X_outliers_tf)

# Score des outliers
print("Score des outliers : ", accuracy_score(y_outlier, y_outlier_pred))

# RMSE des outliers
print("RMSE des outliers : ", np.sqrt(mean_squared_error(y_outlier, y_outlier_pred)))

Score des outliers :  0.9968085106382979
RMSE des outliers :  0.01786474002526241


In [52]:
# DataFrame de comparason des résultats obtenus

train_svc = {"train": 0.999, "test": 0.964, "RMSE": 0.011, "model" : "linear SVC"}
test_svc = {"train": np.nan, "test":0.327, "RMSE": 0.288, "model" : "linear SVC"}
tfidf_train_svc = {"train": 0.012, "test": 0.011, "RMSE": 0.314, "model" : "linear SVC"}
tfidf_test_svc = {"train": np.nan, "test":0.996, "RMSE": 0.017, "model" : "linear SVC"}

train_bayes = {"train": 0.599, "test": 0.598, "RMSE": 0.210, "model" : "MultinomialNB"}
test_bayes = {"train": np.nan, "test":0.041, "RMSE": 0.375, "model" : "MultinomialNB"}
tfidf_train_bayes = {"train": 0.730, "test": 0.733, "RMSE": 0.169, "model" : "MultinomialNB"}
tfidf_test_bayes = {"train": np.nan, "test":0.235, "RMSE": 0.309, "model" : "MultinomialNB"}

df_res = pd.DataFrame([train_svc,test_svc,tfidf_train_svc,tfidf_test_svc,train_bayes,test_bayes,tfidf_train_bayes,tfidf_test_bayes],
                     index=["train_svc","test_svc","tfidf_train_svc","tfidf_test_svc","train_bayes","test_bayes","tfidf_train_bayes","tfidf_test_bayes"])

In [53]:
# Affichage des résultats
df_res

Unnamed: 0,train,test,RMSE,model
train_svc,0.999,0.964,0.011,linear SVC
test_svc,,0.327,0.288,linear SVC
tfidf_train_svc,0.012,0.011,0.314,linear SVC
tfidf_test_svc,,0.996,0.017,linear SVC
train_bayes,0.599,0.598,0.21,MultinomialNB
test_bayes,,0.041,0.375,MultinomialNB
tfidf_train_bayes,0.73,0.733,0.169,MultinomialNB
tfidf_test_bayes,,0.235,0.309,MultinomialNB


In [61]:
# Affichage de la classe prédite pour le premier document

print("formatage de la prédiction :\n")
print(y_train.iloc[0])

formatage de la prédiction :

avis_situation_declarative               0
avis_taxe_fonciere                       0
bulletin_de_paie                         1
compromis_de_vente                       0
contrat_bail_locatif                     0
epargne                                  0
impot                                    0
justificatif_domicile                    0
justificatif_domicile_taxe_habitation    0
releve_de_compte                         0
Name: 1254, dtype: uint8
