In [1]:
import os
import numpy as np
import re
import sys
import pandas as pd

from sklearn.model_selection import train_test_split
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import LocalOutlierFactor

In [2]:
pathname = os.path.dirname(sys.argv[0])

In [3]:
def vectorize_documents(documents):
    """
        Vectorize documents as bag of words

    :param documents: List of all documents retrieved
    :rtype: list

    :return: Vectorized document
    :rtype: `np.ndarray`
    """

    vectorizer = CountVectorizer(max_features=6000)
    X = vectorizer.fit_transform(documents)
    return X.toarray()

def vectorize_tfidf_documents(documents):
    """
        Vectorize documents as bag of words with a weighting factor

    :param documents: List of all documents retrieved
    :rtype: list

    :return: Vectorized document
    :rtype: `np.ndarray`
    """

    vectorizer = TfidfVectorizer(max_features=6000)
    X = vectorizer.fit_transform(documents)
    return X.toarray()

# Avec Novelty

### Bag of words

In [4]:
all_document_text_train = []
all_document_text_test = []

clean = pd.read_csv("clean.csv").reset_index()
outlier = pd.read_csv("outlier.csv").reset_index()

#Contenu des textes récupéré et stocké
for index, row in clean.iterrows():
    text = row["content"]
    all_document_text_train.append(text)
    
for index, row in outlier.iterrows():
    text = row["content"]
    all_document_text_test.append(text)

In [5]:
#Bag of words
X1 = vectorize_documents(all_document_text_train)
X2 = vectorize_documents(all_document_text_test)

X_train, X_test = train_test_split(X1, shuffle=True, random_state=42)
lof = LocalOutlierFactor(novelty=True)
lof.fit(X_train)

y_pred_train = lof.predict(X_train)
y_pred_test = lof.predict(X_test)
y_pred_outliers = lof.predict(X2)

#Calcul des erreurs (outliers détectés là où il n'y en a pas)
n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size

#Calcul du nombre d'outliers non-détectés (omission)
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

train_size = y_pred_train.size
test_size = y_pred_test.size
outliers_size = y_pred_outliers.size

#Calcul des taux d'erreur
error_train = n_error_train/train_size * 100
error_test = n_error_test/test_size * 100
error_outliers = n_error_outliers/outliers_size * 100

print("error_train : %3f %%"%error_train)
print("error_test : %3f %%"%error_test)
print("error_outliers : %3f %%"%error_outliers)

error_train : 21.514786 %
error_test : 22.775264 %
error_outliers : 6.489362 %


### Bag of words + TF-IDF

In [6]:
#TF-IDF
X1 = vectorize_tfidf_documents(all_document_text_train)
X2 = vectorize_tfidf_documents(all_document_text_test)

X_train, X_test = train_test_split(X1, shuffle=True, random_state=42)
lof = LocalOutlierFactor(novelty=True)
lof.fit(X_train)

y_pred_train = lof.predict(X_train)
y_pred_test = lof.predict(X_test)
y_pred_outliers = lof.predict(X2)

#Calcul des erreurs (outliers détectés là où il n'y en a pas)
n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size

#Calcul du nombre d'outliers non-détectés (omission)
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

train_size = y_pred_train.size
test_size = y_pred_test.size
outliers_size = y_pred_outliers.size

#Calcul des taux d'erreur
error_train = n_error_train/train_size * 100
error_test = n_error_test/test_size * 100
error_outliers = n_error_outliers/outliers_size * 100

print("error_train : %3f %%"%error_train)
print("error_test : %3f %%"%error_test)
print("error_outliers : %3f %%"%error_outliers)

error_train : 1.740093 %
error_test : 1.779789 %
error_outliers : 77.978723 %


# Sans Novelty

In [30]:
clean = pd.read_csv("clean.csv").reset_index()
outlier = pd.read_csv("outlier.csv").reset_index()

print("Nombre de documents connus :",clean.size)
print("Nombre d'outliers :",outlier.size)

#Base de données contenant des outliers
df = pd.concat([clean,outlier])

print("Taux d'outliers attendus : %3f%%"%(outlier.size/df.size*100))

Nombre de documents connus : 66285
Nombre d'outliers : 9400
Taux d'outliers attendus : 12.419898%


In [31]:
all_document_text_train = []

for index, row in df.iterrows():
    text = row["content"]
    all_document_text_train.append(text)
    
X = vectorize_documents(all_document_text_train)

lof = LocalOutlierFactor()
y_pred = lof.fit_predict(X)


print("Taux d'outliers détectés : %3f %%"%(y_pred[y_pred == -1].size/y_pred.size*100))

Taux d'outliers détectés : 26.497985 %
