# Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from nltk.corpus import stopwords
import spacy
from spacy.lang.fr.examples import sentences

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import Ridge, SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import roc_auc_score

import pickle, sys, re, string

# Import  Dataset and Model

In [2]:
dataset_path = "../dataset/train_master.csv"
model_path = "../models/lr_clf_roc_auc_corrected" # best model !

In [3]:
clf = pickle.load(open(model_path, 'rb'))
df = pd.read_csv(dataset_path).dropna()

In [4]:
new_df = pd.DataFrame()
new_df['msg'] = df['review_title'] + " " + df['review_content']
new_df['Target'] = df['Target']
new_df

Unnamed: 0,msg,Target
0,La Police s'inscrit en acheteur privé sur Pric...,0
1,"Chef D'Oeuvre Absolu en vue... Alors, là, on a...",1
2,Effet garanti sur la terrase. Ils donnent immé...,0
3,Apple Power MAC G4 Ordinateur de bureau trés b...,1
4,"Comme Hermione Ma fille adore, elle n'en n'éta...",1
...,...,...
54131,"moto Bonjour, je suis intereser par votre anno...",1
54132,Un témoignage fort contre l'oubli Voline nous ...,1
54133,Double plaisir D'une bonne matière flexible ce...,1
54134,tres bien très bien pour compléter des collect...,1


In [5]:
# corpus
nlp = spacy.load("fr_core_news_lg")


def reformat(msg):
    msg = msg.replace('\r', ' ')
    msg = msg.replace('\n', ' ')
    msg = msg.lower()
    # remove digits
    msg = re.sub(r'\d+', '', msg)
    # remove ponctuation
    msg = msg.translate(str.maketrans(' ', ' ', string.punctuation))
    # remove multiple space
    msg = re.sub(' +', ' ', msg)
    msg = msg.strip()
    return msg

def text_preprocess(text):
    # reformat
    msg = reformat(text)

    # lemmatizer
    doc = nlp(msg)
    tokens = [token.lemma_ for token in doc if token.lemma_ not in stopwords.words('french')]
    
    return tokens

In [6]:
# Create an instance of TfidfVectorizer
vectoriser = TfidfVectorizer(analyzer=text_preprocess)

# Fit to the data and transform to feature matrix
df_vec = vectoriser.fit_transform(new_df['msg'])
print('df_vec.shape', df_vec.shape)

X_train, X_test, y_train, y_test = train_test_split(df_vec, new_df['Target'], test_size=0.3, random_state=123)
print(f'Train dimensions: {X_train.shape, y_train.shape}')
print(f'Test dimensions: {X_test.shape, y_test.shape}')

# Check out target distribution
print('y_train distribution:')
print(y_train.value_counts())
print('y_test distribution:')
print(y_test.value_counts())

df_vec.shape (54133, 69108)
Train dimensions: ((37893, 69108), (37893,))
Test dimensions: ((16240, 69108), (16240,))
y_train distribution:
1    21024
0    16869
Name: Target, dtype: int64
y_test distribution:
1    8973
0    7267
Name: Target, dtype: int64


In [7]:
pred = clf.fit(X_train, y_train).predict(X_test)
roc_auc_score(y_test, pred)

0.6441361452674461

# Test on all

In [8]:
pred = clf.fit(df_vec, new_df['Target']).predict(df_vec)
roc_auc_score(new_df['Target'], pred)

0.7619057204582462