In [None]:
import pandas as pd
from news_paper_dao import NewsPaperDao
from IPython.core.display import HTML
from os import getcwd, path
from news_paper_nlp_pre_processing import *
from nltk.corpus import stopwords, wordnet
import nltk

: 

In [None]:
# Resolve the project directory. The trailing separator is kept on purpose:
# later cells build file paths by plain concatenation (curent_path + file name).
# path.join is used instead of a literal "\\" so the path is valid on any OS.
curent_path = path.join(getcwd(), "")
if "ema_lannuontimes" not in curent_path:
    # Not started from inside the project: fall back to the expected sub-folder.
    curent_path = path.join(curent_path, "PROJETS", "ema_lannuontimes", "")
print(curent_path)

<div style="display: flex; background-color: Blue; padding: 15px;" >

## 1.Exploration des données: 
</div>

In [None]:
# Notebook-wide verbosity level, passed to several helpers further down.
verbose = 1

# Instantiate the DAO on the em_bdd.db file of the project directory and stop
# immediately if its connection check fails.
dao = NewsPaperDao(nom_bdd=f"{curent_path}em_bdd.db")
assert dao.test_connexion()

# Fetch the articles (verbose=0 is passed explicitly here, not the global level)
# and report the size of the resulting frame.
df_articles = dao.get_articles(verbose=0)
print(df_articles.shape)

In [None]:
# Plot colour for each newspaper; the plotting cells fall back to "gray" for
# any newspaper name that is absent from this mapping.
color_by_journal = {
    "Le Trégor": "green",
    "ActuGaming": "orange",
    "Elle": "pink",
    "30 M. d\\'amis": "blue",
}
# Distinct newspaper names present in the data.
journaux = df_articles["journal"].unique()

In [None]:
# First look at the article table: dimensions, column names, then the first
# rows rendered as an HTML table.
print(df_articles.shape)
print(df_articles.columns)
display(HTML(df_articles.head().to_html()))

In [None]:
import matplotlib.pyplot as plt

In [None]:
figure, axe = color_graph_background(1, 1)

# Draw one histogram per newspaper on the shared axes, each in its own colour
# (gray when the newspaper has no entry in color_by_journal).
bins = len(journaux)
for journal in journaux:
    journal_rows = df_articles[df_articles["journal"] == journal]
    journal_rows.journal.hist(
        ax=axe,
        color=color_by_journal.get(journal, "gray"),
        bins=bins,
    )

figure.set_size_inches(15, 5, forward=True)
axe.set_ylabel("Nombre d'articles")
plt.title("Nombre d'articles par journal")
plt.xticks(rotation=45, ha="right")
plt.show()

In [None]:
# Tokenise the 'texte' column with the project helper; the helper is asked to
# write its result to the "mots_origine" column and its return value replaces
# df_articles.
df_articles = df_word_tokenize(df_articles, text_col_name='texte', token_col_name="mots_origine", verbose=verbose)

In [None]:
# Word count of the raw text of each article, computed by the project helper.
df_articles["nb_mots_origine"] = df_articles['texte'].apply(word_count_func)

In [None]:
# Mean raw word count per newspaper, with 'journal' turned back into a column.
group = (
    df_articles
    .groupby(["journal"], as_index=True)
    .agg({'nb_mots_origine': ['mean']})
    .reset_index()
)
group


In [None]:
# Check which columns exist after tokenisation and word counting.
df_articles.columns

In [None]:
# Word-count brackets: 100-word steps up to 300, 300-word steps up to 3300,
# then a single wide bracket for very long articles.
# The previous tuple list had no bracket for 300-599 and used right-closed
# intervals separated by integer gaps ((0, 99], (100, 199], ...), so counts of
# exactly 0, 100, 200, 600, ... and everything in 300-599 were left unbinned
# (NaN) and silently vanished from the per-bracket counts. Contiguous
# left-closed intervals built from edges cannot leave such holes.
bin_edges = [0, 100, 200, *range(300, 3300 + 1, 300), 100000]
bins = pd.IntervalIndex.from_breaks(bin_edges, closed="left")
df_articles["tranche_nb_origin"] = pd.cut(df_articles['nb_mots_origine'], bins)
df_articles.head()

In [None]:
# Count the articles of every (newspaper, word-count bracket) pair, then move
# the newspapers to the columns; combinations without a value become 0.
titles_per_journal_and_bracket = df_articles.groupby(["journal", "tranche_nb_origin"])["titre"].count()
group_nb_mots_art = titles_per_journal_and_bracket.unstack("journal").fillna(0)
group_nb_mots_art.head()

In [None]:
figure, axe = color_graph_background(1, 1)

# Grouped bars: one group per word-count bracket, one bar per newspaper.
group_nb_mots_art.plot(ax=axe, kind='bar')
axe.grid(axis='y')
axe.set_ylabel("Nombre d'articles")

figure.suptitle("Nombre d'articles par tranche de mots", fontsize=16)
figure.set_size_inches(16, 8, forward=True)
plt.xticks(rotation=45, ha="right")
plt.show()

In [None]:
# Summary statistics of the article table.
df_articles.describe()

In [None]:
# Preview of the first rows at the end of the exploration section.
df_articles.head()

In [None]:
# Columns available before the NLP pre-processing starts.
df_articles.columns

<div style="display: flex; background-color: Blue; padding: 15px;" >

## 3.NLP Preprocessing
</div>

In [None]:
# Project-specific French filler words used in addition to NLTK's list.
personnal_stop_word = ["tout", "tous", "cette", "bien", "comme", "encore", "autre", "tres", "alors", "plus", "aussi", "si", "donc", "p", "h", "etre"]
# Append NLTK's French stop words. The list above is always defined, so the
# former `is None` branch could never run and has been removed.
# dict.fromkeys drops duplicated entries while keeping the original order.
personnal_stop_word = list(dict.fromkeys(personnal_stop_word + stopwords.words("french")))
len(personnal_stop_word)

In [None]:
def nlp_pre_process(input, sw=None, verbose=0):
    """Turn a raw text into a list of cleaned tokens.

    The text goes through the project helpers in this order: ``tokenize``,
    ``remove_stopwords_func``, ``normalize_accented_chars``,
    ``remove_irr_char_func`` and ``remove_stopwords_func`` once more. Falsy
    items (empty strings, ``None``) are then discarded.

    Args:
        input: Value handed to ``tokenize``.
        sw: Stop-word collection forwarded to ``remove_stopwords_func``.
        verbose: Accepted but not used by this function.

    Returns:
        list: The tokens left after every step.
    """
    tokens = tokenize(input)
    tokens = remove_stopwords_func(tokens, sw=sw)
    tokens = normalize_accented_chars(tokens)
    tokens = remove_irr_char_func(tokens)
    # Second stop-word pass, applied to the output of the two steps above.
    tokens = remove_stopwords_func(tokens, sw=sw)

    return [token for token in tokens if token]

In [None]:
# Run the cleaning pipeline on every article text with the merged stop-word
# list, then check the new column and preview the result as an HTML table.
df_articles["clean_words"] = df_articles['texte'].apply(nlp_pre_process, sw=personnal_stop_word)
print(df_articles.columns)
display(HTML(df_articles.head().to_html()))

<div style="display: flex; background-color: Green; padding: 7px;" >

### 3.1.Analyse nb mots
</div>

In [None]:
# Word count of each article after cleaning, computed by the project helper.
df_articles["nb_mots_clean"] = df_articles['clean_words'].apply(word_count_func)

In [None]:
# Frequency distribution of the cleaned tokens of each article, and the number
# of distinct tokens it contains.
df_articles["freq_name"] = df_articles["clean_words"].apply(nltk.FreqDist)
df_articles["freq_unique_words"] = df_articles["freq_name"].apply(len)

In [None]:
# Order the articles from the largest to the smallest number of distinct words;
# the plots below are drawn in this row order.
df_articles = df_articles.sort_values(by=["freq_unique_words"], ascending=False)

In [None]:
# Preview the first rows after sorting.
df_articles.head()

In [None]:
figure, axe = color_graph_background(1, 1)

# Two bar series on the same axes: cleaned word count (default colour) and
# number of distinct words (red).
df_articles.plot(x='titre', y="nb_mots_clean", kind='bar', title='Nombre de mots par article', ax=axe)
df_articles.plot(x='titre', y="freq_unique_words", kind="bar", color="red", ax=axe)
figure.set_size_inches(16, 8, forward=True)

# Hide the x label and the x ticks (one tick per article title).
plt.ylabel("Nombre de mots par article")
plt.xlabel("")
plt.xticks([])
plt.show()

In [None]:
figure, axes = color_graph_background(len(journaux), 1)

# Same y ticks on every sub-plot.
y_ticks = list(range(0, 1200, 200))

# One sub-plot per newspaper, in the order of `journaux`.
for position, journal in enumerate(journaux):
    journal_axis = axes[position]
    journal_rows = df_articles[df_articles["journal"] == journal]
    journal_rows.plot(
        y="freq_unique_words",
        x='titre',
        label=journal,
        kind="bar",
        ax=journal_axis,
        color=color_by_journal.get(journal, "gray"),
    )
    journal_axis.set_yticks(y_ticks)
    journal_axis.set_xticks([])

figure.set_size_inches(16, 8, forward=True)
plt.suptitle('Nombre de mots uniques par article par journal', fontsize=16)
plt.xlabel("Articles")
plt.xticks([])
plt.show()

<div style="display: flex; background-color: Green; padding: 7px;" >

### 3.2.Nuage de mots
</div>

In [None]:
from wordcloud import WordCloud

<div style="display: flex; background-color: indigo;" >

#### 3.2.1. SANS lemmatisation
</div>

In [None]:
# Flatten each token list into a single string: comma-separated in
# "texte_clean", space-separated in "sentence_clean".
df_articles["texte_clean"] = [
    ','.join(str(word) for word in words) for words in df_articles["clean_words"]
]
df_articles["sentence_clean"] = [
    ' '.join(str(word) for word in words) for words in df_articles["clean_words"]
]
df_articles["texte_clean"]

In [None]:
# Call the project's word_cloud helper once per newspaper, passing the
# newspaper name and the full article table.
for journal in journaux:
    word_cloud(journal, df_articles)

<div style="display: flex; background-color: indigo;" >

#### 3.2.2. AVEC lemmatisation
</div>

<div style="display: flex; background-color: Blue; padding: 15px;" >

## 4.Entrainement d'un modèle de classification
</div>

Questions :
- Est-ce qu'on met en place l'approche TF-IDF ?
- Est-ce qu'on fait la lemmatisation ?
- 

<div style="display: flex; background-color: Green; padding: 7px;" >

### 4.0.Préparer les données
</div>

<div style="display: flex; background-color: indigo;" >

#### 4.0.1. Encodage de la target
</div>

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fixed seed passed to the train/test split and to the estimators that accept one.
random_state = 0

In [None]:
# Encode the newspaper name as an integer target.
target_name = 'journal_code'
transformer_news_paper = LabelEncoder()
df_articles[target_name] = transformer_news_paper.fit_transform(df_articles["journal"])

# For readability, move the encoded column so it sits just before 'journal'.
cols = [col for col in df_articles.columns if col != target_name]
cols.insert(cols.index('journal'), target_name)
df_articles = df_articles[cols]

df_articles

<div style="display: flex; background-color: Green; padding: 7px;" >

### 4.1.Essai avec TF-IDF
</div>

<div style="display: flex; background-color: indigo;" >

#### 4.0.2. Calcul
</div>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Work on a copy so the TF-IDF columns are not added to df_articles itself.
idf_full = df_articles.copy()

In [None]:
# Columns available before building the TF-IDF features.
df_articles.columns

In [None]:
# Article columns that are not needed next to the TF-IDF features.
cols_to_drop = ["mots_origine", "url", "nb_mots_origine", 'auteur', 'tags', 'nb_mots_clean', 'freq_name', 'freq_unique_words']

# TF-IDF over the cleaned sentences, limited to 400 single-word features.
vectorizer_idf_big = TfidfVectorizer(analyzer="word", token_pattern=get_regex_tokens(), stop_words=personnal_stop_word, ngram_range=(1, 1), max_features=400)
# Everything is derived from df_articles rather than read from and written back
# to idf_full, so this cell gives the same result when it is run again.
X = vectorizer_idf_big.fit_transform(df_articles['sentence_clean'])
idf_big_df = pd.DataFrame(X.toarray(), index=df_articles.index, columns=vectorizer_idf_big.get_feature_names_out())

# The unused columns are dropped BEFORE the merge and explicit suffixes are
# given: a vocabulary term equal to a column name (e.g. "journal", "url") used
# to make pandas rename both sides to *_x / *_y, after which the drop raised a
# KeyError or the original column was lost. Original columns now always keep
# their name; a clashing TF-IDF column gets the "_tfidf" suffix.
idf_full = pd.merge(
    df_articles.drop(cols_to_drop, axis=1),
    idf_big_df,
    left_index=True,
    right_index=True,
    suffixes=("", "_tfidf"),
)
idf_full.head()

<div style="display: flex; background-color: indigo;" >

#### 4.0.3. Préparer le test et train
</div>

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature columns = the numeric columns reported by the project helper, minus
# the encoded target so that it is not used as an input.
numeric_cols = get_numeric_columns_names(idf_full, verbose=verbose)
numeric_cols.remove(target_name)
numeric_cols

In [None]:
# Keep only the feature columns identified above; 20 % of the rows go to the
# test set and the split is seeded with random_state.
X_train_idf, X_test_idf, y_train_idf, y_test_idf = train_test_split(
    idf_full[numeric_cols],
    idf_full[target_name],
    test_size=0.2,
    random_state=random_state,
)
print(f" Train : {X_train_idf.shape} et {y_train_idf.shape} --- Test : {X_test_idf.shape} et {y_test_idf.shape}")

<div style="display: flex; background-color: indigo;" >

#### 4.0.4. Prédiction
</div>

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB,BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import LinearSVC

In [None]:
# Silence the estimators and the helper from here on.
verbose = 0

# Candidate classifiers, all with default hyper-parameters except where shown.
model_list = {
    "LogisticR": LogisticRegression(random_state=random_state, verbose=verbose),
    "SVC": svm.SVC(random_state=random_state, verbose=verbose),  # score of 0.643333
    "KNN": KNeighborsClassifier(n_neighbors=3),  # score of 0.558333
    "LinearSVC": LinearSVC(random_state=random_state, verbose=verbose),
    "naiveGaussianNB": GaussianNB(),
    "MultinomialNB": MultinomialNB(),
    "ComplementNB": ComplementNB(),
    "BernoulliNB": BernoulliNB()
}

# Fit every candidate on the training set and score it on the test set.
model_dic_idf, scores_idf = fit_and_test_models(model_list, X_train=X_train_idf, Y_train=y_train_idf, X_test=X_test_idf, Y_test=y_test_idf, verbose=verbose)

# DataFrame.round returns a new frame: its result was previously discarded, so
# the table was never rounded. It is now part of the chain.
score_all_class_df = (
    pd.DataFrame(scores_idf)
    .set_index("Modeli")
    .round(decimals=3)
    .sort_values(by="R2", ascending=False)
)
score_all_class_df

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter values explored for BernoulliNB by the grid search.
grid_params_BernoulliNB = dict(
    alpha=[0, 1],
    binarize=[0.0, 0.5, 1],
    fit_prior=[True, False],
)

In [None]:
# Number of cross-validation folds used by the grid search.
cross_validation = 3

# Grid-searched estimators, keyed by the label shown in the score table.
grid_model_list = {
    "Grid_BernoulliNB": GridSearchCV(
        estimator=BernoulliNB(),
        param_grid=grid_params_BernoulliNB,
        cv=cross_validation,
        verbose=verbose,
    ),
}

In [None]:
# Fit and score the grid-searched estimators. The scores of the previous
# benchmark are passed in through `scores` (presumably so both runs end up in
# the same table — confirm against fit_and_test_models).
model_dic_idf_grid, scores_idf_grid = fit_and_test_models(grid_model_list, X_train=X_train_idf, Y_train=y_train_idf, X_test=X_test_idf, Y_test=y_test_idf, scores=scores_idf, verbose=verbose)

# DataFrame.round returns a new frame: its result was previously discarded, so
# the table was never rounded. It is now part of the chain.
score_grid_idf = (
    pd.DataFrame(scores_idf_grid)
    .set_index("Modeli")
    .round(decimals=3)
    .sort_values(by="R2", ascending=False)
)
score_grid_idf

<div style="display: flex; background-color: Green; padding: 7px;" >

### 4.1.Essai avec W2V
</div>

In [None]:
from gensim import downloader as g_dwnl

In [None]:
# List the names of the pre-trained models offered by the gensim downloader.
g_dwnl.info()["models"].keys()

In [None]:
# Load the 'glove-wiki-gigaword-100' pre-trained vectors through the gensim downloader.
# NOTE(review): these vectors look like they were trained on English text while
# the articles are in French — confirm this is the intended model.
glove_vectors = g_dwnl.load('glove-wiki-gigaword-100')

<div style="display: flex; background-color: indigo;" >

#### 4.0.2. Calcul
</div>

<div style="display: flex; background-color: indigo;" >

#### 4.0.3. Préparer le test et train
</div>

<div style="display: flex; background-color: Green; padding: 7px;" >

### 4.1.Test de plusieurs modèles
</div>

*   Entrainer un modèle de classification
*   Afficher la matrice de confusion
*   Calculer l'accuracy, la précision et le recall
*   Votre modèle est-il soumis à un overfitting ?

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB,BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import LinearSVC

<div style="display: flex; background-color: Green; padding: 7px;" >

### 4.2.Optimisation du modèle retenu
</div>

In [None]:
# Placeholder: no model has been selected yet. Assign the retained estimator
# here before running the save cell.
model_to_save = None

<div style="display: flex; background-color: Green; padding: 7px;" >

### 4.3.Sauvegarde du modèle entrainé
</div>

In [None]:
from joblib import dump, load
from datetime import datetime

In [None]:
# Save the best model under a timestamped file name in the project directory.
now = datetime.now()  # current date and time
date_time = now.strftime("%Y-%m-%d-%H_%M_%S")
model_save_file_name = 'ema_lannuontimes_saved_model_' + date_time + '.joblib'
# Warning: the matching columns in the first `if` must be updated whenever the
# model is changed.
if model_to_save is None:
    # No model was selected: writing None to disk would only create a useless
    # file that the loading cell would then read back as a "model".
    print("Aucun modèle à sauvegarder : model_to_save vaut None.")
else:
    dump(model_to_save, curent_path + model_save_file_name)

<div style="display: flex; background-color: Green; padding: 7px;" >

### 4.4.Prédiction
</div>

In [None]:
# Path of the model file written earlier in this session (same timestamp).
model_save_file_name = 'ema_lannuontimes_saved_model_' + date_time + '.joblib'
model_save_path = curent_path + model_save_file_name

# Always define better_model, so that later cells can test it against None
# instead of failing with a NameError when the file is missing.
better_model = None
# path.isfile is already False for a path that does not exist, which makes a
# separate path.exists check unnecessary.
if path.isfile(model_save_path):
    # Load the pre-trained model.
    better_model = load(model_save_path)

<div style="display: flex; background-color: Blue; padding: 15px;" >

## ANNEXE
</div>