In [1]:
import numpy as np

np.set_printoptions(threshold=10000, suppress=True)
import pandas as pd
import warnings

warnings.filterwarnings('ignore')
import nltk

# 1. Téléchargement du dataset

In [2]:
nltk.download('reuters')

[nltk_data] Downloading package reuters to C:\Users\Bastien
[nltk_data]     Audu\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [3]:
from nltk.corpus import reuters

train_documents, train_categories = zip(
    *[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('training/')])
test_documents, test_categories = zip(
    *[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('test/')])

In [4]:
print('Taille du corpus : {0:d}'.format(len(train_documents)))

Taille du corpus : 7769


# 2. Transformation des catégories en vecteurs

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(train_categories)
test_labels = mlb.transform(test_categories)

# 3. Models

In [6]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

algos = {
    'KNN5': KNeighborsClassifier(n_neighbors=5, n_jobs=-1, metric='cosine'),
    'MLP': MLPClassifier(hidden_layer_sizes=(20, 10), max_iter=200, random_state=1, alpha=0.001),
    'OVSR': OneVsRestClassifier(LinearSVC(random_state=0)),
    # 'ADA': AdaBoostClassifier(n_estimators=100, random_state=0),
    # 'GDB': GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0),
    # 'BGG': BaggingClassifier(n_estimators=100, random_state=0)
}


def run_models(X_train, Y_train, X_test, Y_test, algos):
    for algo_name in algos:
        model = algos[algo_name]
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        scores = roc_auc_score(Y_test, Y_pred)
        print('################## {0} #############'.format(algo_name))
        print('Aire sous la courbe: {:.3f}%'.format(scores.mean() * 100))
        print()


# 4. TF-IDF

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

CV = CountVectorizer(max_features=1000, stop_words='english')
CV.fit(train_documents)
corpus_train_CV = CV.transform(train_documents)
corpus_test_CV = CV.transform(test_documents)

TFIDF = TfidfTransformer()
TFIDF.fit(corpus_train_CV)
corpus_train_tfidf = TFIDF.transform(corpus_train_CV)
corpus_test_tfidf = TFIDF.transform(corpus_test_CV)

TFIDF = TfidfVectorizer(max_features=1000, stop_words='english')
TFIDF.fit(train_documents)
corpus_train_tfidf = TFIDF.transform(train_documents)
corpus_test_tfidf = TFIDF.transform(test_documents)

In [8]:
from sklearn.model_selection import GridSearchCV


def get_best_model(model, X, y, cv=5,
                   ={}):
    grid = GridSearchCV(model, param_grid=params, cv=cv, scoring='roc_auc')
    grid.fit(X, y)
    return grid.best_estimator_, grid.best_params_, grid.best_score_

# 5. Exécution des modèles

In [9]:
params_knn = {
    'n_neighbors': [3, 5, 8],
}

params_mlp = {
    'hidden_layer_sizes': [(20, 10), (20, 20, 10), (20, 20, 20, 10)],
    'activation': ['tanh', 'relu'],
    'alpha': [0.0001, 0.001, 0.01, 0.1]
}

params_ovr = {
    'estimator__C': [0.1, 1, 10, 100, 1000],
    'estimator__penalty': ['l1', 'l2'],
    'estimator__loss': ['hinge', 'squared_hinge']
}

best_knn_model,best_knn_param, best_knn_score = get_best_model(KNeighborsClassifier(n_jobs=-1, metric='cosine'), corpus_train_tfidf, train_labels, params=params_knn)
print('Meilleur modèle KNN : {0}'.format(best_knn_model))
print('Meilleur paramètre KNN : {0}'.format(best_knn_param))
print('Meilleur score KNN : {0}'.format(best_knn_score))
best_mlp_model,best_mlp_param, best_mlp_score = get_best_model(MLPClassifier(max_iter=200, random_state=1), corpus_train_tfidf, train_labels, params=params_mlp)
print('Meilleur modèle MLP : {0}'.format(best_mlp_model))
print('Meilleur paramètre MLP : {0}'.format(best_mlp_param))
print('Meilleur score MLP : {0}'.format(best_mlp_score))
best_ovr_model,best_ovr_param, best_ovr_score = get_best_model(OneVsRestClassifier(LinearSVC(random_state=0)), corpus_train_tfidf, train_labels, params=params_ovr)
print('Meilleur modèle OVR : {0}'.format(best_ovr_model))
print('Meilleur paramètre OVR : {0}'.format(best_ovr_param))
print('Meilleur score OVR : {0}'.format(best_ovr_score))
algos = {
    'KNN': best_knn_model,
    'MLP': best_mlp_model,
    'OVR': best_ovr_model
}

Meilleur modèle KNN : KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=3)
Meilleur paramètre KNN : {'n_neighbors': 3}
Meilleur score KNN : nan
Meilleur modèle MLP : MLPClassifier(activation='tanh', hidden_layer_sizes=(20, 10), random_state=1)
Meilleur paramètre MLP : {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (20, 10)}
Meilleur score MLP : nan


ValueError: Unsupported set of arguments: The combination of penalty='l1' and loss='hinge' is not supported, Parameters: penalty='l1', loss='hinge', dual=True

In [10]:
model = MLPClassifier(max_iter=200, random_state=1)
model.fit(corpus_train_tfidf, train_labels)


In [11]:
roc_auc_score(test_labels, model.predict(corpus_test_tfidf))

0.6770241338446288

In [41]:
run_models(np.array(corpus_train_tfidf.toarray()), train_labels, np.array(corpus_test_tfidf.toarray()), test_labels,
           algos)

################## bestKn #############
Aire sous la courbe: 65.214%



# 6. Vectorisation par SVD

In [39]:
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components=100)
SVD.fit(corpus_train_tfidf)
corpus_train_SVD = SVD.transform(corpus_train_tfidf)
corpus_test_SVD = SVD.transform(corpus_test_tfidf)
corpus_train_SVD.shape

(7769, 100)

In [40]:
run_models(corpus_train_SVD, train_labels, corpus_test_SVD, test_labels, algos)

################## RF #############
Cross-validation accuracy: 0.2277 (+/- 0.0325)

################## KNN #############
Cross-validation accuracy: 0.3364 (+/- 0.0592)

################## MLP #############
Cross-validation accuracy: -0.0078 (+/- 0.0114)

################## OVSR #############
Cross-validation accuracy: 0.7659 (+/- 0.0354)



In [45]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Concept #%d: " % topic_idx
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


print_top_words(SVD, TFIDF.get_feature_names_out(), 10)

Concept #0: vs cts mln 000 loss net shr dlrs profit revs
Concept #1: said pct dlrs company billion bank shares mln lt stock
Concept #2: cts div qtly record april pay prior dividend sets march
Concept #3: billion bank pct stg mln february vs january money trade
Concept #4: loss 000 profit trade tonnes pct japan said dollar rate
Concept #5: 000 tonnes wheat said sugar net trade export vs ec
Concept #6: dlrs tonnes 1986 billion year quarter earnings 1987 january february
Concept #7: pct february january 000 shares stock rose stake rate common
Concept #8: stg tonnes 000 mln loss wheat bank money market pct
Concept #9: 000 dlrs bank billion oper fed money quarter share dollar
Concept #10: billion stock split dividend trade tonnes shares common board declared
Concept #11: split dividend stock share quarter earnings 1987 declared payable rate
Concept #12: oil billion crude gas split 000 opec barrels stock reserves
Concept #13: dlrs fed says mln shares offer rate pct week oper
Concept #14: tra

# 7. Word2Vec

In [61]:
import gensim
import multiprocessing

cores = multiprocessing.cpu_count()
corpus = []
for i in range(len(train_documents)):
    corpus.append(gensim.utils.simple_preprocess(train_documents[i]))
corpus[0]

['bahia',
 'cocoa',
 'review',
 'showers',
 'continued',
 'throughout',
 'the',
 'week',
 'in',
 'the',
 'bahia',
 'cocoa',
 'zone',
 'alleviating',
 'the',
 'drought',
 'since',
 'early',
 'january',
 'and',
 'improving',
 'prospects',
 'for',
 'the',
 'coming',
 'temporao',
 'although',
 'normal',
 'humidity',
 'levels',
 'have',
 'not',
 'been',
 'restored',
 'comissaria',
 'smith',
 'said',
 'in',
 'its',
 'weekly',
 'review',
 'the',
 'dry',
 'period',
 'means',
 'the',
 'temporao',
 'will',
 'be',
 'late',
 'this',
 'year',
 'arrivals',
 'for',
 'the',
 'week',
 'ended',
 'february',
 'were',
 'bags',
 'of',
 'kilos',
 'making',
 'cumulative',
 'total',
 'for',
 'the',
 'season',
 'of',
 'mln',
 'against',
 'at',
 'the',
 'same',
 'stage',
 'last',
 'year',
 'again',
 'it',
 'seems',
 'that',
 'cocoa',
 'delivered',
 'earlier',
 'on',
 'consignment',
 'was',
 'included',
 'in',
 'the',
 'arrivals',
 'figures',
 'comissaria',
 'smith',
 'said',
 'there',
 'is',
 'still',
 'some',


In [62]:
model_size = 100
model = gensim.models.Word2Vec(corpus, vector_size=model_size, sg=0, window=5, min_count=2, workers=cores - 1)

In [63]:
for i in range(100):
    model.train(corpus, total_examples=len(corpus), epochs=1)
    print('Train ', i)

Train  0
Train  1
Train  2
Train  3
Train  4
Train  5
Train  6
Train  7
Train  8
Train  9
Train  10
Train  11
Train  12
Train  13
Train  14
Train  15
Train  16
Train  17
Train  18
Train  19
Train  20
Train  21
Train  22
Train  23
Train  24
Train  25
Train  26
Train  27
Train  28
Train  29
Train  30
Train  31
Train  32
Train  33
Train  34
Train  35
Train  36
Train  37
Train  38
Train  39
Train  40
Train  41
Train  42
Train  43
Train  44
Train  45
Train  46
Train  47
Train  48
Train  49
Train  50
Train  51
Train  52
Train  53
Train  54
Train  55
Train  56
Train  57
Train  58
Train  59
Train  60
Train  61
Train  62
Train  63
Train  64
Train  65
Train  66
Train  67
Train  68
Train  69
Train  70
Train  71
Train  72
Train  73
Train  74
Train  75
Train  76
Train  77
Train  78
Train  79
Train  80
Train  81
Train  82
Train  83
Train  84
Train  85
Train  86
Train  87
Train  88
Train  89
Train  90
Train  91
Train  92
Train  93
Train  94
Train  95
Train  96
Train  97
Train  98
Train  99


In [64]:
model.save('./Word2vec_entraine.h5')

In [65]:
def word2vec_generator(texts, model, vector_size):
    dict_word2vec = {}
    for index, word_list in enumerate(texts):
        arr = np.array([0.0 for i in range(0, vector_size)])
        nb_word = 0
        for word in word_list:
            try:
                arr += model[word]
                nb_word = nb_word + 1
            except KeyError:
                continue
        if (len(word_list) == 0):
            dict_word2vec[index] = arr
        else:
            dict_word2vec[index] = arr / nb_word
    df_word2vec = pd.DataFrame(dict_word2vec).T
    return df_word2vec

In [66]:
corpus_train_tokens = pd.Series(train_documents).apply(lambda line: gensim.utils.simple_preprocess((line)))
corpus_test_tokens = pd.Series(test_documents).apply(lambda line: gensim.utils.simple_preprocess((line)))

In [67]:
model_wv_entraine = gensim.models.Word2Vec.load('./Word2vec_entraine.h5')
vector_size = model_wv_entraine.vector_size
corpus_train_wv_entraine = word2vec_generator(corpus_train_tokens, model_wv_entraine.wv, vector_size)
corpus_test_wv_entraine = word2vec_generator(corpus_test_tokens, model_wv_entraine.wv, vector_size)

In [70]:
run_models(corpus_train_wv_entraine, train_labels, corpus_test_wv_entraine, test_labels, algos)

################## RF #############
Cross-validation accuracy: 0.1904 (+/- 0.0221)

################## KNN #############
Cross-validation accuracy: 0.3432 (+/- 0.0705)

################## MLP #############
Cross-validation accuracy: 0.0094 (+/- 0.0082)

################## OVSR #############
Cross-validation accuracy: 0.7440 (+/- 0.0274)

