In [8]:
import numpy as np
np.set_printoptions(threshold=10000,suppress=True)
import pandas as pd
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
import nltk

# 1. Téléchargement du dataset

In [9]:
nltk.download('reuters')

[nltk_data] Downloading package reuters to C:\Users\Bastien
[nltk_data]     Audu\AppData\Roaming\nltk_data...


True

In [10]:
from nltk.corpus import reuters
train_documents, train_categories = zip(*[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('training/')])
test_documents, test_categories = zip(*[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('test/')])

In [14]:
print('Taille du corpus : {0:d}'.format(len(train_documents)))

Taille du corpus : 7769


# 2. Transformation des catégories en vecteurs

In [29]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(train_categories)
test_labels = mlb.transform(test_categories)

# 3. Models

In [37]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score

algos = {
    'RF': RandomForestRegressor(n_estimators=50, random_state=1, n_jobs=-1),
    'KNN': KNeighborsRegressor(n_neighbors=5, n_jobs=-1, metric='cosine'),
    'MLP': MLPRegressor(hidden_layer_sizes=(20, 10), max_iter=200, random_state=1, alpha=0.001),
    'OVSR': OneVsRestClassifier(LinearSVC(random_state=0))
}


def run_models(X_train, Y_train, X_test, Y_test, algos):
    for algo_name in algos:
        model = algos[algo_name]
        model.fit(X_train, Y_train)
        prediction = model.predict(X_test)
        MAE = mean_absolute_error(Y_test, prediction)
        ACC = accuracy_score(Y_test, np.round(prediction))
        kf = KFold(n_splits=10, random_state = 42, shuffle = True)
        scores = cross_val_score(model, X_train, train_labels, cv = kf)
        print('################## {0} #############'.format(algo_name))
        print('Cross-validation accuracy: {:.4f} (+/- {:.4f})'.format(scores.mean(), scores.std() * 2))
        # print('MAE = {0:.3f}, Accuracy ={1:.3f}'.format(scores.mean(), ACC))
        print()


# 4. TF-IDF

In [19]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer
CV=CountVectorizer(max_features=1000,stop_words='english')
CV.fit(train_documents)
corpus_train_CV=CV.transform(train_documents)
corpus_test_CV=CV.transform(test_documents)

TFIDF=TfidfTransformer()
TFIDF.fit(corpus_train_CV)
corpus_train_tfidf=TFIDF.transform(corpus_train_CV)
corpus_test_tfidf=TFIDF.transform(corpus_test_CV)



TFIDF=TfidfVectorizer(max_features=1000,stop_words='english')
TFIDF.fit(train_documents)
corpus_train_tfidf=TFIDF.transform(train_documents)
corpus_test_tfidf=TFIDF.transform(test_documents)

# 5. Exécution des modèles

In [38]:
run_models (corpus_train_tfidf.toarray(),train_labels,corpus_test_tfidf.toarray(),test_labels,algos)

################## RF #############
Cross-validation accuracy: 0.3072 (+/- 0.0279)

################## KNN #############
Cross-validation accuracy: 0.3838 (+/- 0.0731)

################## MLP #############
Cross-validation accuracy: 0.0349 (+/- 0.0110)

################## OVSR #############
Cross-validation accuracy: 0.8176 (+/- 0.0304)



# 6. Vectorisation par SVD

In [39]:
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components=100)
SVD.fit(corpus_train_tfidf)
corpus_train_SVD = SVD.transform(corpus_train_tfidf)
corpus_test_SVD = SVD.transform(corpus_test_tfidf)
corpus_train_SVD.shape

(7769, 100)

In [40]:
run_models (corpus_train_SVD,train_labels,corpus_test_SVD,test_labels,algos)

################## RF #############
Cross-validation accuracy: 0.2277 (+/- 0.0325)

################## KNN #############
Cross-validation accuracy: 0.3364 (+/- 0.0592)

################## MLP #############
Cross-validation accuracy: -0.0078 (+/- 0.0114)

################## OVSR #############
Cross-validation accuracy: 0.7659 (+/- 0.0354)



In [45]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Concept #%d: " % topic_idx
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


print_top_words(SVD, TFIDF.get_feature_names_out(), 10)

Concept #0: vs cts mln 000 loss net shr dlrs profit revs
Concept #1: said pct dlrs company billion bank shares mln lt stock
Concept #2: cts div qtly record april pay prior dividend sets march
Concept #3: billion bank pct stg mln february vs january money trade
Concept #4: loss 000 profit trade tonnes pct japan said dollar rate
Concept #5: 000 tonnes wheat said sugar net trade export vs ec
Concept #6: dlrs tonnes 1986 billion year quarter earnings 1987 january february
Concept #7: pct february january 000 shares stock rose stake rate common
Concept #8: stg tonnes 000 mln loss wheat bank money market pct
Concept #9: 000 dlrs bank billion oper fed money quarter share dollar
Concept #10: billion stock split dividend trade tonnes shares common board declared
Concept #11: split dividend stock share quarter earnings 1987 declared payable rate
Concept #12: oil billion crude gas split 000 opec barrels stock reserves
Concept #13: dlrs fed says mln shares offer rate pct week oper
Concept #14: tra