In [None]:
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...


True

In [None]:
def parse_reuters_sgm(file_path):

    with open(file_path, 'r', encoding='latin-1') as file:
        data = file.read()
    
    soup = BeautifulSoup(data, 'html.parser')
    articles = []

    for reuters in soup.find_all('reuters'):

        # Extract TITLE
        title = reuters.find('title').text if reuters.find('title') else None

        # Extract LEWISSPLIT attribute
        lewissplit = reuters.get('lewissplit', 'UNKNOWN')

        # Extract PLACES
        places = [d.text for d in reuters.find('places').find_all('d')] if reuters.find('places') else []

        # Extract DATE
        date = reuters.find('date').text if reuters.find('date') else None

        # Extract DATELINE
        dateline = reuters.find('dateline').text if reuters.find('dateline') else None

        # Extract BODY
        body = reuters.find('text').body.text if reuters.find('text') and reuters.find('text').body else None

        # Extract TOPICS
        topics = [d.text for d in reuters.find('topics').find_all('d')] if reuters.find('topics') else []

        # Append all extracted features to articles list
        articles.append({
            'TITLE': title,
            'LEWISSPLIT': lewissplit,
            'PLACES': places,
            'DATE': date,
            'DATELINE': dateline,
            'BODY': body,
            'TOPICS': topics
        })
    
    return articles

def process_all_sgm_files(directory_path):
    """
    Process all .sgm files in a directory and combine the data into a single DataFrame.
    """
    all_articles = []
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.sgm'):
            file_path = os.path.join(directory_path, file_name)
            articles = parse_reuters_sgm(file_path)
            all_articles.extend(articles)
    
    return pd.DataFrame(all_articles)


In [56]:
# Directory containing the .sgm files
directory = 'reuters21578'

# Process all .sgm files and create a DataFrame
df = process_all_sgm_files(directory)

In [58]:
df.sample(7)

Unnamed: 0,TITLE,LEWISSPLIT,PLACES,DATE,DATELINE,BODY,TOPICS
8690,ECOLAB <ECL> STARTS BID FOR CHEMLAWN <CHEM>,TRAIN,[usa],24-MAR-1987 08:07:26.91,"NEW YORK, March 24 -",Ecolab Inc said it has started its\npreviously...,[acq]
1384,JAPAN CUTTING CHINA CORN COMMITMENTS - USDA,TRAIN,"[usa, japan, china]",3-MAR-1987 17:40:09.18,"WASHINGTON, March 3 -",Japanese traders have apparently\nsharply redu...,"[wheat, corn]"
15210,WALGREEN CO <WAG> 2ND QTR FEB 28 NET,TEST,[usa],8-APR-1987 13:57:29.01,"DEERFIELD, Ill, April 8 -\n",Shr 62 cts vs 58 cts\n Qtly div 13-1/2 cts ...,[earn]
10020,HIGHER U.S. WEEKLY CAR OUTPUT ESTIMATED,TRAIN,[usa],26-MAR-1987 12:38:09.30,"DETROIT, March 26 -","U.S. automakers are expected to build\n167,236...",[]
13637,,TRAIN,[],7-APR-1987 08:31:45.50,,,[]
13723,,TRAIN,[],7-APR-1987 09:25:24.41,,,[]
1123,<ROYAL BANK OF CANADA> 1ST QTR JAN 31 NET,TRAIN,[canada],3-MAR-1987 11:44:14.11,"MONTREAL, March 3 -\n",Shr basic 88 cts vs 1.22 dlrs\n Shr diluted...,[earn]


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21578 entries, 0 to 21577
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   TITLE       20841 non-null  object
 1   LEWISSPLIT  21578 non-null  object
 2   PLACES      21578 non-null  object
 3   DATE        21578 non-null  object
 4   DATELINE    19043 non-null  object
 5   BODY        19043 non-null  object
 6   TOPICS      21578 non-null  object
dtypes: object(7)
memory usage: 1.2+ MB


In [63]:
# Supprimer les articles sans body
df = df.dropna(subset=['BODY'])

# Supprimer les articles avec des topics = '[]'
df = df[df['TOPICS'].apply(lambda x: x != [])]


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10377 entries, 0 to 21575
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   TITLE       10377 non-null  object
 1   LEWISSPLIT  10377 non-null  object
 2   PLACES      10377 non-null  object
 3   DATE        10377 non-null  object
 4   DATELINE    10377 non-null  object
 5   BODY        10377 non-null  object
 6   TOPICS      10377 non-null  object
dtypes: object(7)
memory usage: 648.6+ KB


In [76]:
target_topics = {'money-fx', 'ship', 'interest', 'acq', 'earn'}

# Supprimer les articles qui ne contiennent pas les topics cibles
df_cleaned = df[df['TOPICS'].apply(lambda x: bool(set(x) & target_topics))]

In [81]:
# Si un article contient plusieurs topics, on garde uniquement le topic présent dans target_topics
df_cleaned['TOPICS'] = df_cleaned['TOPICS'].apply(lambda x: list(set(x) & target_topics)[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['TOPICS'] = df_cleaned['TOPICS'].apply(lambda x: list(set(x) & target_topics)[0])


In [95]:
df_cleaned.sample(6)

Unnamed: 0,TITLE,LEWISSPLIT,PLACES,DATE,DATELINE,BODY,TOPICS
6685,SERVICE CORP INTERNATIONAL <SRV> SETS QUARTERLY,TRAIN,[usa],18-MAR-1987 13:43:47.83,"HOUSTON, March 18 -\n",Qtly div eight cts vs eight cts prior\n Pay...,earn
1420,SWAP DEALERS UNVEIL STANDARD CONTRACT,TRAIN,"[uk, usa]",4-MAR-1987 09:30:50.97,"London, March 4 -",The International Swap Dealers\nAssociation ha...,interest
16816,<BIRDSBORO CORP> 4TH QTR LOSS,TEST,[usa],17-APR-1987 09:24:56.94,"MIAMI, April 17 -\n",Shr loss 24 cts vs loss 20 cts\n Net loss 1...,earn
1505,CONVENIENT FOOD MART <CFMI> AGREES TO BUY CHAIN,TRAIN,[usa],4-MAR-1987 10:52:39.54,"ROSEMONT, Ill, March 4 -",Convenient Food Mart Inc said it\nhas tentativ...,acq
15714,FIRST FEDERAL OF MICHIGAN <FFOM> 1ST QTR NET,TEST,[usa],9-APR-1987 12:22:10.55,"DETROIT, April 9 -\n","Shr 3.33 dlrs vs 3.39 dlrs\n Net 37,069,000...",earn
5860,U.K. MONEY MARKET RECEIVES NO MORNING ASSISTANCE,TRAIN,[uk],17-MAR-1987 07:22:27.62,"LONDON, March 17 -",The Bank of England said it did not\noperate i...,interest


In [91]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7175 entries, 8 to 21573
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   TITLE       7175 non-null   object
 1   LEWISSPLIT  7175 non-null   object
 2   PLACES      7175 non-null   object
 3   DATE        7175 non-null   object
 4   DATELINE    7175 non-null   object
 5   BODY        7175 non-null   object
 6   TOPICS      7175 non-null   object
dtypes: object(7)
memory usage: 448.4+ KB


## Descriptive Analysis

In [120]:
# Initialiser le lemmatiseur
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """
    Retourne la catégorie grammaticale (POS) pour un mot.
    """
    from nltk.corpus import wordnet
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_text(text):
    """
    Lemmatisation d'un texte en supprimant les stopwords.
    """
    tokens = word_tokenize(text.lower())
    return [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens if word.isalnum()]

def get_word_frequencies(texts, stop_words):
    """
    Calcule les mots les plus fréquents à partir d'une liste de textes après lemmatisation.
    """
    all_words = []
    for text in texts:
        if isinstance(text, str):
            lemmatized_words = lemmatize_text(text)
            words = [word for word in lemmatized_words if word not in stop_words]
            all_words.extend(words)
    return Counter(all_words).most_common(10)

def compute_tfidf(texts, stop_words, top_n=10):
    """
    Calcule les mots les plus significatifs selon TF-IDF après lemmatisation.
    """
    if len(texts) == 0:
        return []
    
    # Appliquer la lemmatisation et supprimer les stopwords
    processed_texts = [
        ' '.join([word for word in lemmatize_text(text) if word not in stop_words])
        for text in texts if isinstance(text, str)
    ]
    
    # Initialiser le TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer(max_features=1000)
    tfidf_matrix = tfidf_vectorizer.fit_transform(processed_texts)
    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    # Calculer la moyenne des scores TF-IDF pour chaque mot
    tfidf_avg_scores = tfidf_matrix.mean(axis=0).A1
    tfidf_scores = list(zip(feature_names, tfidf_avg_scores))
    
    # Trier les mots par score TF-IDF
    tfidf_scores = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)
    
    return [word for word, _ in tfidf_scores[:top_n]]

def descriptive_analysis(df):
    """
    Analyse descriptive pour identifier les caractéristiques principales de chaque sujet.
    """
    stop_words = set(stopwords.words('english'))  # Stopwords à exclure
    topic_analysis = {}

    for topic in df['TOPICS'].unique():
        topic_data = df[df['TOPICS'] == topic]
        
        # Longueur moyenne du texte
        avg_body_length = topic_data['BODY'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0).mean()
        
        # Fréquence des mots dans BODY
        body_frequencies = get_word_frequencies(topic_data['BODY'], stop_words)
        
        # Fréquence des mots dans TITLE
        title_frequencies = get_word_frequencies(topic_data['TITLE'], stop_words)
        
        # TF-IDF pour BODY
        body_tfidf = compute_tfidf(topic_data['BODY'].dropna().tolist(), stop_words)
        
        # TF-IDF pour TITLE
        title_tfidf = compute_tfidf(topic_data['TITLE'].dropna().tolist(), stop_words)
        
        # Lieux les plus fréquents
        places = [place for sublist in topic_data['PLACES'] for place in sublist]
        places_frequencies = Counter(places).most_common(5)

        # Stocker les résultats
        topic_analysis[topic] = {
            'Average BODY Length': avg_body_length,
            'Top Words in BODY': body_frequencies,
            'TF-IDF Words in BODY': body_tfidf,
            'Top Words in TITLE': title_frequencies,
            'TF-IDF Words in TITLE': title_tfidf,
            'Top Places': places_frequencies
        }
    
    return topic_analysis

-	Money/Foreign Exchange (MONEY-FX)
-	Shipping (SHIP)
-	Interest Rates (INTEREST)
-	Mergers/Acquisitions (ACQ)
-	Earnings and Earnings Forecasts (EARN)


In [124]:
# Perform descriptive analysis
analysis_results = descriptive_analysis(df_cleaned)

# Display results
for topic, details in analysis_results.items():
    print(f"\nTopic: {topic}")
    for key, value in details.items():
        print(f"  {key}: {value}")


Topic: earn
  Average BODY Length: 73.45887676337503
  Top Words in BODY: [('v', 14004), ('mln', 11322), ('ct', 7894), ('dlrs', 5877), ('net', 5116), ('loss', 4648), ('shr', 4002), ('reuter', 3742), ('say', 3363), ('profit', 2997)]
  TF-IDF Words in BODY: ['mln', 'ct', 'loss', 'dlrs', 'net', 'shr', 'rev', 'profit', 'reuter', 'say']
  Top Words in TITLE: [('qtr', 1856), ('net', 1506), ('inc', 1149), ('corp', 750), ('4th', 671), ('loss', 517), ('3rd', 507), ('1st', 448), ('year', 443), ('31', 377)]
  TF-IDF Words in TITLE: ['qtr', 'net', 'inc', 'corp', '4th', '3rd', 'loss', 'year', '1st', '31']
  Top Places: [('usa', 3151), ('canada', 264), ('uk', 92), ('west-germany', 41), ('japan', 31)]

Topic: acq
  Average BODY Length: 128.77782805429865
  Top Words in BODY: [('say', 7469), ('share', 3239), ('dlrs', 2828), ('company', 2820), ('mln', 2258), ('reuter', 2192), ('inc', 1913), ('pct', 1875), ('corp', 1482), ('offer', 1427)]
  TF-IDF Words in BODY: ['say', 'share', 'dlrs', 'company', 'inc

# Predictive analysis

In [157]:
df_cleaned['TOPICS'].value_counts()

TOPICS
earn        3757
acq         2210
money-fx     495
interest     424
ship         289
Name: count, dtype: int64

In [153]:
df_cleaned['LEWISSPLIT'].value_counts()

LEWISSPLIT
TRAIN       4984
TEST        1977
NOT-USED     214
Name: count, dtype: int64

In [155]:
# Split the data into training ( 'Train' and 'not used' ) and testing sets
train_data = df_cleaned[df_cleaned['LEWISSPLIT'].isin(['TRAIN', 'NOT-USED'])]
test_data = df_cleaned[df_cleaned['LEWISSPLIT'] == 'TEST']

print(f"Training data: {train_data.shape[0]} articles")
print(f"Testing data: {test_data.shape[0]} articles")

Training data: 5198 articles
Testing data: 1977 articles


Environ 70% train et 30% test

In [129]:
# Extract features and labels
X_train = train_data['BODY']
y_train = train_data['TOPICS']
X_test = test_data['BODY']
y_test = test_data['TOPICS']

## Traditional Machine Learning Approach

### Naive Bayes

In [145]:
# Create a pipeline that combines the TfidfVectorizer and the MultinomialNB classifier
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Train the model
model.fit(X_train, y_train)

# Predict the topics for the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8730399595346484
Classification Report:
               precision    recall  f1-score   support

         acq       0.77      0.98      0.86       643
        earn       0.95      0.98      0.97      1042
    interest       0.85      0.23      0.36       102
    money-fx       0.81      0.46      0.59       105
        ship       1.00      0.02      0.05        85

    accuracy                           0.87      1977
   macro avg       0.88      0.53      0.56      1977
weighted avg       0.88      0.87      0.84      1977



### Logistic Regression

In [147]:
from sklearn.linear_model import LogisticRegression

# Create a pipeline that combines the TfidfVectorizer and the LogisticRegression classifier
logistic_model = make_pipeline(TfidfVectorizer(), LogisticRegression()) # Each row of the matrix (representing a document) is treated as a feature vector.

# Train the model
logistic_model.fit(X_train, y_train)

# Predict the topics for the test set
y_pred_logistic = logistic_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_logistic))
print("Classification Report:\n", classification_report(y_test, y_pred_logistic))

Accuracy: 0.9539706626201315
Classification Report:
               precision    recall  f1-score   support

         acq       0.94      0.98      0.96       643
        earn       0.98      0.99      0.99      1042
    interest       0.86      0.75      0.80       102
    money-fx       0.79      0.77      0.78       105
        ship       1.00      0.82      0.90        85

    accuracy                           0.95      1977
   macro avg       0.91      0.86      0.89      1977
weighted avg       0.95      0.95      0.95      1977



### Random Forest

In [148]:
from sklearn.ensemble import RandomForestClassifier

# Create a pipeline that combines the TfidfVectorizer and the RandomForestClassifier
random_forest_model = make_pipeline(TfidfVectorizer(), RandomForestClassifier())

# Train the model
random_forest_model.fit(X_train, y_train)

# Predict the topics for the test set
y_pred_rf = random_forest_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Accuracy: 0.9281740010116338
Classification Report:
               precision    recall  f1-score   support

         acq       0.88      0.98      0.93       643
        earn       0.98      0.99      0.98      1042
    interest       0.88      0.65      0.75       102
    money-fx       0.75      0.75      0.75       105
        ship       1.00      0.41      0.58        85

    accuracy                           0.93      1977
   macro avg       0.90      0.75      0.80      1977
weighted avg       0.93      0.93      0.92      1977



### SVC (Support Vector Classifier)

In [152]:
from sklearn.svm import SVC

# Create a pipeline that combines the TfidfVectorizer and the SVC classifier
svc_model = make_pipeline(TfidfVectorizer(), SVC())

# Train the model
svc_model.fit(X_train, y_train)

# Predict the topics for the test set
y_pred_svc = svc_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_svc))
print("Classification Report:\n", classification_report(y_test, y_pred_svc))

Accuracy: 0.9590288315629742
Classification Report:
               precision    recall  f1-score   support

         acq       0.95      0.98      0.96       643
        earn       0.98      0.99      0.99      1042
    interest       0.88      0.77      0.82       102
    money-fx       0.82      0.85      0.83       105
        ship       1.00      0.80      0.89        85

    accuracy                           0.96      1977
   macro avg       0.93      0.88      0.90      1977
weighted avg       0.96      0.96      0.96      1977



## Bonus

Prédire les topics des articles ayant un body non vide et un topic non renseigné.