In [34]:
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import pandas as pd

In [55]:
def parse_reuters_sgm(file_path):

    with open(file_path, 'r', encoding='latin-1') as file:
        data = file.read()
    
    soup = BeautifulSoup(data, 'html.parser')
    articles = []

    for reuters in soup.find_all('reuters'):

        # Extract TITLE
        title = reuters.find('title').text if reuters.find('title') else None

        # Extract LEWISSPLIT attribute
        lewissplit = reuters.get('lewissplit', 'UNKNOWN')

        # Extract PLACES
        places = [d.text for d in reuters.find('places').find_all('d')] if reuters.find('places') else []

        # Extract DATE
        date = reuters.find('date').text if reuters.find('date') else None

        # Extract DATELINE
        dateline = reuters.find('dateline').text if reuters.find('dateline') else None

        # Extract BODY
        body = reuters.find('text').body.text if reuters.find('text') and reuters.find('text').body else None

        # Extract TOPICS
        topics = [d.text for d in reuters.find('topics').find_all('d')] if reuters.find('topics') else []

        # Append all extracted features to articles list
        articles.append({
            'TITLE': title,
            'LEWISSPLIT': lewissplit,
            'PLACES': places,
            'DATE': date,
            'DATELINE': dateline,
            'BODY': body,
            'TOPICS': topics
        })
    
    return articles

def process_all_sgm_files(directory_path):
    """
    Process all .sgm files in a directory and combine the data into a single DataFrame.
    """
    all_articles = []
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.sgm'):
            file_path = os.path.join(directory_path, file_name)
            articles = parse_reuters_sgm(file_path)
            all_articles.extend(articles)
    
    return pd.DataFrame(all_articles)


In [56]:
# Directory containing the .sgm files
directory = 'reuters21578'

# Process all .sgm files and create a DataFrame
df = process_all_sgm_files(directory)

In [58]:
df.sample(7)

Unnamed: 0,TITLE,LEWISSPLIT,PLACES,DATE,DATELINE,BODY,TOPICS
8690,ECOLAB <ECL> STARTS BID FOR CHEMLAWN <CHEM>,TRAIN,[usa],24-MAR-1987 08:07:26.91,"NEW YORK, March 24 -",Ecolab Inc said it has started its\npreviously...,[acq]
1384,JAPAN CUTTING CHINA CORN COMMITMENTS - USDA,TRAIN,"[usa, japan, china]",3-MAR-1987 17:40:09.18,"WASHINGTON, March 3 -",Japanese traders have apparently\nsharply redu...,"[wheat, corn]"
15210,WALGREEN CO <WAG> 2ND QTR FEB 28 NET,TEST,[usa],8-APR-1987 13:57:29.01,"DEERFIELD, Ill, April 8 -\n",Shr 62 cts vs 58 cts\n Qtly div 13-1/2 cts ...,[earn]
10020,HIGHER U.S. WEEKLY CAR OUTPUT ESTIMATED,TRAIN,[usa],26-MAR-1987 12:38:09.30,"DETROIT, March 26 -","U.S. automakers are expected to build\n167,236...",[]
13637,,TRAIN,[],7-APR-1987 08:31:45.50,,,[]
13723,,TRAIN,[],7-APR-1987 09:25:24.41,,,[]
1123,<ROYAL BANK OF CANADA> 1ST QTR JAN 31 NET,TRAIN,[canada],3-MAR-1987 11:44:14.11,"MONTREAL, March 3 -\n",Shr basic 88 cts vs 1.22 dlrs\n Shr diluted...,[earn]


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21578 entries, 0 to 21577
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   TITLE       20841 non-null  object
 1   LEWISSPLIT  21578 non-null  object
 2   PLACES      21578 non-null  object
 3   DATE        21578 non-null  object
 4   DATELINE    19043 non-null  object
 5   BODY        19043 non-null  object
 6   TOPICS      21578 non-null  object
dtypes: object(7)
memory usage: 1.2+ MB


In [63]:
# Supprimer les articles sans body
df = df.dropna(subset=['BODY'])

# Supprimer les articles avec des topics = '[]'
df = df[df['TOPICS'].apply(lambda x: x != [])]


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10377 entries, 0 to 21575
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   TITLE       10377 non-null  object
 1   LEWISSPLIT  10377 non-null  object
 2   PLACES      10377 non-null  object
 3   DATE        10377 non-null  object
 4   DATELINE    10377 non-null  object
 5   BODY        10377 non-null  object
 6   TOPICS      10377 non-null  object
dtypes: object(7)
memory usage: 648.6+ KB


In [75]:
df.sample(10)

Unnamed: 0,TITLE,LEWISSPLIT,PLACES,DATE,DATELINE,BODY,TOPICS
11965,WEATHERFORD INTERNATIONAL INC <WII> 4TH QTR LOSS,TRAIN,[usa],1-APR-1987 10:40:25.58,"HOUSTON, April 1 -\n",Shr loss 40 cts vs loss 1.30 dlrs\n Net los...,[earn]
941,NO BUNDESBANK POLICY CHANGES EXPECTED THURSDAY,TRAIN,[west-germany],3-MAR-1987 07:13:12.48,"FRANKFURT, March 3 -",The Bundesbank is unlikely to change\nits cred...,"[interest, money-fx]"
3911,WALL STREET STOCKS/SUPERMARKETS GENERAL <SGL>,TRAIN,[usa],11-MAR-1987 15:36:43.95,"NEW YORK, March 11 -","Dart Group Corp <DARTA>, which wants\nto acqui...",[acq]
5785,EC WARNS U.S. AND JAPAN ON TRADE TENSIONS,TRAIN,"[usa, belgium, japan]",17-MAR-1987 00:13:01.09,"BRUSSELS, March 17 -",The European Community (EC) yesterday\nwarned ...,"[trade, bop]"
2625,FED NOT EXPECTED TO ACT IN MONEY MARKETS,TRAIN,[usa],6-MAR-1987 10:52:09.53,"NEW YORK, March 6 -",The Federal Reserve is unlikely to\noperate in...,"[money-fx, interest]"
18299,PROFESSOR LIFTS BANC TEXAS <BTX> PREFERRED STAKE,TEST,[usa],2-JUN-1987 16:40:49.41,"WASHINGTON, June 2 -",A University of Massachusetts finance\nprofess...,[acq]
7112,MANOR CARE INC <MNR> 3RD QTR FEB 28 NET,TRAIN,[usa],19-MAR-1987 10:20:41.29,"SILVER SPRING, Md., March 19 -\n","Shr 24 cts vs 21 cts\n Net 9,700,000 vs 8,2...",[earn]
17306,TRADE MINISTERS SAY GOVERNMENTS NEED CREDIBILITY,NOT-USED,[japan],27-APR-1987 08:47:37.96,"KASHIKOJIMA, Japan, April 27 -",Four trade ministers ended a\nweekend meeting ...,"[trade, money-fx]"
10846,TEXAS INTERNATIONAL <TEI> HAS UNQUALIFIED AUDIT,TRAIN,[usa],30-MAR-1987 11:37:39.48,"OKLAHOMA CITY, March 30 -",Texas International Inc said it\nhas received ...,[earn]
14891,MALAYSIA MAY NOT MEET 1987 OIL PALM TARGET,TEST,[malaysia],8-APR-1987 06:50:05.68,"KUALA LUMPUR, April 8 -",Malaysia is unlikely to meet its\ntargeted out...,"[veg-oil, palm-oil, soy-oil, oilseed, soybean]"


In [76]:
target_topics = {'money-fx', 'ship', 'interest', 'acq', 'earn'}

# Supprimer les articles qui ne contiennent pas les topics cibles
df_cleaned = df[df['TOPICS'].apply(lambda x: bool(set(x) & target_topics))]

In [81]:
# Si un article contient plusieurs topics, on garde uniquement le topic présent dans target_topics
df_cleaned['TOPICS'] = df_cleaned['TOPICS'].apply(lambda x: list(set(x) & target_topics)[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['TOPICS'] = df_cleaned['TOPICS'].apply(lambda x: list(set(x) & target_topics)[0])


In [95]:
df_cleaned.sample(6)

Unnamed: 0,TITLE,LEWISSPLIT,PLACES,DATE,DATELINE,BODY,TOPICS
6685,SERVICE CORP INTERNATIONAL <SRV> SETS QUARTERLY,TRAIN,[usa],18-MAR-1987 13:43:47.83,"HOUSTON, March 18 -\n",Qtly div eight cts vs eight cts prior\n Pay...,earn
1420,SWAP DEALERS UNVEIL STANDARD CONTRACT,TRAIN,"[uk, usa]",4-MAR-1987 09:30:50.97,"London, March 4 -",The International Swap Dealers\nAssociation ha...,interest
16816,<BIRDSBORO CORP> 4TH QTR LOSS,TEST,[usa],17-APR-1987 09:24:56.94,"MIAMI, April 17 -\n",Shr loss 24 cts vs loss 20 cts\n Net loss 1...,earn
1505,CONVENIENT FOOD MART <CFMI> AGREES TO BUY CHAIN,TRAIN,[usa],4-MAR-1987 10:52:39.54,"ROSEMONT, Ill, March 4 -",Convenient Food Mart Inc said it\nhas tentativ...,acq
15714,FIRST FEDERAL OF MICHIGAN <FFOM> 1ST QTR NET,TEST,[usa],9-APR-1987 12:22:10.55,"DETROIT, April 9 -\n","Shr 3.33 dlrs vs 3.39 dlrs\n Net 37,069,000...",earn
5860,U.K. MONEY MARKET RECEIVES NO MORNING ASSISTANCE,TRAIN,[uk],17-MAR-1987 07:22:27.62,"LONDON, March 17 -",The Bank of England said it did not\noperate i...,interest


In [91]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7175 entries, 8 to 21573
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   TITLE       7175 non-null   object
 1   LEWISSPLIT  7175 non-null   object
 2   PLACES      7175 non-null   object
 3   DATE        7175 non-null   object
 4   DATELINE    7175 non-null   object
 5   BODY        7175 non-null   object
 6   TOPICS      7175 non-null   object
dtypes: object(7)
memory usage: 448.4+ KB


## Bonus

Prédire les topics des articles ayant un body non vide et un topic non renseigné.