In [9]:
%pip install stop-words

Note: you may need to restart the kernel to use updated packages.


### Binary classifier that predicts whether a document's text concerns a geothermal project

In [10]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from stop_words import get_stop_words
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [11]:
# Load the dataset
df_geothermal = pd.read_csv('../dataset.csv')
df_not_geothermal = pd.read_csv('../dataset-random.csv')

# Load a random subset of the dataset for development
# TODO: Load the entire dataset
# A fixed seed keeps the sampled rows identical across "Restart & Run All";
# min() avoids the ValueError that sample() raises when a file holds fewer
# rows than requested.
N_SAMPLES = 200
SEED = 42
df_geothermal = df_geothermal.sample(
    min(N_SAMPLES, len(df_geothermal)), random_state=SEED
)
df_not_geothermal = df_not_geothermal.sample(
    min(N_SAMPLES, len(df_not_geothermal)), random_state=SEED
)


def extract_text(url, timeout=30):
    """Download ``url`` and return the page's visible text on a single line.

    Args:
        url: Address passed to ``requests.get``.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a single unresponsive host blocks the whole
            ``apply`` forever.

    Returns:
        The text of the parsed HTML with every run of whitespace collapsed to
        one space and leading/trailing whitespace removed.

    Raises:
        requests.exceptions.RequestException: propagated from ``requests.get``
            (connection failure, timeout, ...).
    """
    # NOTE(review): the HTTP status code is not checked, so the body of an
    # error page is returned like any other text.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # A space separator keeps words from adjacent tags apart
    # ("<p>a</p><p>b</p>" -> "a b" rather than "ab").
    text = soup.get_text(separator=' ')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Fetch the text behind every 'fulltext' entry and store it in a new column,
# for the geothermal frame first and then for the random one.
for frame in (df_geothermal, df_not_geothermal):
    frame['extracted_text'] = frame['fulltext'].apply(extract_text)

  k = self.parse_starttag(i)
  k = self.parse_starttag(i)


In [12]:
# Tag each frame with its class: 1 = geothermal, 0 = not geothermal
for frame, label_value in ((df_geothermal, 1), (df_not_geothermal, 0)):
    frame['label'] = label_value

# Stack both frames into one dataset (original row indices are kept)
df = pd.concat([df_geothermal, df_not_geothermal])

In [13]:
def preprocess_documents(documents, language='fr', max_features=5000, ngram_range=(1, 2)):
    """Fit a TF-IDF vectorizer on ``documents`` and return it with the features.

    Args:
        documents: Iterable of text strings (pandas Series, list, tuple, ...).
        language: Language code handed to ``get_stop_words``.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
        ngram_range: Forwarded to ``TfidfVectorizer(ngram_range=...)``.

    Returns:
        A ``(vectorizer, X)`` tuple: the fitted vectorizer and whatever
        ``vectorizer.fit_transform`` returned for the documents.

    Raises:
        TypeError: if ``documents`` is a single string instead of a collection
            of strings.
    """
    if isinstance(documents, str):
        # list("abc") would silently turn one document into three.
        raise TypeError('documents must be an iterable of strings, not a single string')

    stop_words = get_stop_words(language)
    # Convert texts to TF-IDF features
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        stop_words=stop_words,
        ngram_range=ngram_range,
    )
    # list() accepts any iterable; .tolist() exists only on pandas/numpy containers.
    texts = list(documents)
    return vectorizer, vectorizer.fit_transform(texts)

In [14]:
def train_classifier(X, y):
    """Fit a logistic-regression model and print a hold-out evaluation.

    20% of the rows are set aside with a fixed ``random_state`` of 42, the
    model is fitted on the remaining rows, and the classification report for
    the held-out rows is printed.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    features_fit, features_eval, labels_fit, labels_eval = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # fit() returns the estimator itself, so creation and training chain.
    model = LogisticRegression().fit(features_fit, labels_fit)

    # Report precision / recall / f1 on the held-out part.
    print(classification_report(labels_eval, model.predict(features_eval)))

    return model

In [15]:
def create_geothermal_filter(documents, labels):
    """Train on ``documents``/``labels`` and return a single-document predictor.

    The documents are vectorized with ``preprocess_documents`` and a model is
    trained with ``train_classifier`` (which also prints its evaluation
    report). Note that the vectorizer is fitted on all ``documents`` before
    ``train_classifier`` makes its internal hold-out split.

    Args:
        documents: Training texts.
        labels: Labels aligned with ``documents``.

    Returns:
        A function taking one text and returning the first element of the
        classifier's prediction for it.
    """
    vectorizer, features = preprocess_documents(documents)
    classifier = train_classifier(features, labels)

    def filter_document(new_doc):
        """Predict the label of ``new_doc`` with the trained vectorizer and model."""
        prediction = classifier.predict(vectorizer.transform([new_doc]))
        return prediction[0]

    return filter_document

# Split df into training and testing data.
# random_state pins the split so the report and the predictions below are
# reproducible across runs.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

filter_func = create_geothermal_filter(df_train['extracted_text'], df_train['label'])

# Apply the filter to the test set.
# assign() builds a new frame instead of writing a column into the object
# returned by the split, which can trigger pandas' SettingWithCopyWarning.
df_test = df_test.assign(predicted=df_test['extracted_text'].apply(filter_func))


              precision    recall  f1-score   support

           0       0.94      0.91      0.92        33
           1       0.91      0.94      0.92        31

    accuracy                           0.92        64
   macro avg       0.92      0.92      0.92        64
weighted avg       0.92      0.92      0.92        64



In [16]:
# Show doc_id, predicted label and true label for every test row
print(df_test.loc[:, ['doc_id', 'predicted', 'label']])

                                                  doc_id  predicted  label
18407  35638/4771de8aecc7f92d2f4676ffcfbbd87353f0d017...          1      1
451                       2719/37b27_vc5ltu3jsb46v15.pdf          0      0
1481   2899/922a364ba232297cd39c5057a21e6d5e184f632a_...          1      1
910         2694/2772a_221206_FLYER_PLAN_DECHETS_WEB.pdf          0      0
12773  1928/b0fdf18ba81f63a57cfeefda4781d897fb216711_...          1      1
...                                                  ...        ...    ...
267    2401/b8f75_Budget%20primitif%20budget%20princi...          0      0
735    3073/56ef8_declaration-dinstallation-dANC.2023...          0      0
11441  6798/70ac5e6b7e382f1bfd5683ffa8fc8d7be8a149d5_...          0      1
207                       3181/3cba0_AT-2023-MEB-064.pdf          0      0
669    2646/e2749_ANNEXES-du-reglement-de-services-20...          0      0

[80 rows x 3 columns]
