
# Data Processing and Extraction using NLP
This notebook processes Dutch-language textual data, identifies important sentences using Natural Language Processing (NLP) heuristics,
and exports the results to a CSV file. The dataset used contains various text snippets, and our goal is to
extract the informative sentences by recognizing named entities and other key aspects, such as sentence length and part-of-speech tags.


In [1]:
import pandas as pd
import re
import spacy

# Enable GPU usage for spaCy if available.
# This must happen *before* the pipeline is loaded: data that has already been
# allocated on the CPU is not moved to the GPU afterwards. The call returns
# False (and spaCy stays on the CPU) when no GPU can be used.
gpu_enabled = spacy.prefer_gpu()

# Load spaCy model (Dutch language model for this case)
nlp = spacy.load("nl_core_news_lg")

# Show whether the GPU was activated
gpu_enabled


False

In [2]:
# Load the dataset
try:
    df = pd.read_csv('Antwerpen_c_2.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    # Report the problem, then re-raise: without the file `df` is never
    # defined, so continuing would only surface later as a confusing
    # NameError in the cells that use it.
    print("Error: Dataset file not found.")
    raise

Dataset loaded successfully.


In [3]:
# Split the text into sentences using regex for sentence boundary detection
def split_sentences(text):
    """Split ``text`` into sentences at punctuation followed by a capital letter.

    A sentence boundary is a run of whitespace that is preceded by '.', '!'
    or '?' and followed by an uppercase letter. Besides ASCII 'A'-'Z', the
    uppercase Latin-1 letters with diacritics (e.g. 'É', 'Ö') count as
    capitals, so a sentence starting with "Één ..." is split off correctly.

    Args:
        text: The text to split. Non-string values (e.g. ``None`` or the
            float NaN pandas uses for an empty CSV cell) are treated as
            containing no sentences.

    Returns:
        list[str]: The sentences in their original order, with the
        separating whitespace removed. An empty list for non-string input.
    """
    # re.split raises TypeError on non-strings, so missing values are
    # filtered out here instead of crashing the whole processing loop.
    if not isinstance(text, str):
        return []

    # Lookbehind keeps the punctuation with the preceding sentence; lookahead
    # keeps the capital with the following one. \u00C0-\u00D6 and
    # \u00D8-\u00DE are the uppercase Latin-1 letters (skipping U+00D7, the
    # multiplication sign).
    sentence_pattern = r'(?<=[.!?])\s+(?=[A-Z\u00C0-\u00D6\u00D8-\u00DE])'
    return re.split(sentence_pattern, text)

# Decide whether a sentence is worth keeping
def is_important(sentence):
    """Return True when the sentence meets at least one importance criterion.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
    is considered important if any of the following holds:

    * the parse contains at least one named entity;
    * the parse has more than 5 and fewer than 40 tokens;
    * at least one token is tagged as a proper noun (PROPN) or number (NUM).

    Args:
        sentence: The sentence text to evaluate.

    Returns:
        bool: True if any criterion is met, otherwise False.
    """
    parsed = nlp(sentence)

    # Criteria are evaluated left to right and short-circuit on the first hit.
    return (
        len(parsed.ents) > 0
        or 5 < len(parsed) < 40
        or any(tok.pos_ in ('PROPN', 'NUM') for tok in parsed)
    )

In [4]:

# Flatten every document body into one list of sentences
all_sentences = [
    sentence
    for text in df['body_content']
    for sentence in split_sentences(text)
]

# One row per distinct sentence (the first occurrence is kept)
sentences_df = (
    pd.DataFrame(all_sentences, columns=['sentence'])
    .drop_duplicates(subset='sentence')
)

# Flag each sentence using the importance heuristics
sentences_df['important'] = sentences_df['sentence'].apply(is_important)

# Keep only the flagged rows; the helper column is no longer needed afterwards
flagged_rows = sentences_df['important'].eq(True)
important_sentences_df = sentences_df[flagged_rows].drop(columns=['important'])

In [5]:
# Substrings that disqualify a sentence (matched case-insensitively).
# NOTE(review): these look like website boilerplate / contact-detail markers — confirm.
keywords = [
    'browser', 'menu', 'contact', '2020 antwerpen', 'sportcentrum', 'bel',
    'surf', 'ook interessant', 'locatie', ':', 'telefoneer', 'schrijf',
    'website', 'aanbod', 'gezin', 'euro', 'uur', 'mail', ':', 'tel+', '@',
    '€', 'stadsplan', 'leaflet', 'gemeentearchief', 'cookie', 'NL',
    'internetbrowser', 'E-mail', '©', '™',
]

# Escape every keyword so it is matched literally. Unescaped, 'tel+' is the
# regex "te followed by one or more l" and would also hit ordinary words
# such as "stelt" or "tellen".
pattern = '|'.join(re.escape(keyword) for keyword in keywords)

# na=False makes missing values count as "no match", so the mask is purely
# boolean and can be safely inverted with ~.
contains_keyword = important_sentences_df['sentence'].str.contains(pattern, case=False, na=False)

important_sentences_df = important_sentences_df[~contains_keyword]

In [6]:
# Character length of every remaining sentence, and their mean
sentence_lengths = important_sentences_df['sentence'].str.len()
average_length = sentence_lengths.mean()

# Drop sentences that are more than 18 characters shorter than the average.
# NOTE(review): the 18-character margin looks empirically chosen — confirm.
minimum_length = average_length - 18
important_sentences_df = important_sentences_df[sentence_lengths >= minimum_length]

In [7]:
# Renumber the rows 0..n-1; the old index (left over from filtering) is discarded
important_sentences_df = important_sentences_df.reset_index(drop=True)

In [8]:
# Write the filtered sentences to disk, without the index column
output_path = 'important_sentences.csv'
important_sentences_df.to_csv(output_path, index=False)
print("Important sentences exported successfully.")

Important sentences exported successfully.


In [9]:
# Preview the first five important sentences
important_sentences_df.head(n=5)

Unnamed: 0,sentence
0,Je krijgt die als je bent ingeschreven en de n...
1,Zo krijg je snel de juiste informatie en instr...
2,Nationale test BE-Alert 18 steden en gemeenten...
3,Stap als het kan over op één van de park-and-r...
4,"Iedereen kan ze inkijken, er een afschrift van..."
