
# Data Processing and Extraction using NLP
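This notebook processes textual data, identifies important sentences using Natural Language Processing (NLP),
and exports the results to a CSV file. The dataset (`antwerpen.csv`, column `body_content`) contains various text
snippets. Each snippet is split into sentences; a sentence is kept when it looks important (it contains named
entities, proper nouns or numbers, or has a reasonable length), is detected as Dutch, and survives the keyword
and length filters. The remaining sentences are written to `important_sentences.csv`.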


In [34]:
import pandas as pd
import re
import spacy
from langdetect import detect, DetectorFactory

# Enable GPU usage for spaCy if available.
# This has to happen BEFORE the pipeline is loaded: data that has already been
# allocated on the CPU is not moved to the GPU afterwards.
gpu_enabled = spacy.prefer_gpu()

# Load spaCy model (Dutch language model for this case)
nlp = spacy.load("nl_core_news_lg")

# Display whether the GPU was activated (True) or spaCy fell back to the CPU (False)
gpu_enabled


False

In [35]:
# Load the dataset.
# If the file is missing, the error is reported and re-raised: continuing would leave
# `df` undefined (or bound to stale data from an earlier run) and every later cell
# would fail or silently process the wrong input.
try:
    df = pd.read_csv('antwerpen.csv')
except FileNotFoundError:
    print("Error: Dataset file not found.")
    raise
else:
    print("Dataset loaded successfully.")

Dataset loaded successfully.


In [36]:
# Split the text into sentences using regex for sentence boundary detection
def split_sentences(text):
    """Split ``text`` into sentences with a regex-based boundary heuristic.

    A boundary is a run of whitespace that follows ``.``, ``!`` or ``?`` and is
    followed by an uppercase letter. Besides ASCII ``A-Z`` the uppercase check also
    accepts the accented Latin-1 capitals (e.g. ``É``), so sentences such as
    "Één keer." are split off correctly.

    Args:
        text: The text to split. Non-string values (``None``, NaN from pandas, ...)
            and blank strings are tolerated.

    Returns:
        list[str]: The sentences in original order; an empty list when ``text`` is
        not a string or contains only whitespace.
    """
    # Missing values coming from a DataFrame column are floats (NaN); re.split()
    # would raise a TypeError on them.
    if not isinstance(text, str) or not text.strip():
        return []

    # \u00C0-\u00D6 and \u00D8-\u00DE are the accented uppercase letters of Latin-1
    # (the multiplication sign \u00D7 in between is deliberately excluded).
    sentence_pattern = r'(?<=[.!?])\s+(?=[A-Z\u00C0-\u00D6\u00D8-\u00DE])'
    return re.split(sentence_pattern, text)

# Decide whether a sentence is worth keeping
def is_important(sentence):
    """Return True when ``sentence`` passes at least one importance heuristic.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and is
    considered important when any of the following holds:

    * the parse contains at least one named entity,
    * the parse has more than 3 and fewer than 30 tokens,
    * at least one token is tagged as a proper noun (``PROPN``) or number (``NUM``).

    Args:
        sentence: The sentence text to analyse.

    Returns:
        bool: True if any heuristic matches, False otherwise.
    """
    parsed = nlp(sentence)

    has_entities = bool(parsed.ents)
    has_reasonable_length = 3 < len(parsed) < 30

    # The checks are independent of each other, so a single short-circuiting
    # expression is equivalent to testing them one after another.
    return (
        has_entities
        or has_reasonable_length
        or any(tok.pos_ in ('PROPN', 'NUM') for tok in parsed)
    )



# Ensure consistent results: fix the seed used by langdetect's DetectorFactory
DetectorFactory.seed = 0

def is_dutch(text):
    """Return True when langdetect identifies ``text`` as Dutch (``'nl'``).

    Args:
        text: The text whose language should be detected.

    Returns:
        bool: True if the detected language code is ``'nl'``; False for any other
        language or when detection fails.
    """
    try:
        return detect(text) == 'nl'
    except Exception:
        # Detection failures (presumably text without usable features, or non-string
        # input) count as "not Dutch". Catching Exception instead of using a bare
        # `except:` keeps KeyboardInterrupt/SystemExit working, so a long-running
        # .apply() over many sentences can still be interrupted.
        return False

In [37]:

# Process all sentences in the dataframe.
# Rows without body text are skipped: a missing value is a float (NaN) and would make
# the regex split inside split_sentences() raise a TypeError.
all_sentences = [
    sentence
    for text in df['body_content'].dropna()
    for sentence in split_sentences(text)
]

# Create a DataFrame for sentences and remove duplicate sentences
# (the original row labels are kept, so the index is not contiguous afterwards)
sentences_df = pd.DataFrame(all_sentences, columns=['sentence']).drop_duplicates(subset='sentence')

# Mark important sentences; astype(bool) guarantees a boolean mask even when the
# frame is empty
sentences_df['important'] = sentences_df['sentence'].apply(is_important).astype(bool)

# Filter only important sentences
important_sentences_df = sentences_df[sentences_df['important']].drop(columns=['important'])

# Flag the sentences that are detected as Dutch
important_sentences_df['dutch'] = important_sentences_df['sentence'].apply(is_dutch).astype(bool)

# Keep only the Dutch sentences
dutch_sentences_df = important_sentences_df[important_sentences_df['dutch']].drop(columns=['dutch'])

In [38]:
# Preview the important Dutch sentences (the notebook renders a truncated view of the frame)
dutch_sentences_df

Unnamed: 0,sentence
0,Ga direct naar de inhoud NL FR DE EN Contact C...
1,Meld het ons Adressen en openingsuren Contactg...
2,"Als je optionele cookies weigert, worden allee..."
3,Bepaalde inhoud (zoals van YouTube) is alleen ...
4,Je kan je cookievoorkeuren altijd aanpassen vi...
...,...
331661,Dan ben je hier zeker op de juiste plaats! © V...
331670,Meld het ons Adressen en openingsuren Contactg...
331671,Tijd om als vrijwilliger aan de slag te gaan b...
331682,Meer info en inschrijven Meer info Arvid van L...


In [39]:
# Substrings that mark boilerplate (navigation, contact details, cookie banners, ...).
# Matching is case-insensitive and on substrings, so short entries such as 'bel',
# 'NL' or ':' remove a lot of sentences.
keywords=['browser','menu','contact','2020 antwerpen','sportcentrum','bel','surf','ook interessant','locatie',':','telefoneer','schrijf','website','aanbod','gezin','euro','mail','tel+','@','€','stadsplan','leaflet','gemeentearchief','cookie', 'NL','internetbrowser','E-mail','©','™']

# Escape every keyword so it is matched literally. Without escaping, 'tel+' is read
# as the regex "te" followed by one or more "l" and would also drop sentences that
# merely contain "tel" (e.g. "hotel", "stellen").
# NOTE(review): 'tel+' is presumably meant as a literal phone prefix - confirm.
pattern = '|'.join(re.escape(keyword) for keyword in keywords)

# Drop every sentence that contains at least one of the keywords
dutch_sentences_df = dutch_sentences_df[~dutch_sentences_df['sentence'].str.contains(pattern, case=False)]

In [40]:
# Character length of every remaining sentence, computed once and reused below
sentence_lengths = dutch_sentences_df['sentence'].str.len()
average_length = sentence_lengths.mean()

# Keep sentences that are at most 18 characters shorter than the average length
dutch_sentences_df = dutch_sentences_df[sentence_lengths >= average_length - 18]

In [41]:
# Renumber the rows from 0 after the filtering steps; the old row labels are discarded
dutch_sentences_df = dutch_sentences_df.reset_index(drop=True)

In [42]:
# Write the filtered sentences, together with their row index, to a CSV file
output_path = 'important_sentences.csv'
dutch_sentences_df.to_csv(output_path, index=True)
print("Important sentences exported successfully.")

Important sentences exported successfully.


In [43]:
# Show the first five important sentences
dutch_sentences_df.head(n=5)

Unnamed: 0,sentence
0,"Bij een noodsituatie zoals een grote brand, ee..."
1,Nationale test BE-Alert 18 steden en gemeenten...
2,Vrouwen van 50 tot 69 jaar die de voorbije 2 j...
3,Spoor borstkanker tijdig op Borstkanker is de ...
4,Daarom is een vroegtijdige ontdekking van de k...


In [54]:
# Re-load the exported sentences, reading only the 'sentence' column. The file was
# written together with its index; reading that index back as data would add a stray
# 'Unnamed: 0' column that is then carried into the final CSV.
# NOTE: this rebinds `df`, which earlier in the notebook held the raw dataset.
df = pd.read_csv('important_sentences.csv', usecols=['sentence'])

In [55]:
# Rows without a sentence cannot be filtered or quoted (a missing value is a float,
# not a string), so they are dropped first.
df_valid = df.dropna(subset=['sentence'])

# Filter out rows in the 'sentence' column that contain both '|' and 'BE'
contains_pipe = df_valid['sentence'].str.contains('|', regex=False)
contains_be = df_valid['sentence'].str.contains('BE', regex=False)

# .copy() makes the result an independent frame, so the column assignment below
# does not write into a slice of `df` (avoids SettingWithCopyWarning).
df_filtered = df_valid[~(contains_pipe & contains_be)].copy()

# Add double quotes at the beginning and end of sentences that do not already have them
# NOTE(review): these quotes become part of the data and are presumably escaped again
# by to_csv - confirm that this is the intended file format.
df_filtered['sentence'] = df_filtered['sentence'].apply(
    lambda x: x if x.startswith('"') and x.endswith('"') else f'"{x}"'
)

# Rename the index column to 'index' and reset it to start from 0
df_filtered = df_filtered.reset_index(drop=True)
df_filtered.index.name = 'index'

# Save the modified DataFrame to 'important_sentences.csv'
df_filtered.to_csv('important_sentences.csv', index=True)


In [56]:
# Write the final CSV once more, this time without the index column; this overwrites
# the file of the same name that the previous cell saved with the index included.
df_filtered.to_csv('important_sentences.csv', index=False)