# Import Dependencies

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import spacy
import nltk
import re
from collections import Counter
from lexical_diversity import lex_div as ld

# Download NLTK stopwords dataset
nltk.download('stopwords')
from nltk.corpus import stopwords

# Loading Synthetic Data

## Open Training Dataset Social Services

In [None]:
# Path to the synthetic social-services training texts
file_path = '../data/synthetic-data-social-services.txt'

# Read the whole file. The encoding is pinned so the German umlauts decode the
# same way on every platform instead of following the locale default.
# NOTE(review): assumes the data files are saved as UTF-8 - confirm.
with open(file_path, 'r', encoding='utf-8') as file:
    raw_data = file.read()

# Entries are separated from each other by a blank line
entries = raw_data.strip().split('\n\n')

# Collect the title and content of every entry
titles = []
contents = []

for entry in entries:
    # The first line is the title, everything after the first '\n' is the
    # content. partition() yields an empty content for a title-only entry.
    title, _, content = entry.partition('\n')
    titles.append(title)
    contents.append(content)

# Create a DataFrame from the lists
df_train = pd.DataFrame({
    'title': titles,
    'content': contents
})

print(df_train.shape)

## Add Manually Labeled Service Categories

In [None]:
# Path to the manually assigned category labels (one label per line)
file_path = '../data/synthetic-data-social-services-categories.txt'

# Read the whole file with a pinned encoding so labels containing umlauts do
# not depend on the platform's locale default.
# NOTE(review): assumes the data files are saved as UTF-8 - confirm.
with open(file_path, 'r', encoding='utf-8') as file:
    raw_data = file.read()

# Repeat each label 10 times so every run of 10 consecutive rows in df_train
# gets the same category. strip() drops a trailing newline at the end of the
# file, which would otherwise produce an extra empty label and make the list
# longer than df_train.
df_train['category'] = [term for term in raw_data.strip().split('\n') for _ in range(10)]

## Add General Website Content

In [None]:
# Path to the synthetic press releases
file_path = '../data/synthetic-data-press-releases.txt'

# Read the whole file with a pinned encoding so the German umlauts decode the
# same way on every platform instead of following the locale default.
# NOTE(review): assumes the data files are saved as UTF-8 - confirm.
with open(file_path, 'r', encoding='utf-8') as file:
    raw_data = file.read()

# Split the raw data into blank-line separated blocks
entries = raw_data.strip().split('\n\n')

# Blocks alternate: every even-indexed block carries the title on its first
# line, every odd-indexed block is the matching content.
titles = [el.split('\n')[0] for el in entries[::2]]
contents = entries[1::2]

# Create a DataFrame from the lists
df_test = pd.DataFrame({
    'title': titles,
    'content': contents
})

print(df_test.shape)

In [None]:
# Path to the synthetic non-service ("noise") texts
file_path = '../data/synthetic-data-non-service-texts.txt'

# Read the whole file with a pinned encoding so the German umlauts decode the
# same way on every platform instead of following the locale default.
# NOTE(review): assumes the data files are saved as UTF-8 - confirm.
with open(file_path, 'r', encoding='utf-8') as file:
    raw_data = file.read()

# Split the raw data into blank-line separated entries, each as a list of lines
entries = [el.split('\n') for el in raw_data.strip().split('\n\n')]

# The first line of an entry is its title, the second line its content (any
# further lines are ignored). A title-only entry gets an empty content, as in
# the training-data loader, instead of raising an IndexError.
titles = [el[0] for el in entries]
contents = [el[1] if len(el) > 1 else '' for el in entries]

# Create a DataFrame from the lists
df_noise = pd.DataFrame({
    'title': titles,
    'content': contents
})

print(df_noise.shape)

## Raw Web Data Social Services

In [None]:
# Stack the service texts (title/content columns only), the press releases and
# three copies of the noise texts into one frame with a fresh index
frames = [df_train[['title', 'content']], df_test] + [df_noise] * 3
combined_df = pd.concat(frames, ignore_index=True)

# Shuffle all rows reproducibly (fixed seed), then renumber the index
shuffled_df = combined_df.sample(frac=1, random_state=1)
shuffled_df = shuffled_df.reset_index(drop=True)

shuffled_df

# Preprocessing

## Raw Data

In [None]:
# Report how many rows the combined, shuffled frame holds before any cleaning
print(f'Imagine the web scraping dataset contains {shuffled_df.shape[0]} web documents.')

In [None]:
# Function to tokenize text into words
def simple_word_tokenize(text):
    """Return the lowercased word tokens of *text*.

    A token is a maximal run of word characters (letters, digits,
    underscore) delimited by word boundaries; punctuation is dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase token strings in order of appearance.
    """
    lowered = text.lower()
    return re.findall(r'\b\w+\b', lowered)

# One string per document: title and content joined by a single space
combined_texts = [
    " ".join((title, content))
    for title, content in zip(shuffled_df["title"], shuffled_df["content"])
]

# Flatten the tokens of all documents into one list
all_words = [
    token
    for document in combined_texts
    for token in simple_word_tokenize(document)
]

# Count how often each token occurs
term_frequencies = Counter(all_words)

# Keep the 15 most frequent terms
top_n = 15
top_terms = term_frequencies.most_common(top_n)

# Unzip the (term, count) pairs into two parallel tuples for plotting
terms, frequencies = zip(*top_terms)

# # Plot the term frequencies as a bar plot
# plt.figure(figsize=(6, 4))
# plt.barh(terms, frequencies, color='skyblue')
# plt.xlabel('Frequency')
# plt.ylabel('Term')
# plt.title(f'Top {top_n} Term Frequencies')
# plt.gca().invert_yaxis()  # Invert y-axis to have the highest frequency on top

# # Display the plot
# plt.show()

## Remove Duplicates

In [None]:
# Drop rows that repeat an earlier row in every column; the surviving rows
# keep their original index labels
shuffled_df = shuffled_df.drop_duplicates()

print(f'After removing duplicates, {shuffled_df.shape[0]} documents are left')

## Lexical Diversity

In [None]:
# Tokenize each document's content with lexical_diversity, then score every
# token list with the library's root type-token ratio
content_tokens = shuffled_df["content"].apply(ld.tokenize)
lex_div = content_tokens.apply(ld.root_ttr)

# # Plotting the Root TTR values as a histogram
# plt.figure(figsize=(4, 2))
# plt.hist(lex_div, color='skyblue')
# plt.xlabel('Root Type Token Ratio')
# plt.ylabel('Website Count')
# plt.title('Root TTR of Website Texts')

# # Display the plot
# plt.show()

In [None]:
# Keep only the web documents whose Root TTR is above 4
is_diverse = lex_div > 4
cleaned_df = shuffled_df[is_diverse]

cleaned_df.head(5)

In [None]:
# Preview the web documents removed by the Root TTR filter (low-value content).
# Uses the exact complement of the `lex_div > 4` keep condition so documents
# scoring exactly 4, which are also dropped, are included here.
shuffled_df[~(lex_div > 4)].head(5)

## Remove Stopwords

In [None]:
# List of German stopwords from NLTK (kept as a list under this name because
# later cells pass it on to WordCloud)
german_stopwords = stopwords.words('german')

# Set copy for constant-time membership tests; checking every token against
# the list would rescan the whole list each time
german_stopword_set = set(german_stopwords)

# Combine content and titles of the cleaned documents
combined_texts = [title + " " + content for title, content in zip(cleaned_df["title"], cleaned_df["content"])]

# Tokenize the combined texts and collect all tokens that are not stopwords
all_words = []
for text in combined_texts:
    all_words.extend(
        word for word in simple_word_tokenize(text)
        if word not in german_stopword_set
    )

# Calculate term frequencies
term_frequencies = Counter(all_words)

# Sort and select the top 15 most common terms
top_n = 15
top_terms = term_frequencies.most_common(top_n)

# Separate the terms and their frequencies for plotting
terms, frequencies = zip(*top_terms)

# # Plot the term frequencies as a bar plot
# plt.figure(figsize=(6, 4))
# plt.barh(terms, frequencies, color='skyblue')
# plt.xlabel('Frequency')
# plt.ylabel('Term')
# plt.title(f'Top {top_n} Term Frequencies (Excluding Stopwords)')
# plt.gca().invert_yaxis()  # Invert y-axis to have the highest frequency on top

# # Display the plot
# plt.show()

# Processing

## Anonymization with Entity Recognition

The synthetic dataset has been prepared to ensure that it contains no personally identifiable information.

## Social Service Classifier

In [None]:
# One classifier input per training document: title and content joined by a
# space, paired with the document's category label
texts = [" ".join(pair) for pair in zip(df_train["title"], df_train["content"])]
tmp = pd.DataFrame({'text': texts, 'cats': df_train['category']})

In [None]:
import json

# Function to transform the DataFrame row
def transform_row(row, positive_label='Obdachlosenhilfe'):
    """Convert one labelled row into a binary text-classification record.

    Args:
        row: Mapping-like object (e.g. a DataFrame row or a dict) with a
            'text' entry and a 'cats' entry holding the category label.
        positive_label: Category treated as the positive class. Defaults to
            'Obdachlosenhilfe', so calling with only ``row`` (as
            ``DataFrame.apply`` does) behaves as before.

    Returns:
        A dict with the row's text under "text" and, under "cats", a mapping
        that scores ``positive_label`` 1.0 and "OTHER" 0.0 when the row's
        label equals ``positive_label``, and the reverse otherwise.
    """
    is_positive = row['cats'] == positive_label
    cats = {
        positive_label: 1.0 if is_positive else 0.0,
        "OTHER": 0.0 if is_positive else 1.0,
    }
    return {
        "text": row['text'],
        "cats": cats
    }

# Apply transform_row to every row of tmp (axis=1 passes one row at a time);
# the result holds one {"text": ..., "cats": ...} record per document
transformed_data = tmp.apply(transform_row, axis=1)

In [None]:
import random

# Convert transformed_data to a list for easy manipulation
transformed_data_list = transformed_data.tolist()

# Deterministic 80/20 split by position: of every 10 consecutive records the
# first two go to evaluation and the remaining eight to training.
# NOTE(review): the labels were assigned in runs of 10 per category, so this
# presumably puts 2 documents of each category into the evaluation set.
train_data = [record for i, record in enumerate(transformed_data_list) if i % 10 >= 2]
test_data = [record for i, record in enumerate(transformed_data_list) if i % 10 < 2]

# Write training data to a JSONL file (one JSON object per line). The
# encoding is stated explicitly so the output never depends on the locale.
with open('../textcat_demo/assets/docs_issues_training.jsonl', 'w', encoding='utf-8') as f:
    for record in train_data:
        f.write(json.dumps(record) + '\n')

# Write test data to a JSONL file
with open('../textcat_demo/assets/docs_issues_eval.jsonl', 'w', encoding='utf-8') as f:
    for record in test_data:
        f.write(json.dumps(record) + '\n')

In [None]:
# ! cd ../textcat_demo/ && weasel run all

In [None]:
# ! cd ../textcat_demo/ && weasel run visualize-model

In [None]:
# Load the spaCy pipeline stored in the demo project's training/model-best
# directory (presumably written by the commented-out `weasel run all` step
# above - run it first if the directory is missing)
nlp = spacy.load("../textcat_demo/training/model-best")

In [None]:
# Run the pipeline on an example sentence about an overnight shelter with
# places for homeless people
doc = nlp("In der Übernachtungsstätte gibt es 30 Plätze für Wohnungslose Menschen.")

# Print the text-categorization predictions stored on the processed document
print(doc.cats)

In [None]:
# Run the pipeline on an example sentence about a care facility welcoming its
# guests (a sentence unrelated to homeless relief)
doc = nlp("Die Pflegeeinrichtung begrüßt ihre Gäste.")

# Print the text-categorization predictions stored on the processed document
print(doc.cats)

## Homeless Relief

In [None]:
# Join the content of every training document labelled 'Obdachlosenhilfe'
is_homeless_relief = df_train["category"] == 'Obdachlosenhilfe'
text = " ".join(df_train[is_homeless_relief]["content"])

# Build an 800x400 word cloud on a white background, skipping German stopwords
cloud_options = {
    "stopwords": german_stopwords,
    "background_color": 'white',
    "width": 800,
    "height": 400,
}
wordcloud = WordCloud(**cloud_options).generate(text)

# # Display the word cloud using Matplotlib
# plt.figure(figsize=(6, 3))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.show()

## Social Services

In [None]:
# Join the content of every training document labelled 'Seniorenhilfe'
is_senior_care = df_train["category"] == 'Seniorenhilfe'
text = " ".join(df_train[is_senior_care]["content"])

# Build an 800x400 word cloud on a white background, skipping German stopwords
cloud_options = {
    "stopwords": german_stopwords,
    "background_color": 'white',
    "width": 800,
    "height": 400,
}
wordcloud = WordCloud(**cloud_options).generate(text)

# # Display the word cloud using Matplotlib
# plt.figure(figsize=(6, 3))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.show()

# Data Applications

## Keyword Search

In [None]:
from whoosh.fields import Schema, TEXT
from whoosh.index import create_in
import os

# Define the schema: both the document id and the text are stored so they can
# be read back from search hits
schema = Schema(id=TEXT(stored=True), text=TEXT(stored=True))

# Make sure the index directory exists. exist_ok=True makes this a no-op when
# the directory is already there and avoids the check-then-create race of a
# separate os.path.exists() test.
os.makedirs("indexdir", exist_ok=True)

# Create the index
index = create_in("indexdir", schema)

In [None]:
from whoosh.writing import AsyncWriter

# Open the index for writing and add every combined text, using its position
# in combined_texts as the document id. The loop variables are deliberately
# not called `id`/`text`, which would shadow the builtin id() and overwrite
# the module-level `text` variable.
with index.writer() as writer:
    for doc_id, doc_text in enumerate(combined_texts):
        writer.add_document(id=str(doc_id), text=doc_text)

In [None]:
from whoosh.qparser import QueryParser
from whoosh import scoring

# Build the parser for the "text" field and parse the query up front; neither
# needs an open searcher
query_parser = QueryParser("text", schema=schema)
query = query_parser.parse("Obdachlose")

# Search the index and print the id and stored text of at most 3 hits
with index.searcher() as searcher:
    results = searcher.search(query, limit=3)
    for result in results:
        print(f"ID: {result['id']}, Text: {result['text']}")

## Creating a Map

In [None]:
# Example services mapped to made-up (x, y) positions
services = {
    "Niederschwelliger Tagesaufenthalt": (2, 5),
    "Übernachtungseinrichtung": (4, 9),
    "Streetwork-Programm": (7, 2),
    "Sozialberatung": (6, 8),
    "Wärmestube": (9, 5)
}

# Draw one marker per service and put its name slightly up and to the right
plt.figure(figsize=(8, 8))
label_offset = 0.1
for service_name, coordinates in services.items():
    x_pos, y_pos = coordinates
    plt.scatter(x_pos, y_pos, s=100)
    plt.text(x_pos + label_offset, y_pos + label_offset, service_name, fontsize=10)

# Fixed 0-10 axes with a grid so all points sit inside the frame
plt.xlim(0, 10)
plt.ylim(0, 10)
plt.grid(True)

# Title and axis labels
plt.title('Imaginary Map of Services')
plt.xlabel('X Coordinate')
plt.ylabel('Y Coordinate')

# Show the plot
plt.show()