# Preprocessing

In [None]:
import os
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stopwords
import re
import json
import spacy

# Multi-language stop-word lists keyed by language code, read from a local
# JSON file; only the 'en' entry is used below.
with open('stopwords-iso.json', 'r', encoding='utf-8') as file:
    stopwords_iso = json.load(file)

# Extra words filtered out in addition to the generic English lists.
custom_stopwords = {'server', 'joined', 'scroll', 'scrolls', 'papyrus', 'image'}

# Final stop-word set: NLTK English + custom words + stopwords-iso English.
stopwords = set(nltk_stopwords.words('english'))
stopwords |= custom_stopwords
stopwords |= set(stopwords_iso['en'])

# Lazily created SpaCy pipeline shared by every preprocess_text() call.
_NLP = None


def _get_nlp():
    """Return the shared SpaCy pipeline, loading it on first use only."""
    global _NLP
    if _NLP is None:
        _NLP = spacy.load("en_core_web_sm")
    return _NLP


def preprocess_text(text):
    """Normalize one message into a space-separated string of kept tokens.

    Steps, in order: lowercase, lemmatize with SpaCy, then strip
    call-like snippets (an identifier followed by a parenthesized group),
    URLs, file names with common extensions, digits, punctuation and
    emoji/symbol ranges, and finally drop every token found in the
    module-level ``stopwords`` set.

    Args:
        text: Raw message text.

    Returns:
        The cleaned text; an empty string when nothing survives.
    """
    # Convert text to lowercase
    text = text.lower()

    # Loading the model is expensive, so reuse one pipeline across calls
    # instead of calling spacy.load() for every message.
    doc = _get_nlp()(text)

    # Lemmatize and lowercase each token
    text = ' '.join([token.lemma_.lower() for token in doc])

    # Remove call-like snippets such as "name(...)"
    text = re.sub(r'\b[a-zA-Z_][a-zA-Z0-9_]*\s*\([^)]*\)\s*', '', text)

    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)

    # Remove file names with common extensions
    text = re.sub(r'\b\w+\.(zip|tif|pdf|jpg|png|docx|xlsx|rar|txt|csv|json)\b', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation (anything that is neither a word character nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove special characters and emojis that are still present
    text = re.sub(r'[\U0001F600-\U0001F64F'
              r'\U0001F300-\U0001F5FF'
              r'\U0001F680-\U0001F6FF'
              r'\U0001F700-\U0001F77F'
              r'\U0001F780-\U0001F7FF'
              r'\U0001F800-\U0001F8FF'
              r'\U0001F900-\U0001F9FF'
              r'\U0001FA00-\U0001FA6F'
              r'\U0001FA70-\U0001FAFF'
              r'\u2600-\u26FF'
              r'\u2700-\u27BF'
              ']+', '', text)

    # Remove stopwords
    tokens = [token for token in text.split() if token not in stopwords]

    return ' '.join(tokens)

def preprocess_json(input_file, output_file):
    """Clean the 'content' of every message in one JSON export.

    Reads ``input_file``, replaces each non-empty message 'content' with
    the result of ``preprocess_text``, and writes the whole document to
    ``output_file`` with 4-space indentation. Messages whose content is
    missing or empty are left untouched.
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        payload = json.load(src)

    for entry in payload.get('messages', []):
        raw = entry.get('content')
        if not raw:
            # Nothing to clean for this message.
            continue
        entry['content'] = preprocess_text(raw)

    with open(output_file, 'w', encoding='utf-8') as dst:
        json.dump(payload, dst, indent=4)

    print(f'Preprocessed data has been saved to {output_file}')

 
def preprocess_all_files(input_folder, output_folder):
    """Run ``preprocess_json`` on every JSON file in ``input_folder``.

    Each result is written to ``output_folder`` under the same file name.
    The output folder is created when it does not exist yet. Entries that
    are not regular files, or whose name does not end in ``.json``
    (for example notebook checkpoint folders or OS metadata files), are
    skipped instead of being handed to the JSON loader.

    Args:
        input_folder: Directory containing the JSON exports to clean.
        output_folder: Directory that receives the cleaned copies.
    """
    # Writing into a missing directory would fail on the first file.
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so files are processed in a stable, reproducible order.
    for filename in sorted(os.listdir(input_folder)):
        input_file = os.path.join(input_folder, filename)
        if not os.path.isfile(input_file):
            continue
        if not filename.lower().endswith('.json'):
            continue
        output_file = os.path.join(output_folder, filename)
        preprocess_json(input_file, output_file)


# Source folder with the filtered exports and destination for cleaned copies.
input_folder, output_folder = 'filteredJSON', 'preprocessedJSON'

# Clean every export found in the source folder.
preprocess_all_files(input_folder=input_folder, output_folder=output_folder)

# Organizing Messages

In [None]:
import pandas as pd
input_folder = 'preprocessedJSON'

# Exports read when get_all_documents() is called without ``files``;
# pass ``files=`` to look at a different selection.
_DEFAULT_CHANNEL_FILES = frozenset({
    'Vesuvius Challenge - Text Channels - papyrology [1108134343295127592]_filtered.json',
    'Vesuvius Challenge - Text Channels - general [1079907750265499772]_filtered.json',
})


def get_all_documents(folder, files=None):
    """Collect the non-empty messages of selected JSON exports.

    Args:
        folder: Directory containing the JSON export files.
        files: Optional iterable of file names to read. When omitted, the
            papyrology and general channel exports are used.

    Returns:
        A list of dicts with the keys ``channel``, ``user``, ``timestamp``
        and ``content``, one per message whose content is not blank.
        Missing channel names, authors and timestamps are reported as
        ``"Unknown"``.
    """
    wanted = _DEFAULT_CHANNEL_FILES if files is None else set(files)
    all_docs = []

    # Sorted so the resulting rows come out in a reproducible order.
    for fname in sorted(os.listdir(folder)):
        if fname not in wanted:
            continue

        with open(os.path.join(folder, fname), 'r', encoding='utf-8') as f:
            data = json.load(f)

        # "channel" may be a plain name or an object carrying a "name" field.
        channel = data.get("channel", "Unknown")
        if isinstance(channel, dict):
            channel_name = channel.get("name", "Unknown")
        else:
            channel_name = channel

        for msg in data.get("messages", []):
            # A JSON null would otherwise break .strip() / .get() below.
            content = msg.get("content") or ""
            if not content.strip():  # Only include non-empty messages
                continue
            author = msg.get("author") or {}
            all_docs.append({
                "channel": channel_name,
                "user": author.get("name", "Unknown"),
                "timestamp": msg.get("timestamp", "Unknown"),
                "content": content,
            })

    return all_docs


# Pin the column set so the frame still has a 'timestamp' (and 'channel',
# 'content') column when no messages were collected; otherwise the lookups
# below would raise KeyError on an empty result.
df = pd.DataFrame(
    get_all_documents(input_folder),
    columns=['channel', 'user', 'timestamp', 'content'],
)

# Unparseable timestamps (including the "Unknown" placeholder) become NaT.
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
# NOTE(review): if the exports mix several UTC offsets, to_datetime may not
# return a datetime column and `.dt` would fail — confirm against the data.
# Remove timezone only if present
if df['timestamp'].dt.tz is not None:
    df['timestamp'] = df['timestamp'].dt.tz_localize(None)

# Term Frequency

In [None]:
# For general, papyrology, and both together
import pandas as pd
from collections import Counter
from nltk.util import ngrams
import spacy
# Load the English model, downloading it only when it is not installed yet
# (spacy.load raises OSError for a missing model). This avoids re-running
# the download on every execution of the cell.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    """Return the alphabetic tokens of ``text`` after lowercasing it.

    The text is run through the module-level SpaCy pipeline ``nlp``;
    tokens that are not purely alphabetic are discarded.
    """
    words = []
    for tok in nlp(text.lower()):
        if tok.is_alpha:
            words.append(tok.text)
    return words

# Frequency counter
def get_top_ngrams(texts, n=1, top_k=20):
    """Return the ``top_k`` most frequent n-grams across ``texts``.

    N-grams are built per message, so no n-gram ever joins the last word
    of one message with the first word of the next.

    Args:
        texts: Iterable of message strings.
        n: Size of the n-gram; ``1`` counts single tokens.
        top_k: Number of entries to return.

    Returns:
        A list of ``(item, count)`` pairs ordered by descending count.
        For ``n == 1`` each item is a token string; otherwise it is a
        tuple of ``n`` tokens.
    """
    counts = Counter()
    for text in texts:
        tokens = tokenize(text)
        # Unigrams stay plain strings; larger n-grams are token tuples.
        counts.update(tokens if n == 1 else ngrams(tokens, n))
    return counts.most_common(top_k)

# Message texts per channel (case-insensitive match on the channel name),
# plus both channels together.
general_msgs = df.loc[df['channel'].str.lower() == 'general', 'content'].dropna().tolist()
papyrology_msgs = df.loc[df['channel'].str.lower() == 'papyrology', 'content'].dropna().tolist()
combined_msgs = general_msgs + papyrology_msgs

# Most frequent single words: overall, then per channel.
unigrams_combined = get_top_ngrams(combined_msgs, n=1)
unigrams_general = get_top_ngrams(general_msgs, n=1)
unigrams_papyrology = get_top_ngrams(papyrology_msgs, n=1)

# Most frequent word pairs: overall, then per channel.
bigrams_combined = get_top_ngrams(combined_msgs, n=2)
bigrams_general = get_top_ngrams(general_msgs, n=2)
bigrams_papyrology = get_top_ngrams(papyrology_msgs, n=2)

# Convert to DataFrames
def ngram_df(ngrams_list, label='ngram'):
    """Turn ``(item, count)`` pairs into a two-column DataFrame.

    The first column is named ``label`` and the second ``'count'``.
    """
    column_names = [label, 'count']
    return pd.DataFrame(data=ngrams_list, columns=column_names)

# Write the messages and all n-gram tables to one workbook, one sheet each.
with pd.ExcelWriter('channel_analysis.xlsx') as writer:
    # Raw message table
    df.to_excel(writer, sheet_name='Messages', index=False)

    # Bigrams over both channels together
    ngram_df(bigrams_combined, 'bigram').to_excel(
        writer, sheet_name='Bigrams Combined', index=False
    )

    # Per-channel bigrams, stacked with a 'channel' column identifying the source
    bigrams_sep = pd.concat([
        ngram_df(rows, 'bigram').assign(channel=name)
        for name, rows in (('general', bigrams_general), ('papyrology', bigrams_papyrology))
    ])
    bigrams_sep.to_excel(writer, sheet_name='Bigrams Separate', index=False)

    # Unigrams over both channels together
    ngram_df(unigrams_combined, 'unigram').to_excel(
        writer, sheet_name='Unigrams Combined', index=False
    )

    # Per-channel unigrams, stacked with a 'channel' column identifying the source
    unigrams_sep = pd.concat([
        ngram_df(rows, 'unigram').assign(channel=name)
        for name, rows in (('general', unigrams_general), ('papyrology', unigrams_papyrology))
    ])
    unigrams_sep.to_excel(writer, sheet_name='Unigrams Separate', index=False)