# Set Up Notebook

In [None]:
##########
# IMPORT #
##########

# System and file handling
import os
import struct

# Data processing and math
import numpy as np
import pandas as pd
import re
from collections import Counter

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Language processing
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.util import ngrams


#################
# CONFIGURATION #
#################

# Set path
# NOTE: base_path is an absolute, machine-specific location; point it at the
# folder that contains the TLK file before running the notebook.
base_path = r'/Users/amb/Desktop'
filename = 'dialog.tlk'

# Full path of the TLK file that is handed to read_file()
corpus_path = os.path.join(base_path, filename)

# Configure global display settings for pandas
# (print up to 100 characters per column before truncating)
pd.set_option('display.max_colwidth', 100)

# Set plotting style and colors
# (graph_color is the default color argument of the plotting functions)
sns.set_style("whitegrid")
graph_color = '#2c7fb8'

# Enable inline plotting
%matplotlib inline

# Parse File

In [None]:
def read_file(file_path):
    """
    Parses TLK file and returns DataFrame.

    Layout read by this function (all integers little-endian):
      * an 18-byte header: 4-byte file type, 4-byte version, 2-byte language
        id, 4-byte string count, 4-byte offset of the string data;
      * directly after the header, one 26-byte entry per string: 2-byte flags,
        8-byte sound reference, 4-byte volume, 4-byte pitch, 4-byte text
        offset (relative to the string data offset), 4-byte text length.

    Parameters
    ----------
    file_path : str
        Path of the TLK file to read.

    Returns
    -------
    pandas.DataFrame
        One row per entry with a non-zero text length, with the columns
        'Line' (position of the entry in the table) and 'Text' (the entry's
        bytes decoded as latin-1). Both columns exist even when no entry
        carries text.

    Raises
    ------
    ValueError
        If the file ends before the header or the entry table is complete.
    """
    header_struct = struct.Struct('<4s4sHII')
    entry_struct = struct.Struct('<H8sIIII')
    final_records = []

    with open(file_path, 'rb') as f:

        # Read header
        header_data = f.read(header_struct.size)
        if len(header_data) < header_struct.size:
            raise ValueError(
                f"'{file_path}' is too short for a TLK header "
                f"({len(header_data)} of {header_struct.size} bytes)."
            )

        # Unpack type, version, lang, count, offset
        file_type, version, _lang_id, string_count, string_offset = header_struct.unpack(header_data)

        print(f"File Type: {file_type.decode(errors = 'ignore')}")
        print(f"Version: {version.decode(errors = 'ignore')}")
        print(f"Strings: {string_count}")

        # Read all entries (the table starts right after the header, which is
        # where the file position already is)
        table_size = string_count * entry_struct.size
        entries_data = f.read(table_size)
        if len(entries_data) < table_size:
            raise ValueError(
                f"'{file_path}' ends inside the entry table "
                f"({len(entries_data)} of {table_size} bytes)."
            )

        # Read strings
        for index, entry in enumerate(entry_struct.iter_unpack(entries_data)):
            # Only the last two fields (text offset and length) are needed
            text_relative_offset, text_length = entry[4], entry[5]
            if text_length == 0:
                continue

            f.seek(string_offset + text_relative_offset)
            text_bytes = f.read(text_length)

            # latin-1 maps every byte value to a character, so decoding
            # cannot fail and needs no error handler
            final_records.append({
                'Line': index,
                'Text': text_bytes.decode('latin-1')
            })

    # Fix the columns explicitly so an empty result still has 'Line' and 'Text'
    return pd.DataFrame(final_records, columns = ['Line', 'Text'])

# Execute
# (parse the TLK file only when it is present at the configured path)
if not os.path.exists(corpus_path):
    print(f"File not found in {corpus_path}.")
else:
    df = read_file(corpus_path)
    print(f"\nLoaded {len(df)} lines.")

# Clean Corpus

In [None]:
def clean_corpus(text):
    """
    Cleans a single line of text.

    Removes every caret followed by one letter, digit, or hyphen, collapses
    each run of whitespace into one space, and trims both ends. Anything that
    is not a string yields an empty string.
    """
    if isinstance(text, str):
        # Drop the two-character caret codes first, then normalize spacing
        without_codes = re.sub(r'\^[A-Za-z0-9\-]', '', text)
        return re.sub(r'\s+', ' ', without_codes).strip()

    return ""

# Execute
# (add a 'Cleaned_Text' column holding the cleaned version of every line)
df['Cleaned_Text'] = df['Text'].apply(clean_corpus)

# Filter empty lines
# (rows whose cleaned text is empty are dropped and the row index is
# renumbered from 0; the 'Line' column keeps its original values)
df = df[df['Cleaned_Text'].str.len() > 0].reset_index(drop = True)

# Export file
# (UTF-8 CSV without the row index; the relative name resolves against the
# current working directory)
df.to_csv('game_studies_pst_corpus.csv', index = False, encoding = 'utf-8')

print("Corpus saved to 'game_studies_pst_corpus.csv'.")

# Search for Keywords

In [None]:
def run_search(df, pattern, column = 'Cleaned_Text', export_filename = "game_studies_pst_search_results.csv"):
    """
    Scans the DataFrame for keywords and returns hits.

    The pattern is a keyword, not a regular expression: it is matched
    case-insensitively anywhere in the text, '*' stands for any run of
    characters and '?' for exactly one character. Every other character is
    taken literally.

    Parameters
    ----------
    df : pandas.DataFrame
        Corpus to search.
    pattern : str
        Keyword, optionally containing the wildcards '*' and '?'.
    column : str
        Name of the text column to scan.
    export_filename : str or None
        CSV file the hits are written to ('.csv' is appended when missing).
        Pass None or an empty string to skip the export.

    Returns
    -------
    pandas.DataFrame
        The rows of df whose text matches the pattern.
    """
    # Escape first so characters such as '.', '(' or '+' are matched literally,
    # then turn the two (now escaped) wildcards back into their regex forms
    regex_pattern = re.escape(pattern).replace(r'\*', '.*').replace(r'\?', '.')

    # na = False treats missing cells as non-matching instead of producing a
    # mask that cannot be used for boolean indexing
    hit_mask = df[column].str.contains(regex_pattern, case = False, regex = True, na = False)
    matches = df[hit_mask]

    print(f"{len(matches)} matches found.")

    if export_filename:
        if not export_filename.endswith('.csv'):
            export_filename += '.csv'

        matches.to_csv(export_filename, index = False, encoding = 'utf-8')
        print(f"Results exported to '{export_filename}'.")

    return matches

# Execute
# (case-insensitive search for "nameless"; the hits are also written to the
# function's default CSV file)
results = run_search(df, "nameless")

# Preview the first rows of the hits
print(results.head())

# Calculate Metrics

In [None]:
def calculate_metrics(df):
    """
    Computes and prints summary statistics for the cleaned corpus.

    Tokenizes df['Cleaned_Text'] and stores the result in a new 'Tokens'
    column; the per-line token count goes into a new 'Line_Length_Tokens'
    column (df is modified in place). All word-level statistics use only the
    lower-cased alphanumeric tokens, whereas the average line length counts
    every token of a line.

    Returns a tuple (all_tokens, freq_dist, word_lengths): the flat list of
    lower-cased alphanumeric tokens, their FreqDist, and the length of each
    token.

    NOTE(review): word_tokenize and stopwords.words presumably need the
    matching NLTK data packages to be installed; no download step is visible
    in this notebook.
    """
    # Tokenization
    df['Tokens'] = df['Cleaned_Text'].apply(word_tokenize)

    # Flatten list of tokens, keeping alphanumeric ones in lower case
    all_tokens = []
    for line_tokens in df['Tokens']:
        for token in line_tokens:
            if token.isalnum():
                all_tokens.append(token.lower())

    # Calculate lines, characters, and tokens
    total_lines = len(df)
    total_chars = df['Cleaned_Text'].str.len().sum()
    total_tokens = len(all_tokens)
    has_tokens = total_tokens > 0

    # Calculate unique types
    freq_dist = FreqDist(all_tokens)
    unique_types = len(freq_dist)
    has_types = unique_types > 0

    # Calculate type-token ratio
    ttr = unique_types / total_tokens if has_tokens else 0

    # Calculate hapax legomena (types that occur exactly once)
    hapax_count = sum(1 for occurrences in freq_dist.values() if occurrences == 1)
    percentage_hapax = (hapax_count / unique_types) * 100 if has_types else 0

    # Calculate average line length
    df['Line_Length_Tokens'] = df['Tokens'].apply(len)
    avg_line_length = df['Line_Length_Tokens'].mean()

    # Calculate average word length
    word_lengths = list(map(len, all_tokens))
    avg_word_length = np.mean(word_lengths) if word_lengths else 0

    # Calculate lexical density (share of tokens that are not stop words)
    stop_words = set(stopwords.words('english'))
    content_word_count = sum(1 for token in all_tokens if token not in stop_words)
    lexical_density = (content_word_count / total_tokens) * 100 if has_tokens else 0

    # Print results
    report = (
        f"Total Lines: {total_lines}",
        f"Total Characters: {total_chars}",
        f"Total Tokens: {total_tokens}",
        f"Unique Types: {unique_types}",
        f"Type-Token Ratio: {ttr:.4f}",
        f"Hapax Legomena: {hapax_count}",
        f"Percentage of Hapax Legomena: {percentage_hapax:.2f}%",
        f"Average Line Length: {avg_line_length:.2f}",
        f"Average Word Length: {avg_word_length:.2f}",
        f"Lexical Density: {lexical_density:.2f}%",
    )
    for report_line in report:
        print(report_line)

    return all_tokens, freq_dist, word_lengths

# Execute
# (the returned token list and word lengths are reused by the cells below)
all_tokens, freq_dist, word_lengths = calculate_metrics(df)

# Calculate Top Bigrams

In [None]:
# Build bigrams over the flat token list and count them
# NOTE(review): all_tokens concatenates the tokens of every line, so a pair
# made of the last token of one line and the first token of the next is
# counted as well — confirm this is intended.
bigrams = list(ngrams(all_tokens, 2))
bigram_freq = Counter(bigrams)

# Print the 25 most frequent bigrams with their counts
for bg, count in bigram_freq.most_common(25):
    print(f"{bg}: {count}")

# Calculate Word Length Distribution

In [None]:
# Count how many tokens have each length and order the pairs by length
word_length_counts = Counter(word_lengths)
sorted_word_lengths = sorted(word_length_counts.items())

# Print the 12 shortest lengths that occur, with their token counts
for length, count in sorted_word_lengths[:12]:
    print(f"{length}: {count}")

# Number of distinct word lengths in the corpus
print(f"\nUnique Lengths: {len(sorted_word_lengths)}")

# Plot Word Length Distribution

In [None]:
def plot_word_length_distribution(word_lengths, color = graph_color, export_filename = "game_studies_pst_word_length_distribution.png"):
    """
    Draws a bar chart of how often each word length occurs.

    Only lengths from 1 to 12 characters are plotted. When export_filename is
    set, the chart is also saved as a PNG ('.png' is appended when missing).
    Prints a message and returns without plotting if there is nothing to show.
    """
    # Nothing to plot without input
    if not word_lengths:
        print("No data found.")
        return

    # Tally the lengths, keep 1 to 12, and order them by length
    length_counts = sorted(
        (length, count)
        for length, count in Counter(word_lengths).items()
        if 1 <= length <= 12
    )

    if not length_counts:
        print("No data found.")
        return

    x_vals = [length for length, _ in length_counts]
    y_vals = [count for _, count in length_counts]

    # Plot figure
    plt.figure(figsize = (12, 6))
    sns.barplot(x = x_vals, y = y_vals, color = color)
    plt.xlabel("Characters per Word")
    plt.ylabel("Frequency")

    # Export graph
    if export_filename:
        target = export_filename if export_filename.endswith('.png') else export_filename + '.png'
        plt.savefig(target, dpi = 300, bbox_inches = 'tight')
        print(f"Graph saved as '{target}'.")

    plt.show()

# Execute
# (uses the default color and saves the chart under the default file name)
plot_word_length_distribution(word_lengths)

# Plot Barcode for Selected Keywords

In [None]:
def plot_barcode_keywords(df, keywords, column = 'Cleaned_Text', color = graph_color, export_filename = "game_studies_pst_barcode_keywords.png"):
    """
    Creates a barcode of selected keywords.

    Each keyword gets one horizontal band; a vertical line is drawn at the
    position of every row of df whose text contains the keyword
    (case-insensitive). Keywords without wildcards are matched as whole
    words. In keywords that contain them, '*' stands for any run of
    characters and '?' for exactly one character; every other character is
    taken literally.

    Parameters
    ----------
    df : pandas.DataFrame
        Corpus to scan.
    keywords : list of str
        Keywords to plot, one band each (first keyword at the bottom).
    column : str
        Name of the text column to scan.
    color : str
        Color of the vertical lines.
    export_filename : str or None
        PNG file the chart is saved to ('.png' is appended when missing).
        Pass None or an empty string to skip the export.
    """
    # Check data (no keywords would mean a figure of height zero)
    if len(keywords) == 0:
        print("No data found.")
        return

    # Plot figure
    plt.figure(figsize = (15, len(keywords) * 0.8))

    for i, keyword in enumerate(keywords):
        escaped = re.escape(keyword)
        if '*' in keyword or '?' in keyword:
            # Turn the escaped wildcards back into their regex forms
            regex_pattern = escaped.replace(r'\*', '.*').replace(r'\?', '.')
        else:
            regex_pattern = r'\b' + escaped + r'\b'

        # na = False treats missing cells as non-matching
        hit_mask = df[column].str.contains(regex_pattern, case = False, regex = True, na = False)

        # Use row positions, not index labels: the x axis runs from 0 to
        # len(df), so this stays correct for a df that was filtered without
        # resetting its index
        positions = np.flatnonzero(hit_mask.to_numpy(dtype = bool))

        plt.vlines(positions, i, i + 1, colors = [color], linewidth = 0.8)
        plt.text(-len(df) * 0.02, i + 0.5, keyword, va = 'center', fontweight = 'bold', ha = 'right')

    plt.xlabel("Line in Corpus")
    plt.yticks([])
    plt.xlim(0, len(df))
    plt.ylim(0, len(keywords))

    # Export graph
    if export_filename:
        if not export_filename.endswith('.png'):
            export_filename += '.png'
        plt.savefig(export_filename, dpi = 300, bbox_inches = 'tight')
        print(f"Graph saved as '{export_filename}'.")

    plt.show()

# Set keywords
# (plain words are matched as whole words, case-insensitively; '*' and '?'
# inside a keyword act as wildcards)
keywords = ['death', 'life', 'nameless', 'morte']

# Execute
# (uses the default column, color, and export file name)
plot_barcode_keywords(df, keywords)