# Multi-Word Expression (MWE) Extraction
## Extract candidate MWEs from movie subtitles

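This notebook loads subtitle lines for one language, extracts candidate multi-word expressions (MWEs) from them, summarizes the candidates by frequency, type, and length, and saves the frequency-filtered results to a CSV file.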

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import json
from pathlib import Path
from tqdm import tqdm

from python.mwe_extraction.extractor import MWEExtractor
from python.utils.subtitle_parser import load_subtitles_from_directory
from python.config import *

## 1. Load Subtitle Data

In [None]:
# Pick the language whose subtitles will be processed
LANGUAGE = 'spanish'  # switch to 'hindi' or another key of LANGUAGES
config = LANGUAGES[LANGUAGE]

# Load subtitles from the directory configured for this language
subtitles = load_subtitles_from_directory(config['dir'])

# Flatten the per-entry text lists into one list of subtitle lines
all_texts = [line for entry_texts in subtitles.values() for line in entry_texts]

print(f"Total subtitle lines: {len(all_texts)}")
print("\nSample texts:")
for sample in all_texts[:5]:
    print(f"  - {sample}")

## 2. Initialize MWE Extractor

In [None]:
# Build the extractor from the selected language's settings.
# The spaCy model may have to be downloaded beforehand, e.g.:
#   python -m spacy download es_core_news_sm  (for Spanish)
#   python -m spacy download xx_ent_wiki_sm   (for multilingual/Hindi)

language_code = config['code']
spacy_model_name = config['spacy_model']
extractor = MWEExtractor(language=language_code, spacy_model=spacy_model_name)

## 3. Extract Candidate MWEs

In [None]:
# Extract candidate MWEs from every subtitle line, constrained by the
# MIN_MWE_LENGTH / MAX_MWE_LENGTH / MIN_FREQUENCY limits
print("Extracting candidate MWEs...")
extraction_limits = {
    'min_length': MIN_MWE_LENGTH,
    'max_length': MAX_MWE_LENGTH,
    'min_freq': MIN_FREQUENCY,
}
mwes = extractor.extract_candidate_mwes(texts=all_texts, **extraction_limits)

print(f"\nTotal candidate MWEs extracted: {len(mwes)}")

## 4. Analyze Extracted MWEs

In [None]:
# One record per MWE: the expression itself plus its info fields
records = [{'mwe': phrase, **details} for phrase, details in mwes.items()]
mwes_df = pd.DataFrame(records)

# Most frequent expressions first
mwes_df = mwes_df.sort_values(by='frequency', ascending=False)

# Show the 20 most frequent MWEs (last expression, so the notebook renders it)
print("\nTop 20 MWEs by frequency:")
mwes_df.head(n=20)

In [None]:
# How many MWEs fall under each type
print("\nMWE distribution by type:")
type_counts = mwes_df['type'].value_counts()
print(type_counts)

# How many MWEs there are of each length, ordered by length
print("\nMWE distribution by length:")
length_counts = mwes_df['length'].value_counts().sort_index()
print(length_counts)

## 5. Filter and Save Results

In [None]:
# Filter MWEs (remove very common stop-word combinations, etc.)
# Custom filtering logic can be added here.

# Minimum frequency an MWE needs in order to be kept in the saved results.
# This is a separate threshold from the MIN_FREQUENCY passed to extraction.
FILTER_MIN_FREQUENCY = 3
filtered_mwes = mwes_df[mwes_df['frequency'] >= FILTER_MIN_FREQUENCY]

print(f"MWEs after filtering: {len(filtered_mwes)}")

# Save to file. DataFrame.to_csv does not create missing directories, so
# make sure the output directory exists before writing.
# NOTE(review): assumes PROCESSED_DATA_DIR is a pathlib.Path (it is combined
# with "/" here) -- confirm against the project config.
output_file = PROCESSED_DATA_DIR / f"{LANGUAGE}_mwes.csv"
output_file.parent.mkdir(parents=True, exist_ok=True)
filtered_mwes.to_csv(output_file, index=False)
print(f"\nSaved to: {output_file}")

## 6. Sample MWEs by Category

In [None]:
# Print up to 10 rows for each MWE type, in the frame's current row order
type_column = filtered_mwes['type']
for category in type_column.unique():
    print(f"\n{category.upper()} samples:")
    top_rows = filtered_mwes.loc[type_column == category].head(10)
    for phrase, freq in zip(top_rows['mwe'], top_rows['frequency']):
        print(f"  {phrase} (freq: {freq})")