# Data Exploration
## English Idioms and Subtitle Analysis

This notebook explores the English idiom corpus and the Spanish and Hindi subtitle data.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from python.data_processing.idiom_loader import IdiomLoader
from python.utils.subtitle_parser import load_subtitles_from_directory
from python.config import *

# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline
# Use seaborn's 'whitegrid' style for the plots drawn below.
sns.set_style('whitegrid')

## 1. Load English Idioms

In [None]:
# Fetch the English idiom corpus via IdiomLoader, pointing it at ENGLISH_IDIOMS_DIR,
# and report how many entries were returned.
idioms = IdiomLoader.load_idiom_corpus(ENGLISH_IDIOMS_DIR)
idiom_total = len(idioms)
print(f"Total idioms loaded: {idiom_total}")

# Wrap the loaded corpus in a DataFrame for analysis; the cell's last expression
# previews its first five rows.
idioms_df = pd.DataFrame(data=idioms)
idioms_df.head(n=5)

## 2. Load Subtitle Data

In [None]:
# Spanish subtitles: pass SPANISH_SUBTITLES to the directory loader and report
# the size of what it returns.
spanish_subs = load_subtitles_from_directory(SPANISH_SUBTITLES)
spanish_file_count = len(spanish_subs)
print(f"\nSpanish subtitle files: {spanish_file_count}")

# Hindi subtitles: same loader, pointed at HINDI_SUBTITLES.
hindi_subs = load_subtitles_from_directory(HINDI_SUBTITLES)
hindi_file_count = len(hindi_subs)
print(f"Hindi subtitle files: {hindi_file_count}")

## 3. Data Statistics

In [None]:
# Analyze idiom lengths
# Word count per idiom: split the 'text' column on whitespace and count the pieces.
idioms_df['word_count'] = idioms_df['text'].str.split().str.len()

# Rows with missing 'text' produce NaN counts; drop them for plotting so they
# cannot break the histogram's range detection.
word_counts = idioms_df['word_count'].dropna()

fig, ax = plt.subplots(figsize=(10, 5))
if not word_counts.empty:
    # Word counts are integers, so use unit-width bins centred on each integer.
    # A fixed bins=20 slices the range into fractional-width bins, which leaves
    # spurious gaps and uneven bars for discrete data.
    lowest, highest = int(word_counts.min()), int(word_counts.max())
    bin_edges = [count - 0.5 for count in range(lowest, highest + 2)]
    ax.hist(word_counts, bins=bin_edges, edgecolor='black')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Idiom Lengths')
plt.show()

# Summary statistics (count, mean, std, quartiles, ...) for the word counts.
print("\nIdiom length statistics:")
print(idioms_df['word_count'].describe())

## 4. Sample Data

In [None]:
# Display random sample of idioms
# A fixed random_state makes the displayed sample identical on every
# Restart & Run All. Capping n at the frame length avoids the ValueError that
# sample() raises when asked for more rows than exist (without replacement).
print("Random sample of English idioms:")
idioms_df.sample(n=min(10, len(idioms_df)), random_state=42)