In [None]:
# -------------------------------
# Quick EDA Notebook for Hackathon
# -------------------------------

# Step 1: Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Step 2: Load the dataset
# "train.csv" is a relative path, so it is resolved against the current
# working directory of the notebook/script.
train = pd.read_csv("train.csv")

# Step 3: Basic info
# Print the (rows, columns) shape followed by the column names as a plain list.
print("Shape of data:", train.shape)
print("\nColumns:", list(train.columns))

# Step 4: Describe price column
# Skipped entirely when the dataset has no 'price' column.
if 'price' in train:
    print("\n--- Price Statistics ---")
    price_summary = train['price'].describe()
    display(price_summary)

# Step 5: Check word counts in catalog_content (if exists)
if 'catalog_content' in train:
    # Missing entries are replaced by "" first, so they count as zero words
    # ("".split() is an empty list). Words are whitespace-separated chunks.
    catalog_text = train['catalog_content'].fillna("")
    train['word_count'] = catalog_text.map(lambda entry: len(entry.split()))
    print("\n--- Word Count Statistics ---")
    display(train['word_count'].describe())

    # Word count distribution
    plt.figure(figsize=(8, 5))
    word_hist_ax = sns.histplot(train['word_count'], bins=50)
    word_hist_ax.set_title("Catalog Content Word Count Distribution")
    word_hist_ax.set_xlabel("Word Count")
    word_hist_ax.set_ylabel("Frequency")
    plt.show()

# Step 6: Check percentage of missing image_link
if 'image_link' in train:
    # The mean of a boolean mask is the fraction of True (i.e. missing) rows.
    link_is_missing = train['image_link'].isnull()
    missing_image_pct = link_is_missing.mean() * 100
    print(f"\n% Missing image_link: {missing_image_pct:.2f}%")

# Step 7: Plot price histogram (log scale)
# The bins are computed in log space via `log_scale=True`. Building 50
# linear-width bins and only afterwards switching the axis to log (as
# plt.xscale('log') does) draws bars of wildly unequal visual width and
# distorts the apparent shape of the distribution.
if 'price' in train.columns:
    # A log axis cannot represent zero or negative values, so only strictly
    # positive prices are plotted. NaN prices fail the `> 0` comparison and
    # are therefore excluded as well.
    positive_prices = train.loc[train['price'] > 0, 'price']
    non_positive_count = int(train['price'].notna().sum()) - len(positive_prices)
    if non_positive_count:
        print(f"\nExcluded {non_positive_count} non-positive price(s) from the log-scale histogram.")

    plt.figure(figsize=(8,5))
    sns.histplot(positive_prices, bins=50, log_scale=True)
    plt.title("Price Distribution (Log Scale)")
    plt.xlabel("Price (log)")
    plt.ylabel("Frequency")
    plt.show()

# Step 8: Correlation between word count and price
# Reads the 'word_count' column, so the word-count step must already have run.
# Both axes use log1p (log(1 + x)) so that zero values remain plottable.
if {'catalog_content', 'price'}.issubset(train.columns):
    plt.figure(figsize=(7, 5))
    log_word_count = np.log1p(train['word_count'])
    log_price = np.log1p(train['price'])
    scatter_ax = sns.scatterplot(x=log_word_count, y=log_price)
    scatter_ax.set_title("Word Count vs Price (log-log)")
    scatter_ax.set_xlabel("log(Word Count + 1)")
    scatter_ax.set_ylabel("log(Price + 1)")
    plt.show()

# Step 9: Inspect some catalog_content examples
if 'catalog_content' in train:
    print("\n--- Sample catalog_content ---")
    # Preview the first 400 characters of the first three non-missing entries.
    sample_entries = train['catalog_content'].dropna().iloc[:3]
    for entry in sample_entries:
        preview = entry[:400]
        print("\n", preview, "...")

# Step 10: Build regex for 'ipq' (or any token of interest)
if 'catalog_content' in train:
    # Only the first ten non-missing entries are scanned.
    sample_texts = train['catalog_content'].dropna().iloc[:10].tolist()
    # Case-insensitive: the word "ipq", then at most one hyphen or whitespace
    # character, then one or more digits (e.g. "IPQ-12", "ipq 3", "ipq7").
    pattern = re.compile(r"\bipq[-\s]?\d+\b", flags=re.IGNORECASE)
    print("\n--- Example matches for 'ipq' pattern ---")
    # filter(None, ...) drops the empty match lists, so only texts with at
    # least one hit produce a line of output.
    for matches in filter(None, map(pattern.findall, sample_texts)):
        print(matches)

# Step 11: Look at top brands/tokens if present
if 'brand' in train:
    print("\n--- Top Brands ---")
    # value_counts() orders by descending frequency, so the first ten rows
    # are the ten most common brands.
    brand_counts = train['brand'].value_counts()
    display(brand_counts.iloc[:10])

# Or try tokenizing catalog_content
if 'catalog_content' in train:
    from collections import Counter

    # A token is a run of three or more ASCII letters, extracted from the
    # lowercased text of every non-missing entry.
    tokens = [
        token
        for entry in train['catalog_content'].dropna()
        for token in re.findall(r'\b[a-zA-Z]{3,}\b', entry.lower())
    ]
    token_counts = Counter(tokens)
    print("\n--- Top Tokens ---")
    top_tokens = token_counts.most_common(10)
    print(pd.DataFrame(top_tokens, columns=['Token', 'Count']))

# Step 12: Identify extreme outliers (>99.9 percentile)
if 'price' in train:
    upper_limit = train['price'].quantile(0.999)
    print(f"\n99.9th percentile price: {upper_limit:.2f}")
    # Boolean mask of rows strictly above the cap; computed once and reused
    # for both the outlier count and the clipping below.
    above_limit = train['price'] > upper_limit
    outliers = train[above_limit]
    print(f"Number of outliers: {len(outliers)}")

    # Optional: Clip outliers
    # Only the upper tail is capped; every other value is copied through as is.
    train['price_clipped'] = np.where(above_limit, upper_limit, train['price'])
    print("Created 'price_clipped' column with capped values.")
