In [2]:
import pandas as pd
import re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# --- Use the exact file path provided ---
file_path = '/Users/adityasharma/Github Projects/Amazon/input/train.csv'

try:
    df = pd.read_csv(file_path)
    print(f"Successfully loaded {file_path}")
    print(f"Dataset shape: {df.shape}")
except FileNotFoundError:
    print(f"ERROR: File not found at the specified path: {file_path}")
    print("Halting analysis. Please verify the path is correct.")
    exit()

# Ensure the content column is of string type, filling NaNs to prevent errors
df['catalog_content'] = df['catalog_content'].astype(str).fillna('')

# --- Step 1: High-Level Structural Characterization ---
print("\n--- 1. Structural Characterization ---")
df['line_count'] = df['catalog_content'].str.count('\n') + 1
df['char_count'] = df['catalog_content'].str.len()

print("\nDistribution of Line Counts per Entry:")
print(df['line_count'].describe())

print("\nDistribution of Character Counts per Entry:")
print(df['char_count'].describe())

# --- Step 2: Key-Value Pair Analysis ---
print("\n--- 2. Key-Value Pair Analysis ---")

# Regex to find potential keys (alphanumeric sequences before a colon)
key_pattern = re.compile(r'([a-zA-Z0-9\s/]+):')

def find_keys(text):
    # Find all potential keys, clean them up, and convert to lowercase
    return [key.strip().lower() for key in key_pattern.findall(text)]

# Apply the function and create a flat list of all keys found
all_keys = df['catalog_content'].apply(find_keys).sum()

# Count the frequency of each unique key
key_counts = Counter(all_keys)

print("\nTop 20 Most Common Keys Found:")
for key, count in key_counts.most_common(20):
    print(f"- '{key}': {count} occurrences")

# --- Step 3: Coverage Analysis ---
print("\n--- 3. Coverage Analysis ---")
total_entries = len(df)
if total_entries > 0:
    top_20_keys = {k for k, v in key_counts.most_common(20)}
    
    def has_top_key(text):
        # A more robust check for any of the top keys
        return any(re.search(fr'\b{re.escape(k)}:', text, re.IGNORECASE) for k in top_20_keys)

    df['has_top_key'] = df['catalog_content'].apply(has_top_key)
    coverage = df['has_top_key'].sum() / total_entries
    print(f"\nCoverage: {coverage:.2%} of entries contain at least one of the top 20 keys.")
else:
    print("Dataset is empty. Cannot perform coverage analysis.")

Successfully loaded /Users/adityasharma/Github Projects/Amazon/input/train.csv
Dataset shape: (75000, 4)

--- 1. Structural Characterization ---

Distribution of Line Counts per Entry:
count    75000.000000
mean         8.002773
std          2.340287
min          4.000000
25%          6.000000
50%          9.000000
75%         10.000000
max         31.000000
Name: line_count, dtype: float64

Distribution of Character Counts per Entry:
count    75000.000000
mean       908.886547
std        852.896151
min         32.000000
25%        251.000000
50%        643.000000
75%       1280.000000
max       7894.000000
Name: char_count, dtype: float64

--- 2. Key-Value Pair Analysis ---

Top 20 Most Common Keys Found:
- 'item name': 74994 occurrences
- '0
unit': 55005 occurrences
- 'value': 36582 occurrences
- 'bullet point 2': 25017 occurrences
- 'bullet point 3': 23910 occurrences
- 'bullet point 1': 20022 occurrences
- 'bullet point 4': 19788 occurrences
- 'bullet point 5': 17994 occurrences
- 

In [4]:
import pandas as pd
import re

# Assume 'df' is the DataFrame loaded in the previous steps.
# file_path = '/Users/adityasharma/Github Projects/Amazon/input/train.csv'
# df = pd.read_csv(file_path)
# df['catalog_content'] = df['catalog_content'].astype(str).fillna('')

def robust_extract_field(text, key):
    """
    A robust function to extract a value, even if it spans multiple lines.
    """
    pattern = re.compile(fr'^{re.escape(key)}:\s*(.*?)(?=\n[a-zA-Z0-9\s/]+:|\Z)', re.IGNORECASE | re.MULTILINE | re.DOTALL)
    match = pattern.search(text)
    if match:
        return " ".join(match.group(1).strip().split())
    return None

def aggregate_bullet_points(text):
    """
    Finds all lines starting with 'bullet point', extracts the text, and joins them.
    """
    pattern = re.compile(r'^(?:bullet point|bullet point \d+):\s*(.*)', re.IGNORECASE | re.MULTILINE)
    matches = pattern.findall(text)
    if matches:
        return ' '.join([match.strip() for match in matches])
    return None

# --- Re-run the extraction with the ROBUST function ---
df['title'] = df['catalog_content'].apply(lambda x: robust_extract_field(x, 'item name'))
df['aggregated_bullets'] = df['catalog_content'].apply(aggregate_bullet_points)


# --- Re-run the analysis to verify the new function's success rate ---
title_nulls = df['title'].isnull().sum()
bullets_nulls = df['aggregated_bullets'].isnull().sum()
total_rows = len(df)

print("--- Analysis of ROBUST 'title' column extraction ---")
print(f"- Extracted successfully in {total_rows - title_nulls} / {total_rows} rows ({(total_rows - title_nulls) / total_rows:.2%})")

print(f"\n--- Analysis of 'aggregated_bullets' column ---")
print(f"- Extracted successfully in {total_rows - bullets_nulls} / {total_rows} rows ({(total_rows - bullets_nulls) / total_rows:.2%})")

# --- CORRECTED VERIFICATION STEP ---
# Search for a more reliable key that we know exists from our analysis.
complex_entries_df = df[df['catalog_content'].str.contains("bullet point 6", na=False)]

# First, check if the search returned any results before trying to access an index.
if not complex_entries_df.empty:
    complex_entry = complex_entries_df.iloc[0]
    print("\n--- Example: Verification of a complex entry ---")
    # Display enough text to see the structure.
    print("Original Content Snippet:\n", complex_entry['catalog_content'][0:700])
    print("\nExtracted Title:\n", complex_entry['title'])
    print("\nExtracted Bullets:\n", complex_entry['aggregated_bullets'])
else:
    print("\n--- Example: No entry with 'bullet point 6' found for verification. ---")

--- Analysis of ROBUST 'title' column extraction ---
- Extracted successfully in 74993 / 75000 rows (99.99%)

--- Analysis of 'aggregated_bullets' column ---
- Extracted successfully in 60723 / 75000 rows (80.96%)

--- Example: No entry with 'bullet point 6' found for verification. ---


In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# We assume 'df' is the DataFrame with the new 'title' and 'price' columns.
# To ensure we have the 'price' column, we need to reload the original train.csv
# and merge our new features onto it.

# Load original data to get the 'price' target variable
original_train_df = pd.read_csv('/Users/adityasharma/Github Projects/Amazon/input/train.csv')

# Use the 'df' from the previous step which already has 'title' and 'aggregated_bullets'
# Make sure the 'price' column is merged correctly. The dataframes should have the same index.
df['price'] = original_train_df['price']

# --- SMAPE Metric Definition ---
def smape(y_true, y_pred):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (SMAPE).
    """
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / denominator) * 100

# --- Data Preparation ---
# We will use a proper validation split to measure performance.
# Handle the few (<0.01%) titles that were not extracted by filling with an empty string.
df['title'] = df['title'].fillna('')
train_df = df.dropna(subset=['price']) # Ensure we only use rows with a price for training

X = train_df['title']
y = train_df['price']

# Create a validation set from the training data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training on {len(X_train)} samples, validating on {len(X_val)} samples.")

# --- Model Training ---
# Create the exact same pipeline as the original baseline.
pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english', max_features=20000),
    Ridge(alpha=1.0)
)

print("\nTraining the TF-IDF + Ridge pipeline on the 'title' column...")
pipeline.fit(X_train, y_train)
print("Training complete.")

# --- Evaluation ---
print("\nMaking predictions on the validation set...")
y_pred = pipeline.predict(X_val)

# Prevent division by zero or negative price predictions
y_pred[y_pred < 0] = 0

validation_smape = smape(y_val, y_pred)

print("\n--- Performance Evaluation ---")
print(f"Original Baseline SMAPE (on catalog_content): 56.35")
print(f"New Baseline SMAPE (on extracted 'title'): {validation_smape:.2f}")

Training on 60000 samples, validating on 15000 samples.

Training the TF-IDF + Ridge pipeline on the 'title' column...
Training complete.

Making predictions on the validation set...

--- Performance Evaluation ---
Original Baseline SMAPE (on catalog_content): 56.35
New Baseline SMAPE (on extracted 'title'): 69.92


In [7]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# We continue with the 'df' from the previous step, which has 'title', 
# 'aggregated_bullets', and 'price' columns.

# --- SMAPE Metric Definition ---
def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / denominator) * 100

# --- Data Preparation ---
# Fill any remaining NaNs in our text features
df['title'] = df['title'].fillna('')
df['aggregated_bullets'] = df['aggregated_bullets'].fillna('')
train_df = df.dropna(subset=['price'])

# Here, X is a DataFrame with two columns
X = train_df[['title', 'aggregated_bullets']]
y = train_df['price']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training on {len(X_train)} samples, validating on {len(X_val)} samples.")

# --- Multi-Input Model Pipeline ---
# We define a preprocessor that applies a separate TF-IDF to each column.
preprocessor = ColumnTransformer(
    transformers=[
        ('title_tfidf', TfidfVectorizer(stop_words='english', max_features=10000), 'title'),
        ('bullets_tfidf', TfidfVectorizer(stop_words='english', max_features=15000), 'aggregated_bullets')
    ],
    remainder='drop' # Drop any other columns
)

# Chain the preprocessor and the regressor in a final pipeline.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))
])

print("\nTraining a multi-input pipeline on 'title' and 'aggregated_bullets'...")
pipeline.fit(X_train, y_train)
print("Training complete.")

# --- Evaluation ---
print("\nMaking predictions on the validation set...")
y_pred = pipeline.predict(X_val)
y_pred[y_pred < 0] = 0

validation_smape = smape(y_val, y_pred)

print("\n--- Performance Evaluation ---")
print(f"Original Baseline SMAPE (on catalog_content): 56.35")
print(f"Second Baseline SMAPE (on title only): 69.92")
print(f"New Model SMAPE (on title + bullets): {validation_smape:.2f}")

Training on 60000 samples, validating on 15000 samples.

Training a multi-input pipeline on 'title' and 'aggregated_bullets'...
Training complete.

Making predictions on the validation set...

--- Performance Evaluation ---
Original Baseline SMAPE (on catalog_content): 56.35
Second Baseline SMAPE (on title only): 69.92
New Model SMAPE (on title + bullets): 73.79


In [8]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# --- Assume 'df' is loaded and has the 'price' column ---
# We'll use the robust extraction function from before
def robust_extract_field(text, key):
    pattern = re.compile(fr'^{re.escape(key)}:\s*(.*?)(?=\n[a-zA-Z0-9\s/]+:|\Z)', re.IGNORECASE | re.MULTILINE | re.DOTALL)
    match = pattern.search(text)
    if match:
        return " ".join(match.group(1).strip().split())
    return None

# --- 1. Create the structured features ---
df['title'] = df['catalog_content'].apply(lambda x: robust_extract_field(x, 'item name'))
df['manufacturer'] = df['catalog_content'].apply(lambda x: robust_extract_field(x, 'manufacturer'))
df['size'] = df['catalog_content'].apply(lambda x: robust_extract_field(x, 'size'))

# --- 2. Clean and process the new features ---
# Fill NaNs
df['title'] = df['title'].fillna('')
df['manufacturer'] = df['manufacturer'].fillna('unknown')

# A simple function to parse numbers from the 'size' column
def parse_size(size_str):
    if not isinstance(size_str, str):
        return np.nan
    # Find the first number (integer or float) in the string
    match = re.search(r'(\d+\.?\d*)', size_str)
    if match:
        return float(match.group(1))
    return np.nan

df['size_numeric'] = df['size'].apply(parse_size).fillna(0) # Fill missing sizes with 0

# --- 3. Prepare data for the pipeline ---
train_df = df.dropna(subset=['price'])
X = train_df[['title', 'manufacturer', 'size_numeric']]
y = train_df['price']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 4. Build the Hybrid Pipeline ---
# This preprocessor handles each feature type correctly and separately
preprocessor = ColumnTransformer(
    transformers=[
        ('title_tfidf', TfidfVectorizer(stop_words='english', max_features=5000), 'title'),
        ('manufacturer_ohe', OneHotEncoder(handle_unknown='ignore'), ['manufacturer']),
        ('size_scaler', StandardScaler(), ['size_numeric'])
    ],
    remainder='passthrough'
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))
])

# --- 5. Train and Evaluate ---
print("Training the Hybrid Feature Pipeline...")
pipeline.fit(X_train, y_train)

print("\nEvaluating...")
y_pred = pipeline.predict(X_val)
y_pred[y_pred < 0] = 0

def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / denominator) * 100

validation_smape = smape(y_val, y_pred)

print("\n--- Final Performance Evaluation ---")
print(f"Original Baseline SMAPE (raw text): 56.35")
print(f"Hybrid Model SMAPE (structured features): {validation_smape:.2f}")


Training the Hybrid Feature Pipeline...

Evaluating...

--- Final Performance Evaluation ---
Original Baseline SMAPE (raw text): 56.35
Hybrid Model SMAPE (structured features): 70.22


In [9]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# --- Assume 'df' is the fully processed DataFrame from the previous step ---
# It should have 'price', 'title', 'manufacturer', and 'size_numeric'

# --- 1. Prepare data, ensuring no NaNs in target ---
train_df = df.dropna(subset=['price'])
X = train_df[['title', 'manufacturer', 'size_numeric']]
y = train_df['price']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 2. Build the Hybrid Preprocessor (same as before) ---
preprocessor = ColumnTransformer(
    transformers=[
        ('title_tfidf', TfidfVectorizer(max_features=10000), 'title'),
        ('manufacturer_ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['manufacturer']),
        # Note: Tree models don't require feature scaling, so we can drop StandardScaler
    ],
    remainder='passthrough' # Keep size_numeric
)

# --- 3. Create the LightGBM Pipeline ---
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor(random_state=42))
])

# --- 4. Train the model on the LOG-TRANSFORMED target ---
print("Training the LightGBM pipeline on the log-transformed price...")
pipeline.fit(X_train, np.log1p(y_train))
print("Training complete.")

# --- 5. Evaluate the model ---
print("\nEvaluating...")
# Predict on the log scale
log_preds = pipeline.predict(X_val)
# Convert predictions BACK to the original price scale
y_pred = np.expm1(log_preds)
y_pred[y_pred < 0] = 0 # Ensure no negative prices

def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / denominator) * 100

validation_smape = smape(y_val, y_pred)

print("\n--- Final Performance Evaluation ---")
print(f"Original Baseline SMAPE (raw text, linear model): 56.35")
print(f"Final Model SMAPE (structured features, non-linear model): {validation_smape:.2f}")



Training the LightGBM pipeline on the log-transformed price...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.268675 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 148796
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 3203
[LightGBM] [Info] Start training from score 2.740904
Training complete.

Evaluating...

--- Final Performance Evaluation ---
Original Baseline SMAPE (raw text, linear model): 56.35
Final Model SMAPE (structured features, non-linear model): 57.92




In [10]:

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# --- Load original data ---
df = pd.read_csv('/Users/adityasharma/Github Projects/Amazon/input/train.csv')
df = df.dropna(subset=['price'])
df['catalog_content'] = df['catalog_content'].astype(str).fillna('')

# --- SMAPE Definition ---
def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / denominator) * 100

# --- Data Preparation ---
X = df['catalog_content']
y = df['price']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- The Strong Baseline Pipeline ---
# Combine the best features (raw text) with the best model (LightGBM)
pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english', max_features=30000, ngram_range=(1, 2)),
    lgb.LGBMRegressor(random_state=42, n_estimators=200, learning_rate=0.1)
)

# --- Train on the LOG-TRANSFORMED target ---
print("Training the strong baseline (LGBM on raw text)...")
pipeline.fit(X_train, np.log1p(y_train))
print("Training complete.")

# --- Evaluation ---
print("\nEvaluating...")
log_preds = pipeline.predict(X_val)
y_pred = np.expm1(log_preds)
y_pred[y_pred < 0] = 0

validation_smape = smape(y_val, y_pred)

print("\n--- Final Performance Evaluation ---")
print(f"Original Baseline SMAPE (Ridge on raw text): 56.35")
print(f"Strong Baseline SMAPE (LGBM on raw text): {validation_smape:.2f}")

Training the strong baseline (LGBM on raw text)...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 4.428752 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1203571
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 28338
[LightGBM] [Info] Start training from score 2.740904
Training complete.

Evaluating...

--- Final Performance Evaluation ---
Original Baseline SMAPE (Ridge on raw text): 56.35
Strong Baseline SMAPE (LGBM on raw text): 55.03


