In [4]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb

In [5]:
file_path = '/Users/adityasharma/Github Projects/Amazon/input/train.csv'
train_df = pd.read_csv(file_path)
train_df = train_df.dropna(subset=['price'])
train_df['catalog_content'] = train_df['catalog_content'].astype(str).fillna('')
print("Data loaded successfully.")


Data loaded successfully.


In [6]:
print("\n--- Step 2: Discovering Keywords from Data ---")

# --- Accessory Keyword Discovery (Correlation Method) ---
def extract_title(text):
    match = re.search(r'^item name:\s*(.*)', text, re.IGNORECASE | re.MULTILINE)
    return match.group(1).strip() if match else ''
train_df['title'] = train_df['catalog_content'].apply(extract_title).fillna('')

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), max_features=5000)
tfidf_matrix = vectorizer.fit_transform(train_df['title'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out(), index=train_df.index)
tfidf_df['price'] = train_df['price']

print("Calculating correlations for accessory keywords...")
correlations = tfidf_df.corr()['price'].drop('price')
# Programmatically select the top 20 terms most negatively correlated with price
DATA_DRIVEN_ACCESSORY_KEYWORDS = correlations.sort_values(ascending=True).head(20).index.tolist()
print(f"Discovered Accessory Keywords: {DATA_DRIVEN_ACCESSORY_KEYWORDS}")

# --- Quantity Pattern Discovery ---
pattern = re.compile(r'(\d+)\s+([a-zA-Z]{2,})')
all_units = [match[1] for text in train_df['catalog_content'] for match in pattern.findall(text.lower())]
unit_counts = Counter(all_units)
# Programmatically select the top 15 most common unit terms
DATA_DRIVEN_QUANTITY_UNITS = [unit for unit, count in unit_counts.most_common(15)]
print(f"Discovered Quantity Units: {DATA_DRIVEN_QUANTITY_UNITS}")


--- Step 2: Discovering Keywords from Data ---
Calculating correlations for accessory keywords...
Discovered Accessory Keywords: ['oz', 'fl oz', 'fl', 'ounce', 'seasoning', 'mix', '16 oz', 'campbell', 'soup', 'goya', 'sauce', '12 oz', 'bottle', 'betty crocker', 'crocker', 'chicken', 'betty', 'ounce pack', '16', 'amazon']
Discovered Quantity Units: ['unit', 'oz', 'ounce', 'pack', 'minutes', 'fl', 'count', 'calories', 'tea', 'cups', 'ounces', 'lb', 'grams', 'years', 'bullet']
