In [3]:
import spacy
from spacy.matcher import PhraseMatcher
import pandas as pd

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Sample DataFrames
concise_categories = ['Electronics', 'Clothing', 'Home Goods']
detailed_categories = ['Smartphone', 'Laptop', 'T-Shirt', 'Dresser', 'Microwave']

# Sample DataFrame with detailed categories
detailed_categories_df = pd.DataFrame({
    'Product': ['Smartphone', 'Laptop', 'T-Shirt', 'Dresser', 'Microwave'],
    'Detailed_Category': detailed_categories,
    'Price': [500, 1000, 20, 150, 80]
})

# Process concise categories with spaCy
concise_categories_docs = [nlp(category) for category in concise_categories]

# Define a function to find the best match using spaCy
def find_best_match_spacy(text, categories_docs):
    text_doc = nlp(text)
    similarities = [text_doc.similarity(category_doc) for category_doc in categories_docs]
    best_match_index = similarities.index(max(similarities))
    return concise_categories[best_match_index]

# Map detailed categories to concise categories using spaCy
detailed_categories_df['Concise_Category'] = detailed_categories_df['Detailed_Category'].apply(
    lambda x: find_best_match_spacy(x, concise_categories_docs)
)

# If there are unmatched categories, handle them as needed (e.g., assign a default category)
detailed_categories_df['Concise_Category'].fillna('Other', inplace=True)

# Now, you have a DataFrame with concise categories
print(detailed_categories_df)

      Product Detailed_Category  Price Concise_Category
0  Smartphone        Smartphone    500         Clothing
1      Laptop            Laptop   1000         Clothing
2     T-Shirt           T-Shirt     20       Home Goods
3     Dresser           Dresser    150         Clothing
4   Microwave         Microwave     80         Clothing


  similarities = [text_doc.similarity(category_doc) for category_doc in categories_docs]


In [9]:
import spacy
from fuzzywuzzy import fuzz
import pandas as pd

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Sample DataFrames
concise_categories = ['Electronics', 'Clothing', 'Home Goods']
detailed_categories = ['Smartphone', 'Laptop', 'T-Shirt', 'Dresser', 'Microwave']

# Sample DataFrame with detailed categories
detailed_categories_df = pd.DataFrame({
    'Product': ['Smartphone', 'Laptop', 'T-Shirt', 'Dresser', 'Microwave'],
    'Detailed_Category': detailed_categories,
    'Price': [500, 1000, 20, 150, 80]
})

# Process concise categories with spaCy
concise_categories_docs = [nlp(category) for category in concise_categories]

# Define a function to find the best match using spaCy and fuzzy matching
def find_best_match(text, categories_docs):
    text_doc = nlp(text)
    
    # Calculate similarity using spaCy
    spacy_similarities = [text_doc.similarity(category_doc) for category_doc in categories_docs]
    
    # Calculate similarity using fuzzywuzzy
    fuzzy_similarities = [fuzz.partial_ratio(text, category) for category in concise_categories]
    
    # Combine spaCy and fuzzywuzzy scores
    combined_similarities = [0.4 * spacy_sim + 0.6 * fuzzy_sim for spacy_sim, fuzzy_sim in zip(spacy_similarities, fuzzy_similarities)]
    
    # Find the best match index
    best_match_index = combined_similarities.index(max(combined_similarities))
    
    return concise_categories[best_match_index]

# Map detailed categories to concise categories using spaCy and fuzzy matching
detailed_categories_df['Concise_Category'] = detailed_categories_df['Detailed_Category'].apply(
    lambda x: find_best_match(x, concise_categories_docs)
)

# If there are unmatched categories, handle them as needed (e.g., assign a default category)
detailed_categories_df['Concise_Category'].fillna('Other', inplace=True)

# Now, you have a DataFrame with concise categories
print(detailed_categories_df)


      Product Detailed_Category  Price Concise_Category
0  Smartphone        Smartphone    500      Electronics
1      Laptop            Laptop   1000      Electronics
2     T-Shirt           T-Shirt     20         Clothing
3     Dresser           Dresser    150      Electronics
4   Microwave         Microwave     80      Electronics


  spacy_similarities = [text_doc.similarity(category_doc) for category_doc in categories_docs]


In [12]:

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Sample DataFrames
concise_categories = ['Electronics', 'Clothing', 'Home Goods']
detailed_categories = ['Smartphone', 'Laptop', 'T-Shirt', 'Dresser', 'Microwave']

# Sample DataFrame with detailed categories
detailed_categories_df = pd.DataFrame({
    'Product': ['Smartphone', 'Laptop', 'T-Shirt', 'Dresser', 'Microwave'],
    'Detailed_Category': detailed_categories,
    'Price': [500, 1000, 20, 150, 80]
})

# Process concise categories with spaCy
concise_categories_docs = [nlp(category) for category in concise_categories]

# Define a function to find the best match using spaCy and fuzzy matching
def find_best_match(text, categories_docs):
    text_doc = nlp(text)
    
    # Calculate similarity using spaCy (experiment with different methods)
    # spaCy method 1: text_doc.similarity(category_doc)
    # spaCy method 2: text_doc.similarity(category_doc.vector)
    spacy_similarities = [text_doc.similarity(category_doc) for category_doc in categories_docs]
    
    # Calculate similarity using fuzzywuzzy
    fuzzy_similarities = [fuzz.partial_ratio(text, category) for category in concise_categories]
    
    # Experiment with different weight combinations
    combined_similarities = [0.7 * spacy_sim + 0.3 * fuzzy_sim for spacy_sim, fuzzy_sim in zip(spacy_similarities, fuzzy_similarities)]
    
    # Fine-tune fuzzy matching threshold (experiment with different values)
    threshold = 80
    
    # Find the best match index
    best_match_index = combined_similarities.index(max(combined_similarities))
    
    return concise_categories[best_match_index]

# Map detailed categories to concise categories using spaCy and fuzzy matching
detailed_categories_df['Concise_Category'] = detailed_categories_df['Detailed_Category'].apply(
    lambda x: find_best_match(x, concise_categories_docs)
)

# If there are unmatched categories, handle them as needed (e.g., assign a default category)
detailed_categories_df['Concise_Category'].fillna('Other', inplace=True)

# Now, you have a DataFrame with concise categories
print(detailed_categories_df)

      Product Detailed_Category  Price Concise_Category
0  Smartphone        Smartphone    500      Electronics
1      Laptop            Laptop   1000      Electronics
2     T-Shirt           T-Shirt     20         Clothing
3     Dresser           Dresser    150      Electronics
4   Microwave         Microwave     80      Electronics


  spacy_similarities = [text_doc.similarity(category_doc) for category_doc in categories_docs]


In [13]:
import spacy
from fuzzywuzzy import fuzz
import pandas as pd

# Load spaCy model
nlp = spacy.load("en_core_web_md")  # Use en_core_web_md instead of en_core_web_sm

# Sample DataFrames
concise_categories = ['Electronics', 'Clothing', 'Home Goods']
detailed_categories = ['Smartphone', 'Laptop', 'T-Shirt', 'Dresser', 'Microwave']

# Sample DataFrame with detailed categories
detailed_categories_df = pd.DataFrame({
    'Product': ['Smartphone', 'Laptop', 'T-Shirt', 'Dresser', 'Microwave'],
    'Detailed_Category': detailed_categories,
    'Price': [500, 1000, 20, 150, 80]
})

# Process concise categories with spaCy
concise_categories_docs = [nlp(category) for category in concise_categories]

# Define a function to find the best match using spaCy and fuzzy matching
def find_best_match(text, categories_docs):
    text_doc = nlp(text)
    
    # Calculate similarity using spaCy (experiment with different methods)
    # spaCy method 1: text_doc.similarity(category_doc)
    # spaCy method 2: text_doc.similarity(category_doc.vector)
    spacy_similarities = [text_doc.similarity(category_doc) for category_doc in categories_docs]
    
    # Calculate similarity using fuzzywuzzy
    fuzzy_similarities = [fuzz.partial_ratio(text, category) for category in concise_categories]
    
    # Experiment with different weight combinations
    combined_similarities = [0.7 * spacy_sim + 0.3 * fuzzy_sim for spacy_sim, fuzzy_sim in zip(spacy_similarities, fuzzy_similarities)]
    
    # Fine-tune fuzzy matching threshold (experiment with different values)
    threshold = 80
    
    # Find the best match index
    best_match_index = combined_similarities.index(max(combined_similarities))
    
    return concise_categories[best_match_index]

# Map detailed categories to concise categories using spaCy and fuzzy matching
detailed_categories_df['Concise_Category'] = detailed_categories_df['Detailed_Category'].apply(
    lambda x: find_best_match(x, concise_categories_docs)
)

# If there are unmatched categories, handle them as needed (e.g., assign a default category)
detailed_categories_df['Concise_Category'].fillna('Other', inplace=True)

# Now, you have a DataFrame with concise categories
print(detailed_categories_df)

      Product Detailed_Category  Price Concise_Category
0  Smartphone        Smartphone    500      Electronics
1      Laptop            Laptop   1000      Electronics
2     T-Shirt           T-Shirt     20         Clothing
3     Dresser           Dresser    150       Home Goods
4   Microwave         Microwave     80      Electronics
