In [286]:
import pandas as pd
import numpy as np

In [287]:
# Load the sales extract; index_col=0 makes the CSV's first column the row index.
df = pd.read_csv("sales_data.csv",index_col=0)

In [288]:
# Drop the final character of every brand value (presumably a stray trailing
# character in the raw CSV -- TODO confirm against sales_data.csv).
# The vectorised .str slice leaves missing values untouched instead of raising
# like the per-row lambda did.
# NOTE: not idempotent -- re-running this cell without reloading `df` removes
# one more character each time.
df["brand"] = df["brand"].str[:-1]

In [289]:
df.head()

Unnamed: 0_level_0,sku,product_line,brand,sales,price
SKU_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Samsung Stereo Headset Wird EO-EG920BWEG,Phones Mobile Accessories,SAMSUNG,1,1359.0
2,HITACHI REF SBS RM-700AGPND4X-(DIA),Refrigerators,HITACHI,1,184408.0
3,SIEMENS DISHWASHER SN26L801IN,Dishwashers,SIEMENS,1,34425.0
4,Super General S/AC 1.5T SGSI185-3BE 3S,Air Conditioners,Super General,1,29750.0
5,Apple 12W USB Power Adapter,Tablets & Detachables,APPLE,1,1700.0


In [290]:
# DataFrame.info() prints its report and returns None, so it is kept as its own
# statement; the shape is left as the cell's last expression so only the
# (rows, columns) tuple is displayed instead of (None, (rows, columns)).
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2015 entries, 1 to 2015
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sku           2015 non-null   object 
 1   product_line  2015 non-null   object 
 2   brand         2015 non-null   object 
 3   sales         2015 non-null   int64  
 4   price         2015 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 94.5+ KB


(None, (2015, 5))

In [291]:
# Convert the three text columns from object to pandas' "string" dtype.
text_columns = ["sku", "product_line", "brand"]
df = df.astype(dict.fromkeys(text_columns, "string"))

In [292]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2015 entries, 1 to 2015
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sku           2015 non-null   string 
 1   product_line  2015 non-null   string 
 2   brand         2015 non-null   string 
 3   sales         2015 non-null   int64  
 4   price         2015 non-null   float64
dtypes: float64(1), int64(1), string(3)
memory usage: 94.5 KB


In [293]:
df["product_line"].unique()

<StringArray>
[    'Phones Mobile Accessories',                 'Refrigerators',
                   'Dishwashers',              'Air Conditioners',
         'Tablets & Detachables',               'Gaming Software',
              'Mobile Computing',            'Kitchen Appliances',
                   'MP3 Players',                 'Input Devices',
          'Earphones/Headphones',               'Home Appliances',
              'Washing Machines',               'Water Purifiers',
                    'Microwaves',            'Travel Accessories',
                  'Phones Fixed',       'Smart Phones (OS Based)',
                 'Computer Bags',                  'Shop in Shop',
        'Health & Personal Care',                  'Home Theatre',
                 'Phones Mobile',         'Lifestyle & Wearables',
               'Digital Cameras',                      'Security',
                        'TV LCD',                   'Peripherals',
                 'Storage Media',   'Printers & 

In [294]:
import re
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer


# Shared lemmatizer instance; also used later when tokenising search text.
lemma = WordNetLemmatizer()

# Clean-up rules for product-line labels, listed in the order they are meant
# to run:
#   1. "&" and "/" separators become a space
#   2. runs of spaces collapse to a single space
#   3. a space followed by parenthesised text is removed
#   4. leading / trailing whitespace is trimmed
replacement = {
    re.compile(r"&|/"): " ",
    re.compile(r"( )+"): " ",
    re.compile(r" \(.*\)"): "",
    re.compile(r"^\s+|\s+$"): "",
}


def _lemmatize_label(label):
    """Lemmatize each space-separated word of ``label`` and re-join with spaces."""
    return " ".join(lemma.lemmatize(word) for word in label.split(" "))


# lower-case -> regex clean-up -> per-word lemmatisation
product_line_lower = df["product_line"].str.lower()
product_line_normalised = product_line_lower.replace(regex=replacement)
df["product_line_clean"] = product_line_normalised.apply(_lemmatize_label)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/slowgamer/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [295]:
# Distinct cleaned product-line labels, displayed to eyeball the normalisation result.
categories = df["product_line_clean"].unique()
categories

array(['phone mobile accessory', 'refrigerator', 'dishwasher',
       'air conditioner', 'tablet detachables', 'gaming software',
       'mobile computing', 'kitchen appliance', 'mp3 player',
       'input device', 'earphone headphone', 'home appliance',
       'washing machine', 'water purifier', 'microwave',
       'travel accessory', 'phone fixed', 'smart phone', 'computer bag',
       'shop in shop', 'health personal care', 'home theatre',
       'phone mobile', 'lifestyle wearable', 'digital camera', 'security',
       'tv lcd', 'peripheral', 'storage medium',
       'printer office solution', 'gaming hardware', 'imaging accessory',
       'speaker', 'pc accessory', 'phone mobile provider', 'cookware',
       'fan', 'entertainment accessory', 'desktop', 'dvd vcd content',
       'toy', 'networking', 'dummy laptop netbook tab',
       'cooling heating appliance', 'oven cooker', 'power',
       'air purifier', 'connected home housewares', 'audio system',
       'software', 'camcorde

In [296]:
# Lower-cased brand column, so brands can be compared with lower-cased search text.
brand_lower = df["brand"].str.lower()
df["brand_lower"] = brand_lower

# Distinct lower-cased brand names, shown for inspection.
brand_names = brand_lower.unique()
brand_names

<StringArray>
[      'samsung',       'hitachi',       'siemens', 'super general',
         'apple',     'stuffcool',       'ubisoft',          'asus',
       'airplus',         'bajaj',
 ...
       'digisol',      'hindware',       'livpure',         'ricoh',
      'reliance',         'intel',        'ifrogz',          'drav',
        'amazon',       'mitashi']
Length: 151, dtype: string

In [315]:
from Levenshtein import ratio,distance

def max_win_score(cats, txt_ls):
    """Score each category against sliding word windows of the search text.

    For a category made of ``n`` space-separated words, every slice
    ``txt_ls[i:i + n]`` (one per token position ``i``) is joined with spaces
    and compared with the category using the Levenshtein ``ratio``; both sides
    are lower-cased first. Windows that start near the end of ``txt_ls`` hold
    fewer than ``n`` tokens.

    Parameters
    ----------
    cats : iterable of str
        Category labels to score.
    txt_ls : list of str
        Tokens of the search text.

    Returns
    -------
    dict
        Maps each category to the highest ratio over all windows; the value
        stays ``0`` when ``txt_ls`` is empty.

    This function only produces scores; applying a cut-off to them is left to
    the caller.
    """
    best_by_cat = dict.fromkeys(cats, 0)
    starts = range(len(txt_ls))

    for cat in cats:
        width = len(cat.split(" "))
        target = cat.lower()
        window_scores = [
            ratio(target, " ".join(txt_ls[start:start + width]).lower())
            for start in starts
        ]
        # Seed with the current value so the result is 0 when there are no windows.
        best_by_cat[cat] = max([best_by_cat[cat], *window_scores])

    return best_by_cat

def average_score(cats, txt_ls):
    """Score each category by the mean best per-word similarity to the text.

    Every word of a category is compared with every token of ``txt_ls`` using
    the Levenshtein ``ratio``; the best ratio is kept per category word and the
    category's score is the mean of those best ratios. Per-word results are
    held in a dict keyed by the word, so a word repeated inside one category
    contributes once to the mean. No lower-casing happens here.

    Parameters
    ----------
    cats : iterable of str
        Category labels to score.
    txt_ls : list of str
        Tokens of the search text.

    Returns
    -------
    dict
        Maps each category to its mean score (as returned by ``np.mean``).

    This function only produces scores; applying a cut-off to them is left to
    the caller.
    """
    mean_by_cat = dict.fromkeys(cats, 0)

    for cat in cats:
        best_per_word = {}
        for cat_word in cat.split(" "):
            # Seed with 0 so a word with no tokens to compare against scores 0.
            best_per_word[cat_word] = max(
                [0, *(ratio(cat_word, token) for token in txt_ls)]
            )
        mean_by_cat[cat] = np.mean(list(best_per_word.values()))

    return mean_by_cat
            

def exact_match(df, cat, txt, method="average_score", threshold=0.5):
    """Narrow ``df`` to the rows whose ``cat`` value is named in ``txt``.

    The search text is lower-cased and split on single spaces. If any value of
    ``df[cat]`` equals one of those tokens, the matching rows are returned.
    Otherwise every distinct value of ``df[cat]`` is scored against the tokens
    and the rows of the best-scoring value are returned when that score reaches
    ``threshold``; below the threshold the whole frame is returned.

    Tokens are lower-cased but ``df[cat]`` is compared as-is, so the column is
    expected to be lower-case already. A value containing a space can never
    equal a single token and is only reachable through the fuzzy fallback.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    cat : str
        Name of the column to match against.
    txt : str
        Free-text search string.
    method : str, default "average_score"
        ``"average_score"`` selects ``average_score``; any other value selects
        ``max_win_score``.
    threshold : float, default 0.5
        Minimum fuzzy score required to narrow the frame to a single value.

    Returns
    -------
    pandas.DataFrame
        Always a copy, so callers may modify the result without touching ``df``.
    """
    txt_ls = txt.lower().split(" ")
    ind = df[cat].isin(txt_ls)

    if ind.any():
        # Copy here too, so every return path hands back an independent frame.
        return df[ind].copy()

    candidates = df[cat].unique()
    if len(candidates) == 0:
        # Nothing to score (e.g. empty frame); max() over no scores would raise.
        return df.copy()

    if method == "average_score":
        tp = average_score(candidates, txt_ls)
    else:
        tp = max_win_score(candidates, txt_ls)

    best_value, best_score = max(tp.items(), key=lambda item: item[1])
    if best_score >= threshold:
        return df.loc[df[cat] == best_value].copy()
    return df.copy()

def partial_match(df, cat, txt, top_ele=3, method="average_score", lemmatize=True):
    """Keep the rows of ``df`` whose ``cat`` value is among the best fuzzy matches.

    The search text is lower-cased and split on single spaces; with
    ``lemmatize=True`` each token is additionally passed through
    ``lemma.lemmatize``. Every distinct value of ``df[cat]`` is then scored
    against the tokens, and rows belonging to the ``top_ele`` highest-scoring
    values are returned. No minimum score is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    cat : str
        Name of the column to match against.
    txt : str
        Free-text search string.
    top_ele : int, default 3
        Number of top-scoring values to keep.
    method : str, default "average_score"
        ``"average_score"`` selects ``average_score``; any other value selects
        ``max_win_score``.
    lemmatize : bool, default True
        Whether to lemmatize the search tokens before scoring.

    Returns
    -------
    pandas.DataFrame
        Copy of the rows whose ``cat`` value is one of the top-scoring values.
    """
    tokens = txt.lower().split(" ")
    if lemmatize:
        tokens = [lemma.lemmatize(token) for token in tokens]

    candidates = df[cat].unique()
    if method == "average_score":
        scores = average_score(candidates, tokens)
    else:
        scores = max_win_score(candidates, tokens)

    # Stable descending sort: equal scores keep their original relative order.
    ranked = sorted(scores, key=scores.get, reverse=True)
    keep = df[cat].isin(ranked[:top_ele])
    return df[keep].copy()

def filter_sku(df):
    """Placeholder for SKU-level filtering of ``df``; not implemented yet.

    The definition was left as a bare, unfinished header, which is a syntax
    error and prevents the whole cell from running. Until the intended logic
    is written, calling this raises ``NotImplementedError``.
    """
    raise NotImplementedError("filter_sku is not implemented yet")

In [316]:
# Demo: the brand is misspelt ("Samsng"), so no token equals a brand exactly and
# the brand step relies on fuzzy scoring; product lines are then ranked with the
# sliding-window scorer.
query = "Samsng Galaxy J7 Mobile Phone"
res = exact_match(df, "brand_lower", query)
partial_match(res, "product_line_clean", query, method="max_win_score")

Unnamed: 0_level_0,sku,product_line,brand,sales,price,product_line_clean,brand_lower
SKU_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Samsung Stereo Headset Wird EO-EG920BWEG,Phones Mobile Accessories,SAMSUNG,1,1359.000000,phone mobile accessory,samsung
20,Samsung Wash/Dry 8/6K-WD80J6410,Washing Machines,SAMSUNG,6,54485.000000,washing machine,samsung
27,SAMSUNG W/M TL 6KG WA60M4300HD/TL SLV,Washing Machines,SAMSUNG,15,14990.000000,washing machine,samsung
35,Samsung Galaxy J200G Black,Smart Phones (OS Based),SAMSUNG,15,7297.000000,smart phone,samsung
64,Samsung Galaxy J200G Gold,Smart Phones (OS Based),SAMSUNG,30,7335.033333,smart phone,samsung
...,...,...,...,...,...,...,...
1918,Samsung S8+ Black,Smart Phones (OS Based),SAMSUNG,9,64900.000000,smart phone,samsung
1921,Samsung Galaxy J700F Black,Smart Phones (OS Based),SAMSUNG,4,9949.250000,smart phone,samsung
1941,SAMSUNG WM TL 11KG WA11J5750SP,Washing Machines,SAMSUNG,1,34332.000000,washing machine,samsung
1983,Samsung A520F Black 2017,Smart Phones (OS Based),SAMSUNG,6,26900.000000,smart phone,samsung


In [320]:
# Demo: misspelt brand ("Itachi") plus an abbreviated product line ("Ref"),
# both resolved with the default average_score method.
query = "Itachi Ref"
res = exact_match(df, "brand_lower", query)
partial_match(res, "product_line_clean", query)

Unnamed: 0_level_0,sku,product_line,brand,sales,price,product_line_clean,brand_lower
SKU_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,HITACHI REF SBS RM-700AGPND4X-(DIA),Refrigerators,HITACHI,1,184408.0,refrigerator,hitachi
42,E-HITACHI S/AC 2T ZUN 3300 FRAU324IVD 3S,Air Conditioners,HITACHI,1,50571.0,air conditioner,hitachi
57,Hitachi W/AC 1T KAZE+ RAW511KUD 5Str,Air Conditioners,HITACHI,5,24642.0,air conditioner,hitachi
518,HITACHI S/AC 1T RIDA 3200F RSG312EAD 3S,Air Conditioners,HITACHI,3,28042.0,air conditioner,hitachi
540,Hitachi W/AC 1.5T SMR TM RAT518HUD 5S,Air Conditioners,HITACHI,3,32886.0,air conditioner,hitachi
783,Hitachi W/AC 1.5T KAZE+ RAW518KUD/Z1 5S,Air Conditioners,HITACHI,3,29486.0,air conditioner,hitachi
828,HITACHI REF FF 404L R-SG38FPND GBK,Refrigerators,HITACHI,1,60000.0,refrigerator,hitachi
1070,HITACHI REF FF 601L R-VG660PND3GGR 3S,Refrigerators,HITACHI,1,75055.0,refrigerator,hitachi
1160,HITACHI W/AC 1.5T RAW318KUD 3S,Air Conditioners,HITACHI,2,25322.0,air conditioner,hitachi
1236,HITACHI REF FF 415L VG440PND3K GBK 2S,Refrigerators,HITACHI,1,48118.0,refrigerator,hitachi


In [321]:
ratio("ref","refrigerator")

0.4