In [9]:
import pandas as pd
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
import json
import re
from pprint import pprint
import string
import sys

exclude = set(string.punctuation)
include = set('.')
exclude -= include

cachedStopWords = set(stopwords.words("english"))

def handle_titles(x):
    """
    Helper function to make string all lowercase and remove punctuation & stopwords.
    
    x: any string
    """
    x = x.strip().lower()    
    x = x.replace('/', ' ')  # replace '/' with space
    x = x.replace('-', ' ')  # replace '-' with space
    x = ''.join(ch for ch in x if ch not in exclude)
    x = ' '.join(word for word in x.split() if word not in cachedStopWords)
    return x.strip()

def handle_products(x):
    x = x.strip().lower()    
    return re.sub(r'[\W_]', ' ', x).strip()  # replace '_', '-', and all non-alphanumeric with ' '

pd.options.display.max_colwidth=150
pd.set_option('display.max_rows', 1000)
products = []
listings = []
with open('data/products.txt') as products_file:
    for line in products_file:
        products.append(json.loads(line))
with open('data/listings.txt') as listings_file:
    for line in listings_file:
        listings.append(json.loads(line))
#pprint(products)
df_products = pd.DataFrame(products, columns=['product_name', 'manufacturer', 'family', 'model'])
df_listings = pd.DataFrame(listings, columns=['title', 'manufacturer', 'price', 'currency'])
df_products = df_products.fillna('')

### Looks like product_name itself contains enough information (including manufacturer, family, model)
### model should be a key differentiator
### For now, let's use "model", "product_name" and "manufacturer"

#df_products['product_fingerprint'] = df_products.product_name + " " + df_products.manufacturer \
#    + " " + df_products.family + " " + df_products.model
#df_products['product_fingerprint'] = df_products.product_fingerprint.apply(handle_products)
df_products['product_name'] = df_products.product_name.apply(handle_products)
df_products['manufacturer'] = df_products.manufacturer.apply(handle_products)
df_products['model'] = df_products.model.apply(handle_products)
df_listings['title'] = df_listings.title.apply(handle_titles)
df_listings['manufacturer'] = df_listings.manufacturer.apply(handle_products)
df_products

title           sony cybershot dsc w310 appareil photo numérique 121 mpix rose
manufacturer                                                              sony
price                                                                   118.00
currency                                                                   EUR
Name: 18605, dtype: object


Unnamed: 0,product_name,manufacturer,family,model
0,sony cyber shot dsc w310,sony,Cyber-shot,dsc w310
1,samsung tl240,samsung,,tl240
2,nikon s6100,nikon,Coolpix,s6100
3,samsung tl220,samsung,,tl220
4,fujifilm t205,fujifilm,FinePix,t205
5,casio qv 5000sx,casio,,qv 5000sx
6,canon digital ixus 130 is,canon,Digital IXUS,130 is
7,leica digilux,leica,,digilux
8,fujifilm finepix 1500,fujifilm,FinePix,1500
9,sony hx100v,sony,Cybershot,dsc hx100v


In [2]:
df_listings

Unnamed: 0,title,manufacturer,price,currency
0,led flash macro ring light 48 x led 6 adapter rings canon sony nikon sigma lenses,neewer electronics accessories,35.99,CAD
1,canon powershot sx130is 12.1 mp digital camera 12x wide angle optical image stabilized zoom 3.0 inch lcd,canon canada,199.96,CAD
2,canon powershot sx130is 12.1 mp digital camera 12x wide angle optical image stabilized zoom 3.0 inch lcd,canon canada,209.00,CAD
3,canon powershot d10 12.1 mp waterproof digital camera 3x optical image stabilized zoom 2.5 inch lcd blue silver,canon canada,306.24,CAD
4,canon powershot d10 12.1 mp waterproof digital camera 3x optical image stabilized zoom 2.5 inch lcd blue silver,canon canada,420.33,CAD
5,genuine samsung eb575152vu i9000 galaxys battery,samsung,13.99,CAD
6,canon powershot a1200 black,canon canada,129.99,CAD
7,canon powershot a495 10.1 mp digital camera 3.3x optical zoom 2.5 inch lcd blue,canon canada,88.00,CAD
8,canon powershot a495 10.1 mp digital camera 3.3x optical zoom 2.5 inch lcd blue,canon canada,129.92,CAD
9,canon powershot elph 300 hs black,canon canada,259.99,CAD


In [3]:
df_listings = df_listings.sort_values('manufacturer', axis=0)  # sorted by manufacturer
df_listings

Unnamed: 0,title,manufacturer,price,currency
8396,nikon coolpix s5 6mp digital camera,,399.95,USD
11918,panasonic dmc lx3 black,,329.00,GBP
7634,minolta dimage s404 4mp digital camera 4x optical zoom,,65.00,USD
8333,nikon coolpix 5200 5mp digital camera 3x optical zoom,,288.53,USD
19079,cadre photo reveil radio,,71.40,EUR
8332,nikon coolpix 5200 5mp digital camera 3x optical zoom,,160.00,USD
11864,samsung l201 black including charger lithium battery,,70.00,GBP
19434,cadre photo numerique 7 pvid,,61.30,EUR
7595,gba sp digital camera,,49.95,USD
7887,fujifilm 12.2 magapixels digital camera 2.7 lcd screen 3x optical zoom,,109.87,USD


In [7]:
def match_manufacturer(listing_manuf, product_manuf):
    if product_manuf in listing_manuf or listing_manuf in product_manuf:
        return "MANUF-MATCH:EXACT"
    return "NOTMATCH"
def find_listings_with_matched_manufacturer(df_listings_sorted, product_manuf):
    """
    Assume that manufacturer (in products) is not empty string
    :return: a list of index (of df_listings_sorted) in ascending order
    """

    listings_index = []
    start_idx = 0
    end_idx = len(df_listings_sorted) - 1
    mid_idx = (start_idx + end_idx) / 2

    while end_idx-start_idx > 1:
        mid_value = df_listings_sorted.iloc[mid_idx].manufacturer
        if match_manufacturer(mid_value, product_manuf) == "MANUF-MATCH:EXACT":
            listings_index.append(mid_idx)
            # expand from mid_idx in both directions
            i = 1
            while mid_idx-i >= 0:
                v = df_listings_sorted.iloc[mid_idx-i].manufacturer
                if match_manufacturer(v, product_manuf) == "MANUF-MATCH:EXACT":
                    listings_index.append(mid_idx-i)
                    i += 1
                else:
                    break
            i = 1
            while mid_idx+i <= len(df_listings_sorted)-1:
                v = df_listings_sorted.iloc[mid_idx+i].manufacturer
                if match_manufacturer(v, product_manuf) == "MANUF-MATCH:EXACT":
                    listings_index.append(mid_idx+i)
                    i += 1
                else:
                    break
            break
        elif mid_value < product_manuf:
            start_idx = mid_idx
            mid_idx = (start_idx + end_idx) / 2
        elif mid_value > product_manuf:
            end_idx = mid_idx
            mid_idx = (start_idx + end_idx) / 2

    return sorted(listings_index)

In [6]:
original_index_of_empty_manufacturers = []
index_of_empty_manufacturers = []
i = 0
for index, row in df_listings.iterrows():
    if not row['manufacturer']:
        original_index_of_empty_manufacturers.append(index)
        index_of_empty_manufacturers.append(i)
        i += 1
    else:
        break
print(original_index_of_empty_manufacturers)
print(index_of_empty_manufacturers)
#print(df_listings.iloc[0])
for prod_idx, prod_row in df_products.iterrows():        
    match_product(prod_row, df_listings)
    sys.exit(1)

[8396, 11918, 7634, 8333, 19079, 8332, 11864, 19434, 7595, 7887, 7425, 6483, 6482, 11886, 16456, 15707, 12263, 12609, 12236, 8411, 759, 7596, 10938, 19074, 7633, 18340, 15706, 12306, 12305]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]
(0, product_name    sony cyber shot dsc w310
manufacturer                        sony
family                        Cyber-shot
model                           dsc w310
Name: 0, dtype: object)


SystemExit: 1

To exit: use 'exit', 'quit', or Ctrl-D.


In [None]:
df_listings_sorted.iloc[20104].title

In [15]:
print(str(df_listings.iloc[297].name) + "is good")
print('%d is good"' % df_listings.iloc[297].name)

2902is good
2902 is good"


In [None]:
df_products.iloc[303].product_name, df_products.iloc[303].model

In [None]:
print(df_products.iloc[303].product_name)
print(df_listings.iloc[1].title)
print(fuzz.partial_ratio(df_products.iloc[303].product_name, df_listings.iloc[1].title))
print(fuzz.ratio("canon powershot sx130 is","canon powershot sx130is 12.1 mp digital camera 12x wide angle optical image stabilized zoom 3.0 inch lcd"))
print(fuzz.partial_ratio("canon powershot sx130 is","canon powershot sx130is 12.1 mp digital camera 12x wide angle optical image stabilized zoom 3.0 inch lcd"))
print(fuzz.token_sort_ratio("canon powershot sx130 is","canon powershot sx130is 12.1 mp digital camera 12x wide angle optical image stabilized zoom 3.0 inch lcd"))
print(fuzz.token_set_ratio("canon powershot sx130 is","canon powershot sx130is 12.1 mp digital camera 12x wide angle optical image stabilized zoom 3.0 inch lcd"))

In [None]:
for i in range(100):
    print(i+1, fuzz.partial_ratio(df_products.iloc[303].product_name, df_listings.iloc[i].title))

In [None]:
df_listings.apply(lambda x: fuzz.ratio(x['title'], x['title2']), axis = 1)

In [None]:
df_listings['title_ratio'] = df_listings.apply