In [54]:
import pandas as pd
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
import json
import re
from pprint import pprint
import string
import sys

In [69]:
# Punctuation characters that handle_titles deletes from listing titles.
# '.' and '-' are held back from the delete set: '.' keeps decimals such as
# "12.1" intact, and handle_titles turns '-' into a space on its own before
# this set is consulted.
include = set('.-')
exclude = set(string.punctuation).difference(include)

# English stop words, materialised once as a set for fast membership tests.
cachedStopWords = set(stopwords.words("english"))

def handle_titles(x):
    """
    Helper function to normalise a listing title before fuzzy matching.

    The title is trimmed and lowercased, '/' and '-' become spaces, every
    character found in the module-level `exclude` set is dropped, and words
    found in `cachedStopWords` are removed. The remaining words are joined
    with single spaces.

    x: any string
    """
    lowered = x.strip().lower()
    # '/' and '-' act as word separators, so they become spaces rather than
    # being deleted along with the rest of the punctuation.
    for separator in ('/', '-'):
        lowered = lowered.replace(separator, ' ')
    kept_chars = [ch for ch in lowered if ch not in exclude]
    tokens = ''.join(kept_chars).split()
    return ' '.join(token for token in tokens if token not in cachedStopWords)

def handle_products(x):
    """
    Helper function to normalise a product name before fuzzy matching.

    Lowercases the string and turns every run of non-alphanumeric characters
    (including '_' and '-') into a single space, so consecutive or
    leading/trailing separators never leave doubled or dangling spaces in
    the result.

    x: any string
    returns: the normalised string
    """
    # \W already matches '-' and whitespace; '_' is listed explicitly because
    # the regex engine counts it as a word character. '+' collapses a run of
    # separators into one space, and strip() removes a space left at either end.
    return re.sub(r'[\W_]+', ' ', x.lower()).strip()

In [92]:
pd.options.display.max_colwidth = 150

# Each input file is parsed line by line, one JSON document per line.
with open('data/products.txt') as products_file:
    products = [json.loads(line) for line in products_file]
with open('data/listings.txt') as listings_file:
    listings = [json.loads(line) for line in listings_file]

# Restrict both frames to the listed columns; missing product values are
# replaced with '' so the product columns can be treated as plain text.
product_columns = ['product_name', 'manufacturer', 'family', 'model']
listing_columns = ['title', 'manufacturer', 'price', 'currency']
df_products = pd.DataFrame(products, columns=product_columns).fillna('')
df_listings = pd.DataFrame(listings, columns=listing_columns)
df_listings

Unnamed: 0,title,manufacturer,price,currency
0,LED Flash Macro Ring Light (48 X LED) with 6 Adapter Rings for For Canon/Sony/Nikon/Sigma Lenses,Neewer Electronics Accessories,35.99,CAD
1,Canon PowerShot SX130IS 12.1 MP Digital Camera with 12x Wide Angle Optical Image Stabilized Zoom with 3.0-Inch LCD,Canon Canada,199.96,CAD
2,Canon PowerShot SX130IS 12.1 MP Digital Camera with 12x Wide Angle Optical Image Stabilized Zoom with 3.0-Inch LCD,Canon Canada,209.00,CAD
3,Canon PowerShot D10 12.1 MP Waterproof Digital Camera with 3x Optical Image Stabilized Zoom and 2.5-inch LCD (Blue/Silver),Canon Canada,306.24,CAD
4,Canon PowerShot D10 12.1 MP Waterproof Digital Camera with 3x Optical Image Stabilized Zoom and 2.5-inch LCD (Blue/Silver),Canon Canada,420.33,CAD
5,Genuine Samsung EB575152VU i9000 GalaxyS Battery,Samsung,13.99,CAD
6,Canon PowerShot A1200 (Black),Canon Canada,129.99,CAD
7,Canon PowerShot A495 10.1 MP Digital Camera with 3.3x Optical Zoom and 2.5-Inch LCD (Blue),Canon Canada,88.00,CAD
8,Canon PowerShot A495 10.1 MP Digital Camera with 3.3x Optical Zoom and 2.5-Inch LCD (Blue),Canon Canada,129.92,CAD
9,Canon PowerShot ELPH 300 HS (Black),Canon Canada,259.99,CAD


In [93]:
### product_name on its own appears to carry enough information (manufacturer,
### family and model), so for now it is the only field used for matching.
### Alternative considered but left disabled: a 'product_fingerprint' column
### built by joining product_name, manufacturer, family and model with spaces
### and passing the result through handle_products.
normalized_names = df_products['product_name'].apply(handle_products)
df_products['product_name'] = normalized_names
df_products

Unnamed: 0,product_name,manufacturer,family,model
0,sony cyber shot dsc w310,Sony,Cyber-shot,DSC-W310
1,samsung tl240,Samsung,,TL240
2,nikon s6100,Nikon,Coolpix,S6100
3,samsung tl220,Samsung,,TL220
4,fujifilm t205,Fujifilm,FinePix,T205
5,casio qv 5000sx,Casio,,QV-5000SX
6,canon digital ixus 130 is,Canon,Digital IXUS,130 IS
7,leica digilux,Leica,,Digilux
8,fujifilm finepix 1500,Fujifilm,FinePix,1500
9,sony hx100v,Sony,Cybershot,DSC-HX100v


In [63]:
# Inspect the product_name stored at positional row 303.
df_products['product_name'].iloc[303]

u'canon powershot sx130 is'

In [94]:
# Replace every listing title with its handle_titles-normalised form.
df_listings['title'] = df_listings['title'].apply(handle_titles)

In [95]:
# Display the listings frame so the normalised titles can be checked by eye.
df_listings

Unnamed: 0,title,manufacturer,price,currency
0,led flash macro ring light 48 x led 6 adapter rings canon sony nikon sigma lenses,Neewer Electronics Accessories,35.99,CAD
1,canon powershot sx130is 12.1 mp digital camera 12x wide angle optical image stabilized zoom 3.0 inch lcd,Canon Canada,199.96,CAD
2,canon powershot sx130is 12.1 mp digital camera 12x wide angle optical image stabilized zoom 3.0 inch lcd,Canon Canada,209.00,CAD
3,canon powershot d10 12.1 mp waterproof digital camera 3x optical image stabilized zoom 2.5 inch lcd blue silver,Canon Canada,306.24,CAD
4,canon powershot d10 12.1 mp waterproof digital camera 3x optical image stabilized zoom 2.5 inch lcd blue silver,Canon Canada,420.33,CAD
5,genuine samsung eb575152vu i9000 galaxys battery,Samsung,13.99,CAD
6,canon powershot a1200 black,Canon Canada,129.99,CAD
7,canon powershot a495 10.1 mp digital camera 3.3x optical zoom 2.5 inch lcd blue,Canon Canada,88.00,CAD
8,canon powershot a495 10.1 mp digital camera 3.3x optical zoom 2.5 inch lcd blue,Canon Canada,129.92,CAD
9,canon powershot elph 300 hs black,Canon Canada,259.99,CAD


In [96]:
# Show the product name at row 303, the listing title at row 1, and their
# partial_ratio score.
product_name = df_products.iloc[303].product_name
listing_title = df_listings.iloc[1].title
print(product_name)
print(listing_title)
print(fuzz.partial_ratio(product_name, listing_title))

# Run the four fuzzywuzzy scorers on one hard-coded product/title pair to
# compare how each of them rates it.
sample_product = "canon powershot sx130 is"
sample_title = "canon powershot sx130is 12.1 mp digital camera 12x wide angle optical image stabilized zoom 3.0 inch lcd"
for scorer in (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio):
    print(scorer(sample_product, sample_title))

canon powershot sx130 is
canon powershot sx130is 12.1 mp digital camera 12x wide angle optical image stabilized zoom 3.0 inch lcd
96
38
96
36
77


In [99]:
# Score the product name at row 303 against the first 100 listing titles,
# printing a 1-based position next to each partial_ratio score.
# The product name is looked up once instead of on every iteration, and the
# positional slice simply stops at the end of the frame, so a listings table
# with fewer than 100 rows no longer raises IndexError.
reference_name = df_products.iloc[303].product_name
for position, title in enumerate(df_listings['title'].iloc[:100], 1):
    print(position, fuzz.partial_ratio(reference_name, title))

(1, 46)
(2, 96)
(3, 96)
(4, 79)
(5, 79)
(6, 29)
(7, 79)
(8, 75)
(9, 75)
(10, 75)
(11, 75)
(12, 88)
(13, 29)
(14, 29)
(15, 29)
(16, 38)
(17, 42)
(18, 33)
(19, 75)
(20, 75)
(21, 88)
(22, 42)
(23, 33)
(24, 33)
(25, 83)
(26, 75)
(27, 75)
(28, 39)
(29, 38)
(30, 83)
(31, 83)
(32, 38)
(33, 29)
(34, 29)
(35, 38)
(36, 38)
(37, 83)
(38, 33)
(39, 33)
(40, 33)
(41, 33)
(42, 38)
(43, 29)
(44, 33)
(45, 33)
(46, 75)
(47, 75)
(48, 92)
(49, 92)
(50, 29)
(51, 33)
(52, 33)
(53, 33)
(54, 75)
(55, 75)
(56, 75)
(57, 75)
(58, 88)
(59, 88)
(60, 88)
(61, 88)
(62, 92)
(63, 33)
(64, 33)
(65, 33)
(66, 33)
(67, 83)
(68, 83)
(69, 33)
(70, 33)
(71, 38)
(72, 38)
(73, 38)
(74, 27)
(75, 27)
(76, 33)
(77, 33)
(78, 33)
(79, 33)
(80, 33)
(81, 38)
(82, 38)
(83, 33)
(84, 75)
(85, 75)
(86, 33)
(87, 33)
(88, 92)
(89, 92)
(90, 83)
(91, 83)
(92, 29)
(93, 29)
(94, 38)
(95, 29)
(96, 29)
(97, 29)
(98, 33)
(99, 29)
(100, 29)


In [44]:
# Row-wise fuzz.ratio between the 'title' and 'title2' columns.
# NOTE(review): no cell visible here creates a 'title2' column on df_listings,
# and this cell's execution count (44) is lower than that of the cells above.
# Confirm where 'title2' comes from; on a fresh Restart & Run All this lookup
# would presumably fail.
df_listings.apply(lambda row: fuzz.ratio(row['title'], row['title2']), axis=1)

0    100
1    100
2    100
3    100
4    100
5    100
6    100
7    100
8    100
9    100
dtype: int64

In [None]:
df_listings['title_ratio'] = df_listings.apply