In [41]:
import pandas as pd
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
import json
import re
from pprint import pprint
import string
import sys

# Characters stripped from listing titles: every ASCII punctuation mark except
# the period, which is kept (so a token such as "12.1" stays intact).
include = {'.'}
exclude = set(string.punctuation).difference(include)

# Build the English stop-word set once at load time; handle_titles tests every
# title word for membership in it.
cachedStopWords = set(stopwords.words("english"))

def handle_titles(x):
    """
    Normalise a listing title before fuzzy matching.

    The title is lowercased, '/' and '-' become spaces, every character in the
    module-level ``exclude`` set is dropped, and words found in
    ``cachedStopWords`` are removed. The surviving words are joined with
    single spaces.

    x: any string
    """
    lowered = x.strip().lower()
    # Turn these separators into spaces first so the words they join are split
    # apart rather than glued together when punctuation is removed.
    for separator in ('/', '-'):
        lowered = lowered.replace(separator, ' ')
    kept_chars = [ch for ch in lowered if ch not in exclude]
    words = ''.join(kept_chars).split()
    meaningful = [word for word in words if word not in cachedStopWords]
    return ' '.join(meaningful)

def handle_products(x):
    """
    Helper function to normalise a product field (name, manufacturer, model).

    Lowercases the string and replaces every run of non-alphanumeric
    characters (including '_' and '-') with a single space, so words in the
    result are always separated by exactly one space, matching the
    single-space word separation produced by handle_titles.

    x: any string
    """
    x = x.strip().lower()
    # The '+' collapses adjacent separators ("a_-_b" -> "a b") instead of
    # emitting one space per separator character.
    return re.sub(r'[\W_]+', ' ', x).strip()

pd.options.display.max_colwidth=150


def _read_json_lines(path):
    """Parse the JSON-lines file at `path` into a list, one object per non-blank line."""
    with open(path) as json_lines_file:
        # A blank line (for example a trailing newline at the end of the file)
        # would make json.loads raise ValueError, so blank lines are skipped.
        return [json.loads(line) for line in json_lines_file if line.strip()]


products = _read_json_lines('data/products.txt')
listings = _read_json_lines('data/listings.txt')

df_products = pd.DataFrame(products, columns=['product_name', 'manufacturer', 'family', 'model'])
df_listings = pd.DataFrame(listings, columns=['title', 'manufacturer', 'price', 'currency'])
df_products = df_products.fillna('')
# The listing text columns are normalised with string methods further down, so
# a record missing one of these keys must hold '' rather than NaN.
df_listings[['title', 'manufacturer']] = df_listings[['title', 'manufacturer']].fillna('')
df_products

Unnamed: 0,product_name,manufacturer,family,model
0,Sony_Cyber-shot_DSC-W310,Sony,Cyber-shot,DSC-W310
1,Samsung_TL240,Samsung,,TL240
2,Nikon-s6100,Nikon,Coolpix,S6100
3,Samsung_TL220,Samsung,,TL220
4,Fujifilm-T205,Fujifilm,FinePix,T205
5,Casio_QV-5000SX,Casio,,QV-5000SX
6,Canon_Digital_IXUS_130_IS,Canon,Digital IXUS,130 IS
7,Leica_Digilux,Leica,,Digilux
8,Fujifilm_FinePix_1500,Fujifilm,FinePix,1500
9,Sony-HX100v,Sony,Cybershot,DSC-HX100v


In [24]:
### product_name already seems to carry the manufacturer, family and model,
### and model should be the key differentiator between products.
### For now only "model", "product_name" and "manufacturer" are normalised;
### a combined "product_fingerprint" column (product_name + manufacturer +
### family + model) was tried earlier and is not used.

for column in ('product_name', 'manufacturer', 'model'):
    df_products[column] = df_products[column].apply(handle_products)

df_listings['title'] = df_listings['title'].apply(handle_titles)
df_listings['manufacturer'] = df_listings['manufacturer'].apply(handle_products)

In [40]:
# Dump every product row with its index label. A single pre-formatted argument
# keeps the output identical under the Python 2 print statement and the
# Python 3 print function (the bare `print index, row` form is a SyntaxError
# on Python 3).
for index, row in df_products.iterrows():
    print("%s %s" % (index, row))

0 product_name    sony cyber shot dsc w310
manufacturer                        sony
family                        Cyber-shot
model                           dsc w310
Name: 0, dtype: object
1 product_name    samsung tl240
manufacturer          samsung
family                       
model                   tl240
Name: 1, dtype: object
2 product_name    nikon s6100
manufacturer          nikon
family              Coolpix
model                 s6100
Name: 2, dtype: object
3 product_name    samsung tl220
manufacturer          samsung
family                       
model                   tl220
Name: 3, dtype: object
4 product_name    fujifilm t205
manufacturer         fujifilm
family                FinePix
model                    t205
Name: 4, dtype: object
5 product_name    casio qv 5000sx
manufacturer              casio
family                         
model                 qv 5000sx
Name: 5, dtype: object
6 product_name    canon digital ixus 130 is
manufacturer                        canon


In [25]:
# Raise the row display limit so the sorted listings below are not truncated.
pd.set_option('display.max_rows', 1000)

# Order listings by manufacturer; rows with an empty manufacturer sort first.
df_listings_sorted = df_listings.sort_values(by='manufacturer')
first_listing = df_listings_sorted.iloc[0]
print(first_listing)
df_listings_sorted

title           nikon coolpix s5 6mp digital camera
manufacturer                                       
price                                        399.95
currency                                        USD
Name: 8396, dtype: object


Unnamed: 0,title,manufacturer,price,currency
8396,nikon coolpix s5 6mp digital camera,,399.95,USD
11918,panasonic dmc lx3 black,,329.00,GBP
7634,minolta dimage s404 4mp digital camera 4x optical zoom,,65.00,USD
8333,nikon coolpix 5200 5mp digital camera 3x optical zoom,,288.53,USD
19079,cadre photo reveil radio,,71.40,EUR
8332,nikon coolpix 5200 5mp digital camera 3x optical zoom,,160.00,USD
11864,samsung l201 black including charger lithium battery,,70.00,GBP
19434,cadre photo numerique 7 pvid,,61.30,EUR
7595,gba sp digital camera,,49.95,USD
7887,fujifilm 12.2 magapixels digital camera 2.7 lcd screen 3x optical zoom,,109.87,USD


In [44]:
# Title of the listing at position 20104 of the manufacturer-sorted frame.
df_listings_sorted.iloc[20104]['title']

u'vtech kidizoom pro pink'

In [35]:
# Type of the index label carried by the row at position 297 after sorting.
row_label = df_listings_sorted.iloc[297].name
type(row_label)

numpy.int64

In [6]:
# Normalised name and model of the product at position 303.
product_row = df_products.iloc[303]
(product_row['product_name'], product_row['model'])

(u'canon powershot sx130 is', u'sx130 is')

In [96]:
# Compare one product name against one listing title taken from the frames.
product_name = df_products.iloc[303].product_name
listing_title = df_listings.iloc[1].title
print(product_name)
print(listing_title)
print(fuzz.partial_ratio(product_name, listing_title))

# Same pair written out as literals, scored with each fuzzywuzzy scorer in turn.
sample_product = "canon powershot sx130 is"
sample_title = "canon powershot sx130is 12.1 mp digital camera 12x wide angle optical image stabilized zoom 3.0 inch lcd"
for scorer in (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio):
    print(scorer(sample_product, sample_title))

canon powershot sx130 is
canon powershot sx130is 12.1 mp digital camera 12x wide angle optical image stabilized zoom 3.0 inch lcd
96
38
96
36
77


In [99]:
# partial_ratio of product 303 against the first 100 listings; the printed
# counter is 1-based, so counter k refers to df_listings.iloc[k - 1].
target_name = df_products.iloc[303].product_name
for rank in range(1, 101):
    print(rank, fuzz.partial_ratio(target_name, df_listings.iloc[rank - 1].title))

(1, 46)
(2, 96)
(3, 96)
(4, 79)
(5, 79)
(6, 29)
(7, 79)
(8, 75)
(9, 75)
(10, 75)
(11, 75)
(12, 88)
(13, 29)
(14, 29)
(15, 29)
(16, 38)
(17, 42)
(18, 33)
(19, 75)
(20, 75)
(21, 88)
(22, 42)
(23, 33)
(24, 33)
(25, 83)
(26, 75)
(27, 75)
(28, 39)
(29, 38)
(30, 83)
(31, 83)
(32, 38)
(33, 29)
(34, 29)
(35, 38)
(36, 38)
(37, 83)
(38, 33)
(39, 33)
(40, 33)
(41, 33)
(42, 38)
(43, 29)
(44, 33)
(45, 33)
(46, 75)
(47, 75)
(48, 92)
(49, 92)
(50, 29)
(51, 33)
(52, 33)
(53, 33)
(54, 75)
(55, 75)
(56, 75)
(57, 75)
(58, 88)
(59, 88)
(60, 88)
(61, 88)
(62, 92)
(63, 33)
(64, 33)
(65, 33)
(66, 33)
(67, 83)
(68, 83)
(69, 33)
(70, 33)
(71, 38)
(72, 38)
(73, 38)
(74, 27)
(75, 27)
(76, 33)
(77, 33)
(78, 33)
(79, 33)
(80, 33)
(81, 38)
(82, 38)
(83, 33)
(84, 75)
(85, 75)
(86, 33)
(87, 33)
(88, 92)
(89, 92)
(90, 83)
(91, 83)
(92, 29)
(93, 29)
(94, 38)
(95, 29)
(96, 29)
(97, 29)
(98, 33)
(99, 29)
(100, 29)


In [44]:
# Row-wise fuzz.ratio between the 'title' and 'title2' columns.
# NOTE(review): no visible cell creates 'title2' -- presumably it came from a
# cell that was deleted or run out of order; confirm before relying on this.
df_listings.apply(lambda row: fuzz.ratio(row['title'], row['title2']), axis=1)

0    100
1    100
2    100
3    100
4    100
5    100
6    100
7    100
8    100
9    100
dtype: int64

In [None]:
df_listings['title_ratio'] = df_listings.apply