In [None]:
# import packages
import pandas as pd
import numpy as np
import json 
import random
from sklearn.feature_extraction.text import CountVectorizer
import re
import math
from similarity.normalized_levenshtein import NormalizedLevenshtein
normalized_levenshtein = NormalizedLevenshtein()
from similarity.qgram import QGram
import copy
import string
from math import comb

In [None]:
# Load data and preprocess
# The context manager closes the file even when JSON parsing raises.
with open('TVs-all-merged.json') as f:
    data = json.load(f)

# Flatten the lists stored under every key of `data` into parallel
# per-product lists (same order for all of them).
products = [item for sublist in data.values() for item in sublist]
modelids = [item['modelID'] for item in products]
shops = [item['shop'] for item in products]
features = [item['featuresMap'] for item in products]

# Regex substitutions applied, in this order, to every lower-cased title.
_TITLE_SUBSTITUTIONS = [
    # Collapse "hz"/"hertz" and any preceding '-', ' ' or '/' into 'hz'.
    (re.compile(r'[- /]*((hz)|hertz)'), r'hz'),
    # Collapse "inch"/"inches" (and preceding separators) into a double quote...
    (re.compile(r'[- /]*((inch)|inches)'), r'"'),
    # ...then turn every double quote into the token 'inch '.
    (re.compile(r'["]'), r'inch '),
    (re.compile(r'(diagonal)+|(diagonally)'), r'diag.'),
    # Drop shop names, a few marketing words and the characters - ( ) /.
    (re.compile(r'(newegg.com)|[-()/]|(thenerds.net)|(best)|(buy)|(refurbished)'), r''),
]
_NON_ALNUM = re.compile(r'[^a-zA-Z0-9-]')


def _normalize_title(title):
    """Return the lower-cased title after the unit/shop/punctuation clean-up."""
    title = title.lower()
    for title_pattern, replacement in _TITLE_SUBSTITUTIONS:
        title = title_pattern.sub(replacement, title)
    title = title.replace('ledlcd', ' led lcd ')
    # Everything that is not a letter, digit or hyphen becomes a space.
    return _NON_ALNUM.sub(' ', title)


titles = [_normalize_title(item['title']) for item in products]

setShops = set(shops)

# Brands found in the feature maps (lower-cased).
unique_brands = {
    product_info['Brand'].lower()
    for product_info in features
    if 'Brand' in product_info
}

known_brands = {'philips', 'azend','sharp', 'supersonic', 'sony', 'toshiba', 'tcl', 'samsung', 'sanyo', 'panasonic', 'vizio', 'coby',
                'viewsonic', 'sunbritetv', 'westinghouse', 'epson', 'avue', 'insignia', 'jvc', 'nec', 'venturer', 'rca', 'hisense',
                'affinity', 'compaq', 'craig', 'elo', 'optoma', 'dynex', 'mitsubishi', 'contex', 'viore', 'hiteker', 'elite', 'gpx', 'curtisyoung', 'lg'}

# Long spellings that are folded into their short brand name.  They are left
# out of `brands` and are also normalised when read from the 'Brand' feature;
# otherwise two listings of the same product could carry different brand
# labels ('lg' vs 'lg electronics') and never be compared by a brand filter.
BRAND_ALIASES = {'jvc tv': 'jvc', 'lg electronics': 'lg'}

# Sorted so that the brand order (and therefore the first-match rule below and
# any row order derived from `brands`) is identical on every run; iterating a
# set of strings is not stable across interpreter sessions.
brands = sorted((known_brands | unique_brands) - set(BRAND_ALIASES))

# One brand label per product: the 'Brand' feature wins when present,
# otherwise the first known brand occurring as a substring of the title,
# otherwise None.
brands_list = [None] * len(titles)
for count, (title, product_info) in enumerate(zip(titles, features)):
    if 'Brand' in product_info:
        listed_brand = product_info['Brand'].lower()
        brands_list[count] = BRAND_ALIASES.get(listed_brand, listed_brand)
    else:
        brands_list[count] = next((brand for brand in brands if brand in title), None)

In [None]:
# Obtain model words from the titles and features
def convertToMWList(matrix):
    """Extract model-word tokens from product titles and feature maps.

    Parameters
    ----------
    matrix : 2-D array
        Column 0 holds the title strings, column 1 holds the feature
        dictionaries (their values are joined with spaces and lower-cased).

    Returns
    -------
    list
        ``[filtered_titles, filtered_features]``; each is a list with one
        entry per row of ``matrix``.  An entry is the sorted list of distinct
        alphabetic runs and digit runs found in the words that look like model
        words (mixed digits/non-digits, or starting with ``led``/``lcd``).
        For features only words longer than five characters are considered.
    """
    titlesA = matrix[:, 0]
    featuresA = matrix[:, 1]

    model_word = re.compile(r'(([a-zA-Z0-9]*(([0-9]+[^0-9, ]+)|([^0-9, ]+[0-9]+))[a-zA-Z0-9]*)|(led)|(lcd))')
    token = re.compile(r'([a-zA-Z]+|\d+)')
    # The previous pattern put "(Year)" inside a character class, which turned
    # every 'Y', 'e', 'a' and 'r' into a space and broke model numbers apart
    # (e.g. "un46es6100" -> "un46 s6100").  The literal "(year)" is now matched
    # as a whole; the punctuation set is the one the old class effectively
    # covered, including its "$-/" range.
    separators = re.compile(r"\(year\)|[!@#$%&'()*+,\-./:;\\]", re.IGNORECASE)

    filtered_titles = []
    for title in titlesA:
        kept = " ".join(word for word in title.split(" ") if model_word.match(word))
        # sorted() keeps the output stable between runs (set order is not).
        filtered_titles.append(sorted(set(token.findall(kept))))

    filtered_features = []
    for feature in featuresA:
        text = " ".join(feature.values()).lower()
        words = separators.sub(' ', text).split(" ")
        kept = " ".join(word for word in words if len(word) > 5 and model_word.match(word))
        filtered_features.append(sorted(set(token.findall(kept))))

    return [filtered_titles, filtered_features]

# Obtain model words from the titles and features but the features are preprocessed differently for FISM
def convertToMWList3(matrix):
    """Keep the whole model words of titles and already-flattened features.

    Unlike ``convertToMWList`` the matching words are returned unsplit and the
    features in column 1 are plain strings rather than dictionaries.

    Parameters
    ----------
    matrix : 2-D array
        Column 0 holds title strings, column 1 holds feature strings.

    Returns
    -------
    list
        ``[filtered_titles, filtered_features]``; each contains, per row, the
        list of space-separated words that match the model-word pattern, in
        their original order (duplicates kept).
    """
    titlesA = matrix[:, 0]
    featuresA = matrix[:, 1]

    model_word = re.compile(r'(([a-zA-Z0-9]*(([0-9]+[^0-9, ]+)|([^0-9, ]+[0-9]+))[a-zA-Z0-9]*)|(led)|(lcd))')
    # "(Year)" used to sit inside the character class, so the letters
    # 'Y', 'e', 'a', 'r' were replaced by spaces and model numbers were split
    # (e.g. "un46es6100" -> "un46 s6100").  Match the literal token instead and
    # keep the punctuation set the old class effectively covered.
    separators = re.compile(r"\(year\)|[!@#$%&'()*+,\-./:;\\]", re.IGNORECASE)

    filtered_titles = [
        [word for word in title.split(" ") if model_word.match(word)]
        for title in titlesA
    ]
    filtered_features = [
        [word for word in separators.sub(' ', feature).split(" ") if model_word.match(word)]
        for feature in featuresA
    ]
    return [filtered_titles, filtered_features]
    
# Convert the column vectors in signature matrix to buckets
def vector_to_bucket(vector):
    """Return a string key identifying ``vector`` for use as a bucket id.

    The elements are rendered with ``str`` and joined with a comma.  The
    separator matters: plain concatenation maps different vectors such as
    ``[1, 12]`` and ``[11, 2]`` to the same key ("112").
    """
    return ','.join(map(str, vector))

# Obtain the modelID from the title
def process_title(title):
    """Strip short words and unit tokens from ``title``.

    Removed are whole words of one to four word characters, and whole tokens
    made of digits followed by ``hz``, ``p`` or ``inch``.  The surrounding
    whitespace is left in place, so the result may contain runs of spaces.
    """
    return re.sub(r'\b(?:\w{1,4}|(?:\d+hz)|(?:\d+p)|(?:\d+inch))\b', '', title)

In [None]:
# Create the signature matrix through min hashing
def createSigMatrix(matrix, sig_length=1000):
    """Build a min-hash signature matrix for the products in ``matrix``.

    A binary incidence matrix is built first: one row per model-word token
    taken from the titles (plus ``lcd``/``led``) and one row per entry of the
    module-level ``brands`` list; one column per product.  Each signature row
    is the column-wise minimum of a random hash over the rows that are set.

    Parameters
    ----------
    matrix : 2-D array
        Column 0 holds the titles; ``convertToMWList`` additionally reads the
        feature maps from column 1.
    sig_length : int, optional
        Number of hash functions, i.e. rows of the signature (default 1000).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(sig_length, number of products)``.  A product
        without any active row keeps ``inf`` in its whole column (this used to
        raise ``ValueError`` from ``min()`` on an empty selection).

    Notes
    -----
    ``primesInRange`` and ``hash_function`` are not defined in this part of
    the notebook; they must already exist in the kernel.  Coefficients are
    drawn from the ``random`` module, so seed it for reproducible signatures.
    """
    titlesA = matrix[:, 0]
    filtered_titles, filtered_features = convertToMWList(matrix)

    # Vocabulary: every title token plus 'lcd'/'led', minus generic words.
    excluded = {'series', '“', '”', 'hz', 'inch'}
    vocabulary = {'lcd', 'led'}
    for tokens in filtered_titles:
        vocabulary.update(tokens)
    # Sorted so the row layout (and therefore the seeded result) is the same
    # on every run; plain set iteration order is not stable across sessions.
    matches = sorted(vocabulary - excluded)

    # Only tokens of at most four characters are ever switched on; longer ones
    # keep their (all-zero) row so the matrix shape is unchanged.
    row_of = {word: row for row, word in enumerate(matches) if len(word) <= 4}

    num_cols = len(titlesA)
    num_rows = len(matches) + len(brands)
    df = np.zeros((num_rows, num_cols), dtype=int)
    for col in range(num_cols):
        # A token is set when it occurs in the product's title or feature tokens.
        for word in filtered_titles[col]:
            row = row_of.get(word)
            if row is not None:
                df[row, col] = 1
        for word in filtered_features[col]:
            row = row_of.get(word)
            if row is not None:
                df[row, col] = 1
    for col, title in enumerate(titlesA):
        for offset, brand in enumerate(brands):
            if brand in title:
                df[len(matches) + offset, col] = 1

    sig_matrix = np.full((sig_length, num_cols), np.inf)
    prime = primesInRange(num_rows + 1)
    active = df == 1
    for counter in range(sig_length):
        a = random.randint(1, num_rows)
        c = random.randint(0, num_rows)
        hash_values = np.array([hash_function(a, x, c, prime) for x in range(num_rows)])
        # Column-wise minimum over the active rows; inactive cells count as inf.
        sig_matrix[counter, :] = np.where(active, hash_values[:, None], np.inf).min(axis=0)
    return sig_matrix

In [None]:
# Create bootstraps
def bootstrap_sampling(data_matrix, num_bootstraps=5):
    """Draw ``num_bootstraps`` bootstrap/out-of-sample splits of ``data_matrix``.

    For every split, ``len(data_matrix)`` row indices are drawn with
    replacement through ``np.random.choice``; the rows never drawn form the
    second element of the pair.

    Returns a list of ``(bootstrap_sample, test_set)`` tuples.
    """
    n_rows = len(data_matrix)
    splits = []
    for _ in range(num_bootstraps):
        drawn = np.random.choice(n_rows, n_rows, replace=True)
        # Indices that were not drawn at all, in ascending order.
        held_out = np.setdiff1d(np.arange(n_rows), drawn)
        splits.append((data_matrix[drawn], data_matrix[held_out]))
    return splits

In [None]:
# Perform LSH and run FISM
def performLSH(matrix, bootstrap, es, split, trueDup):
    """Band the signature matrix, collect candidate pairs and classify them.

    Parameters
    ----------
    matrix : 2-D array
        Signature matrix, one column per product.
    bootstrap : 2-D array
        Product rows aligned with the columns of ``matrix``; column 2 is
        compared as the shop and column 4 as the brand.
    es, trueDup
        Unused; kept so existing calls keep working.
    split : int
        Number of bands.  Each band has ``len(matrix) // split`` rows;
        leftover rows are ignored.

    Returns
    -------
    list
        ``[duplicate_list, comps]``: the set of ``(left, right)`` index pairs
        that ``FISM`` labels as duplicates, and the number of distinct
        candidate pairs produced by the banding.
    """
    from itertools import combinations

    # Use the signature passed in.  This used to read the notebook-level
    # `sigMatr`, which only worked while that global matched the argument.
    rows_per_band = len(matrix) // split
    num_products = np.shape(matrix)[1]

    setComparisons = set()
    for band in range(split):
        band_rows = matrix[band * rows_per_band:(band + 1) * rows_per_band, :]
        bucket_dict = {}
        for t in range(num_products):
            bucket_dict.setdefault(vector_to_bucket(band_rows[:, t]), []).append(t)
        # Products sharing a bucket in any band become candidates; indices
        # are appended in ascending order, so pairs are (smaller, larger).
        for members in bucket_dict.values():
            setComparisons.update(combinations(members, 2))

    comps = len(setComparisons)
    duplicate_list = set()
    for left, right in setComparisons:
        # Skip pairs with different brands or from the same shop.
        if bootstrap[left, 4] != bootstrap[right, 4] or bootstrap[left, 2] == bootstrap[right, 2]:
            continue
        if FISM(np.vstack((bootstrap[left, :], bootstrap[right, :]))) == 1:
            duplicate_list.add((left, right))
    return [duplicate_list, comps]

In [None]:
# Feature intersection similarity method
def FISM(matrix):
    """Classify the two products stacked in ``matrix`` as duplicate (1) or not (0).

    Column 0 holds the titles and column 1 the feature dictionaries of the
    two rows.  The decision is based on the model words of the titles: words
    shared by both products (looking at titles and features) count as
    agreement, the remaining ones are checked for conflicts.
    """
    product_titles = matrix[:, 0]

    def normalise(text):
        # Unit/shop/punctuation clean-up of a flattened feature string; the
        # order of the steps matters.
        text = re.sub(r'[- /]*((hz)|hertz)', r'hz', text)
        text = re.sub(r'[- /]*((inch)|inches)', r'"', text)
        text = re.sub(r'["]', r'inch ', text)
        text = re.sub(r'(diagonal)+|(diagonally)', r'diag.', text)
        text = re.sub(r'(newegg.com)|[-()/]|(thenerds.net)|(best)|(buy)|(refurbished)', r'', text)
        text = text.replace('ledlcd', 'led lcd')
        text = text.replace('led', ' led ')
        text = text.replace('lcd', ' lcd ')
        return re.sub(r'[^a-zA-Z0-9-]', r' ', text)

    # Each feature map becomes one lower-cased string of its values.
    feature_texts = [
        normalise(" ".join(feature_map.values()).lower())
        for feature_map in matrix[:, 1]
    ]

    title_words, feature_words = convertToMWList3(
        np.column_stack((product_titles, feature_texts))
    )

    words_a = set(title_words[0])
    words_b = set(title_words[1])

    # Candidate vocabulary: title model words of both products plus
    # 'lcd'/'led', without a few generic tokens.
    ignored = {'series', '“', '”', 'hz', 'inch'}
    vocabulary = (words_a | words_b | {'lcd', 'led'}) - ignored

    # Feature model words only extend what each product is known to contain.
    words_a.update(feature_words[0])
    words_b.update(feature_words[1])

    present_a = vocabulary & words_a
    present_b = vocabulary & words_b
    shared = present_a & present_b
    only_a = present_a - shared
    only_b = present_b - shared

    # One product is LED-only and the other LCD-only: not the same product.
    led_vs_lcd = 'led' in only_a and 'lcd' in only_b
    lcd_vs_led = 'lcd' in only_a and 'led' in only_b
    if led_vs_lcd or lcd_vs_led:
        return 0

    for word_a in only_a:
        letters_a = ''.join(re.findall(r'[a-z]+', word_a))
        digits_a = ''.join(re.findall(r'[0-9]+', word_a))
        for word_b in only_b:
            letters_b = ''.join(re.findall(r'[a-z]+', word_b))
            digits_b = ''.join(re.findall(r'[0-9]+', word_b))
            if not letters_a or not letters_b:
                continue
            # NOTE(review): if the normalized distance is bounded by 1, the
            # "< 1.1" test is always true and only the digits decide - confirm
            # this threshold is intended.
            if normalized_levenshtein.distance(letters_a, letters_b) < 1.1 and digits_a != digits_b:
                # Differing 'inch' values are tolerated when their digit
                # counts differ.
                if letters_a == 'inch' and len(digits_a) != len(digits_b):
                    continue
                return 0

    # No unmatched model words on either side: duplicates.
    if not only_a and not only_b:
        return 1
    # Otherwise require at least five shared model words.
    return 0 if len(shared) < 5 else 1

In [None]:
# Main execution file
# Seed both generators: createSigMatrix draws its hash coefficients from
# `random`, while bootstrap_sampling draws its indices from `np.random`.
# Seeding only `random` left the bootstraps different on every run.
SEED = 694204
random.seed(SEED)
np.random.seed(SEED)
fullMatrix = np.column_stack((titles, features, shops, modelids, brands_list))
bootstraps = bootstrap_sampling(fullMatrix, num_bootstraps=20)


# Number of bands tried for the LSH step.
divisions = [2, 50, 100, 125, 200, 500]
comparisons = np.zeros(len(divisions))
truePos = np.zeros(len(divisions))
falsePos = np.zeros(len(divisions))
falseNeg = np.zeros(len(divisions))
actualities = np.zeros(len(divisions))
F1 = np.zeros(len(divisions))
PR = np.zeros(len(divisions))
RC = np.zeros(len(divisions))
PQ = np.zeros(len(divisions))
PC = np.zeros(len(divisions))
for i, (bootstrap, test) in enumerate(bootstraps):
    print(i)
    sigMatr = createSigMatrix(test)

    # Ground truth: all index pairs of test products sharing a (lower-cased)
    # modelID, stored as (smaller index, larger index).
    bucket_dict1 = {}
    for t, r in enumerate(test[:, 3]):
        bucket_dict1.setdefault(r.lower(), []).append(t)
    listB = set()
    for g in bucket_dict1.values():
        for v in range(len(g)):
            for k in range(v + 1, len(g)):
                listB.add((g[v], g[k]))

    for p, j in enumerate(divisions):
        temp5 = performLSH(sigMatr, test, 0.522, j, listB)
        temp6 = temp5[0]
        truePos[p] = len(temp6.intersection(listB))
        falsePos[p] = len(temp6.difference(listB))
        falseNeg[p] = len(listB) - truePos[p]
        # Guard the ratios: a test set without true duplicates, or a banding
        # without candidate pairs, would add 0/0 = NaN and turn the whole
        # average into NaN.  Such cases contribute 0 instead.
        if len(listB) > 0:
            PC[p] = PC[p] + truePos[p] / len(listB)
        if temp5[1] > 0:
            PQ[p] = PQ[p] + truePos[p] / temp5[1]
        comparisons[p] = comparisons[p] + temp5[1]
        if (truePos[p] != 0):
            PR[p] = PR[p] + truePos[p]/(truePos[p] + falsePos[p])
            RC[p] = RC[p] + truePos[p]/(truePos[p] + falseNeg[p])

# Average the accumulated metrics over the bootstraps.
PR = np.divide(PR, len(bootstraps))
RC = np.divide(RC, len(bootstraps))
PC = np.divide(PC, len(bootstraps))
PQ = np.divide(PQ, len(bootstraps))
# F1 is defined as 0 where precision and recall are both 0 (avoids 0/0).
F1 = np.divide(2*PR*RC, PR+RC, out=np.zeros_like(PR), where=(PR+RC) != 0)
comparisons = np.divide(comparisons, len(bootstraps))