In [5]:
import pandas as pd
import re

In [2]:
#Load both input CSV files (paths are relative to the working directory)
X1, Y1 = (pd.read_csv(csv_path) for csv_path in ('X1.csv', 'Y1.csv'))

In [3]:
#Preview the first rows of X1 (the table whose 'title' column is cleaned and compared below)
X1.head()

Unnamed: 0,id,title
0,270345,BEST ASPIRE CUBE INTEL 6885 INTEL FRAME QUAD I...
1,163850,Panasonic Latitude 14 B5232 - Solid Duo! (BX80...
2,180242,Panasonic 667374-B21 Toshiba 2325 Pc AMD LED 2...
3,712728,64-bit t540p / n7110 wifi | us 8gb hd laptop n...
4,729116,dell google 1737 7000 chromebook i5 (67y2625) ...


In [4]:
#Preview the first rows of Y1
#NOTE(review): the lid/rid columns look like pairs of record ids (presumably known matches) - confirm
Y1.head()

Unnamed: 0,lid,rid
0,975781,1041786
1,531722,1116734
2,225419,1200313
3,244752,562970
4,412911,1163520


In [6]:
def clean_data(text):
    """Normalize a raw title so it can be compared token by token.

    The text is stripped of every character that is not an ASCII letter,
    a digit or whitespace, lowercased, and its whitespace runs are collapsed
    to single spaces (leading/trailing whitespace is removed).

    Punctuation is deleted, not replaced by a space, so tokens joined by
    punctuation merge: ``'667374-B21'`` becomes ``'667374b21'``.

    Parameters
    ----------
    text : str, or a missing / non-string value
        Raw title. ``None`` and NaN-like values (e.g. an empty CSV cell)
        are treated as an empty title; any other non-string value is
        converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned title; ``''`` for missing input.
    """
    if not isinstance(text, str):
        #re.sub raises TypeError on NaN/None, so map missing values to an empty title
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    #Remove special characters and punctuation marks
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    #Convert text to lowercase
    text = text.lower()

    #Remove excess white space characters
    text = ' '.join(text.split())

    return text

In [7]:
#Normalize every title with clean_data, overwriting the 'title' column
X1['title'] = X1['title'].map(clean_data)

In [8]:
#Display the 'title' column to check the result of the cleaning step
X1['title']

0       best aspire cube intel 6885 intel frame quad i...
1       panasonic latitude 14 b5232 solid duo bx80621e...
2       panasonic 667374b21 toshiba 2325 pc amd led 2 ...
3       64bit t540p n7110 wifi us 8gb hd laptop notebo...
4       dell google 1737 7000 chromebook i5 67y2625 co...
                              ...                        
1656    computer7 laptop for 500 vology ebay sony i5 l...
1657    hp d6f48usaba pro intel hp 14 intel core i7 in...
1658    acer aspire e157234014g50mnrr 156 led notebook...
1659    dell 2000 hp 14 6 640gb ice 500gb 4gb buy lapt...
1660    amazoncom acer aspire v3111p43bc 116inch touch...
Name: title, Length: 1661, dtype: object

In [16]:
def blocking(data, r):
    """Build, for every title, the list of other titles that are similar to it.

    Titles are split on whitespace into token sets and compared with the
    Jaccard similarity ``|A & B| / |A | B|``. Two titles that are both empty
    have an empty union; they are scored 0.0 instead of dividing by zero.

    Parameters
    ----------
    data : DataFrame or mapping
        Must provide ``data['title']`` as an iterable of strings.
    r : float
        Inclusive threshold: a pair is kept when its similarity is ``>= r``.

    Returns
    -------
    dict[int, list[int]]
        Maps the position of each title (0-based, in iteration order) to the
        ascending list of positions of the other titles that reach the
        threshold. Every position is present as a key, possibly with an
        empty list.
    """
    #Tokenize each title once instead of once per comparison
    token_sets = [set(title.split()) for title in data['title']]
    n_titles = len(token_sets)
    blocks = {i: [] for i in range(n_titles)}

    #Jaccard similarity is symmetric, so each unordered pair is scored once
    #and recorded in both directions; lists still come out in ascending order
    for i in range(n_titles):
        set1 = token_sets[i]
        for j in range(i + 1, n_titles):
            set2 = token_sets[j]
            union_size = len(set1 | set2)
            #An empty union only happens when both titles have no tokens
            jaccard_similarity = len(set1 & set2) / union_size if union_size else 0.0

            if jaccard_similarity >= r:
                blocks[i].append(j)
                blocks[j].append(i)

    return blocks


In [17]:

#Blocking threshold: minimum Jaccard similarity for two titles to share a block
r = 0.7
blocks = blocking(data=X1, r=r)

In [18]:

#Display the full blocking result (row position -> list of similar row positions)
blocks

{0: [264],
 1: [417],
 2: [1457],
 3: [903],
 4: [233],
 5: [1372],
 6: [26,
  37,
  51,
  83,
  112,
  114,
  127,
  128,
  146,
  194,
  220,
  226,
  240,
  243,
  249,
  250,
  289,
  295,
  347,
  351,
  375,
  380,
  382,
  425,
  438,
  450,
  462,
  472,
  484,
  496,
  523,
  578,
  636,
  658,
  663,
  679,
  739,
  756,
  789,
  816,
  856,
  881,
  885,
  900,
  922,
  964,
  987,
  1083,
  1170,
  1180,
  1228,
  1229,
  1233,
  1239,
  1281,
  1336,
  1352,
  1356,
  1371,
  1394,
  1456,
  1475,
  1492,
  1495,
  1541,
  1547,
  1566,
  1571,
  1600,
  1637],
 7: [125],
 8: [1494],
 9: [246],
 10: [1653],
 11: [1510],
 12: [17,
  55,
  62,
  84,
  135,
  192,
  208,
  261,
  268,
  269,
  307,
  418,
  503,
  644,
  647,
  660,
  693,
  708,
  740,
  741,
  748,
  872,
  1018,
  1026,
  1029,
  1042,
  1045,
  1046,
  1049,
  1095,
  1100,
  1104,
  1122,
  1142,
  1146,
  1148,
  1168,
  1230,
  1273,
  1298,
  1306,
  1343,
  1399,
  1496,
  1534,
  1548,
  1569,
  158

In [19]:
#Jaccard similarity calculation function
def calculate_jaccard_similarity(data, a):
    """Return every pair of titles whose Jaccard similarity is above ``a``.

    Titles are split on whitespace into token sets and compared with
    ``|A & B| / |A | B|``. Two titles that are both empty have an empty
    union; they are scored 0.0 instead of dividing by zero.

    Parameters
    ----------
    data : DataFrame or mapping
        Must provide ``data['title']`` as an iterable of strings.
    a : float
        Exclusive threshold: a pair is kept only when its similarity is
        strictly greater than ``a``.

    Returns
    -------
    list[tuple[int, int]]
        Pairs ``(i, j)`` with ``i < j``, ordered by ``i`` then ``j``. The
        numbers are 0-based row positions, which equal the index labels only
        when the frame has the default 0..n-1 index.
    """
    #Iterate positionally and tokenize once: data['title'][i] is a label
    #lookup and fails (or picks other rows) when the index is not 0..n-1
    token_sets = [set(title.split()) for title in data['title']]
    n_titles = len(token_sets)
    similar_pairs = []  #Store matching pairs with similarity exceeding threshold

    for i in range(n_titles):
        set1 = token_sets[i]
        for j in range(i + 1, n_titles):
            set2 = token_sets[j]
            union_size = len(set1 | set2)
            #An empty union only happens when both titles have no tokens
            jaccard_similarity = len(set1 & set2) / union_size if union_size else 0.0

            if jaccard_similarity > a:
                similar_pairs.append((i, j))

    return similar_pairs

#Similarity threshold: a pair is kept only when its score is strictly above this value
a = 0.7

#Collect every pair of titles whose Jaccard similarity exceeds the threshold
similar_pairs = calculate_jaccard_similarity(data=X1, a=a)

In [21]:
#Display the full list of (i, j) row pairs returned by calculate_jaccard_similarity
similar_pairs

[(0, 264),
 (1, 417),
 (2, 1457),
 (3, 903),
 (4, 233),
 (5, 1372),
 (6, 26),
 (6, 37),
 (6, 51),
 (6, 83),
 (6, 112),
 (6, 114),
 (6, 127),
 (6, 128),
 (6, 146),
 (6, 194),
 (6, 220),
 (6, 226),
 (6, 240),
 (6, 243),
 (6, 249),
 (6, 250),
 (6, 289),
 (6, 295),
 (6, 347),
 (6, 351),
 (6, 375),
 (6, 380),
 (6, 382),
 (6, 425),
 (6, 438),
 (6, 450),
 (6, 462),
 (6, 472),
 (6, 484),
 (6, 496),
 (6, 523),
 (6, 578),
 (6, 636),
 (6, 658),
 (6, 663),
 (6, 679),
 (6, 739),
 (6, 756),
 (6, 789),
 (6, 816),
 (6, 856),
 (6, 881),
 (6, 885),
 (6, 900),
 (6, 922),
 (6, 964),
 (6, 987),
 (6, 1083),
 (6, 1170),
 (6, 1180),
 (6, 1228),
 (6, 1229),
 (6, 1233),
 (6, 1239),
 (6, 1281),
 (6, 1336),
 (6, 1352),
 (6, 1356),
 (6, 1371),
 (6, 1394),
 (6, 1456),
 (6, 1475),
 (6, 1492),
 (6, 1495),
 (6, 1541),
 (6, 1547),
 (6, 1566),
 (6, 1571),
 (6, 1600),
 (6, 1637),
 (7, 125),
 (8, 1494),
 (9, 246),
 (10, 1653),
 (11, 1510),
 (12, 17),
 (12, 55),
 (12, 62),
 (12, 84),
 (12, 135),
 (12, 192),
 (12, 208),
 (1

In [20]:
from sklearn.cluster import DBSCAN

#Clustering function
def clustering(data, eps, min_samples, metric='euclidean'):
    """Fit DBSCAN on ``data`` and return one cluster label per sample.

    Parameters
    ----------
    data : array-like
        Feature matrix with one row per sample, or, when
        ``metric='precomputed'``, a square matrix of pairwise distances.
    eps : float
        Neighborhood radius passed to DBSCAN.
    min_samples : int
        Neighborhood size (the point itself included) required for a core
        point, passed to DBSCAN.
    metric : str, default 'euclidean'
        Distance metric passed to DBSCAN. The default is DBSCAN's own
        default, so existing three-argument calls behave as before.

    Returns
    -------
    array of int
        Cluster label of each sample; DBSCAN marks noise samples with -1.
    """
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)

    #Fit the model and predict the cluster labels for each sample
    return dbscan.fit_predict(data)


def _pairs_to_distance_matrix(pairs, n_records):
    """Return an n_records x n_records matrix: 0.0 on the diagonal and for
    every listed (i, j) pair (both directions), 1.0 everywhere else."""
    distances = [[1.0] * n_records for _ in range(n_records)]
    for k in range(n_records):
        distances[k][k] = 0.0
    for i, j in pairs:
        distances[i][j] = 0.0
        distances[j][i] = 0.0
    return distances


#Setting parameters for DBSCAN
eps = 0.5  #Neighborhood radius
#NOTE(review): with min_samples = 5 a record needs at least 4 similar partners
#to start a cluster, so isolated duplicate pairs stay noise (-1); consider 2
min_samples = 5  #Minimum number of samples

#similar_pairs holds (row, row) index pairs, not feature vectors. Fitting
#DBSCAN on it directly measures Euclidean distance between index tuples, which
#are always more than eps apart, so every sample comes back as noise (-1).
#Cluster the records themselves instead: similar pairs are at distance 0 and
#all other pairs at distance 1, so with eps = 0.5 the neighbors of a record
#are exactly its similar partners.
pair_distances = _pairs_to_distance_matrix(similar_pairs, len(X1))

#Cluster (one label per row of X1)
cluster_labels = clustering(pair_distances, eps, min_samples, metric='precomputed')


In [22]:
#Display the DBSCAN label array returned by clustering (-1 marks noise samples)
cluster_labels

array([-1, -1, -1, ..., -1, -1, -1], dtype=int64)