# OJA Deduplication Challenge

## Imports

### Packages

In [58]:
import os
import warnings

import pandas as pd
from tqdm import tqdm

# from Levenshtein import distance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [59]:
warnings.filterwarnings('ignore')

### Data

In [60]:
# Import from s3
os.system(f"mc cp s3/apalazzolo/Deduplication/wi_dataset.csv wi_dataset.csv")

`s3/apalazzolo/Deduplication/wi_dataset.csv` -> `wi_dataset.csv`
Total: 0 B, Transferred: 203.23 MiB, Speed: 155.83 MiB/s


0

In [61]:
data = pd.read_csv('wi_dataset.csv',
                   lineterminator='\n')

In [62]:
# For now let's work with a smaller extract

# data = original_data.head(10000)

In [63]:
n_ads = len(data)
n_ads

112056

In [64]:
data.head(3)

Unnamed: 0,id,title,description,location,country_id,company_name,retrieval_date
0,1,Traineeship Pensioen & Leven in Utrecht,We sturen je door naar je toekomstige opdracht...,Utrecht,NL,,2021-01-19
1,2,DEPOSITARY OFFICER (M/F),DEPOSITARY OFFICER (M/F) DO Recruitment Adviso...,,FR,DO Recruitment Advisors,2021-09-30
2,3,Cautam colegi manipulanti marfa,Descriere Angajam manipulanti marfa din Pitest...,Pitesti,RO,,2021-06-18


## Data preprocessing

In [65]:
# Basic cleaning

data[
    ['title', 'description', 'location', 'company_name']
] = data[
    ['title', 'description', 'location', 'company_name']
].apply(lambda x: x.str.strip().str.lower())

data.fillna("", inplace=True)

## Naive deduplication

In [66]:
duplicates = []

### Add the full duplicates

In [67]:
# Way too long to run
# + sorting the table breaks the code

# for i in tqdm(range(n_ads)):
#     for j in range(i+1,n_ads):
#         if (data.iloc[i, 1] == data.iloc[j, 1]) and (data.iloc[i, 2] == data.iloc[j, 2]):
#             duplicates.append({'id1': i+1, 'id2': j+1, 'type': 'FULL'})

In [68]:
data.sort_values(by=['title', 'description', 'id'], inplace=True)

i = 0
j = 1

while i < n_ads and j < n_ads:
    if (i%10000 == 0) and (j == i+1):
        print(i)
    if data.iloc[i, 1] < data.iloc[j, 1]:
        i += 1
        j = i + 1
    elif data.iloc[i, 1] > data.iloc[j, 1]:
        j += 1
    elif data.iloc[i, 2] < data.iloc[j, 2]:
        i += 1
        j = i + 1
    elif data.iloc[i, 2] > data.iloc[j, 2]:
        j += 1
    else:
        duplicates.append({'id1': data.iloc[i, 0], 'id2': data.iloc[j, 0], 'type': 'FULL'})
        j += 1

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000


In [69]:
len(duplicates)

357497

In [70]:
duplicates[0]

{'id1': 236, 'id2': 1598, 'type': 'FULL'}

### Add the semantic duplicates

In [71]:
data.sort_values(by=['id'], inplace=True)

In [72]:
data['text'] = data['title'] + ' ' + data['description'] + ' ' + data['location'] + ' ' + data['company_name']

# Use TF-IDF to vectorize the texts
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(data['text'])
tfidf

<112056x439107 sparse matrix of type '<class 'numpy.float64'>'
	with 15934299 stored elements in Compressed Sparse Row format>

In [73]:
# Use cosine similarity to compare the text
# But actually way too big output

# similarity_matrix = cosine_similarity(tfidf)

In [74]:
# Identify similar ads paris
# Needs to be done by steps

# for i in tqdm(range(similarity_matrix.shape[0])):
#     for j in range(i+1, similarity_matrix.shape[1]):
#         if similarity_matrix[i][j] > 0.8 and (
#             (data.iloc[i, 1] != data.iloc[j, 1]) or (data.iloc[i, 2] != data.iloc[j, 2])
#         ):
#             if data.iloc[i, 6] != data.iloc[j, 6]:
#                 duplicates.append({'id1': data.iloc[i, 0], 'id2': data.iloc[j, 0], 'type': 'TEMPORAL'})
#             else:
#                 if abs(
#                     len(data.iloc[i, 2]) - len(data.iloc[j, 2])
#                 ) / max(
#                     len(data.iloc[i, 2]), len(data.iloc[j, 2])
#                 ) < 0.08:
#                     duplicates.append({'id1': data.iloc[i, 0], 'id2': data.iloc[j, 0], 'type': 'SEMANTIC'})
#                 else:
#                     duplicates.append({'id1': data.iloc[i, 0], 'id2': data.iloc[j, 0], 'type': 'PARTIAL'})

In [75]:
def cosine_similarity_by_chunk(start, end):
    if end > n_ads:
        end = n_ads
    return cosine_similarity(X=tfidf[start:end], Y=tfidf)

In [76]:
chunk_size = 1000

for chunk_start in tqdm(range(0, n_ads, chunk_size)):
    similarity_matrix_chunk = cosine_similarity_by_chunk(chunk_start, chunk_start+chunk_size)
    compteur_init = len(duplicates)
    for i in range(chunk_size):
        for j in range(chunk_start+i+1, n_ads):
            if similarity_matrix_chunk[i][j] > 0.9 and (
                (data.iloc[chunk_start+i, 1] != data.iloc[j, 1]) or (data.iloc[chunk_start+i, 2] != data.iloc[j, 2])
            ):
                if data.iloc[chunk_start+i, 6] != data.iloc[j, 6]:
                    duplicates.append({'id1': data.iloc[chunk_start+i, 0], 'id2': data.iloc[j, 0], 'type': 'TEMPORAL'})
                else:
                    if abs(
                        len(data.iloc[chunk_start+i, 2]) - len(data.iloc[j, 2])
                    ) / max(
                        len(data.iloc[chunk_start+i, 2]), len(data.iloc[j, 2])
                    ) < 0.1:
                        duplicates.append({'id1': data.iloc[chunk_start+i, 0], 'id2': data.iloc[j, 0], 'type': 'SEMANTIC'})
                    else:
                        duplicates.append({'id1': data.iloc[chunk_start+i, 0], 'id2': data.iloc[j, 0], 'type': 'PARTIAL'})
    compteur_end = len(duplicates)
    print(compteur_end-compteur_init)

  1%|          | 1/113 [00:36<1:07:19, 36.07s/it]

48152


  2%|▏         | 2/113 [01:11<1:05:43, 35.53s/it]

44927


  3%|▎         | 3/113 [01:46<1:04:58, 35.44s/it]

47164


  4%|▎         | 4/113 [02:23<1:05:43, 36.18s/it]

61202


  4%|▍         | 5/113 [02:58<1:04:24, 35.78s/it]

44470


  5%|▌         | 6/113 [03:32<1:02:44, 35.19s/it]

47151


  6%|▌         | 7/113 [04:07<1:01:34, 34.85s/it]

39006


  7%|▋         | 8/113 [04:41<1:00:47, 34.74s/it]

48866


  8%|▊         | 9/113 [05:13<58:51, 33.96s/it]  

24033


  9%|▉         | 10/113 [05:46<57:48, 33.67s/it]

45896


 10%|▉         | 11/113 [06:19<56:45, 33.39s/it]

35900


 11%|█         | 12/113 [06:51<55:17, 32.85s/it]

35469


 12%|█▏        | 13/113 [07:23<54:22, 32.63s/it]

46810


 12%|█▏        | 14/113 [07:56<54:18, 32.91s/it]

47752


 13%|█▎        | 15/113 [08:29<53:23, 32.69s/it]

42920


 14%|█▍        | 16/113 [09:01<52:33, 32.51s/it]

41724


 15%|█▌        | 17/113 [09:35<52:39, 32.92s/it]

42496


 16%|█▌        | 18/113 [10:06<51:29, 32.52s/it]

29827


 17%|█▋        | 19/113 [10:37<50:09, 32.02s/it]

22189


 18%|█▊        | 20/113 [11:08<49:10, 31.73s/it]

34520


 19%|█▊        | 21/113 [11:39<48:14, 31.46s/it]

36920


 19%|█▉        | 22/113 [12:09<46:53, 30.91s/it]

40814


 20%|██        | 23/113 [12:38<45:47, 30.53s/it]

36677


 21%|██        | 24/113 [13:07<44:41, 30.13s/it]

41295


 22%|██▏       | 25/113 [13:35<43:17, 29.51s/it]

31800


 23%|██▎       | 26/113 [14:04<42:20, 29.20s/it]

33687


 24%|██▍       | 27/113 [14:32<41:22, 28.87s/it]

34931


 25%|██▍       | 28/113 [15:01<41:00, 28.95s/it]

42404


 26%|██▌       | 29/113 [15:29<40:07, 28.67s/it]

22919


 27%|██▋       | 30/113 [15:58<39:40, 28.68s/it]

36054


 27%|██▋       | 31/113 [16:25<38:39, 28.29s/it]

25224


 28%|██▊       | 32/113 [16:53<37:53, 28.06s/it]

33849


 29%|██▉       | 33/113 [17:19<36:44, 27.55s/it]

27652


 30%|███       | 34/113 [17:47<36:15, 27.53s/it]

33964


 31%|███       | 35/113 [18:14<35:36, 27.39s/it]

28564


 32%|███▏      | 36/113 [18:40<34:38, 27.00s/it]

22203


 33%|███▎      | 37/113 [19:08<34:34, 27.29s/it]

24646


 34%|███▎      | 38/113 [19:34<33:44, 26.99s/it]

30222


 35%|███▍      | 39/113 [20:02<33:40, 27.30s/it]

44971


 35%|███▌      | 40/113 [20:28<32:32, 26.75s/it]

21591


 36%|███▋      | 41/113 [20:52<31:23, 26.15s/it]

26329


 37%|███▋      | 42/113 [21:16<30:05, 25.43s/it]

26018


 38%|███▊      | 43/113 [21:40<29:09, 25.00s/it]

26231


 39%|███▉      | 44/113 [22:04<28:22, 24.67s/it]

30734


 40%|███▉      | 45/113 [22:27<27:30, 24.27s/it]

24772


 41%|████      | 46/113 [22:49<26:17, 23.55s/it]

19458


 42%|████▏     | 47/113 [23:12<25:45, 23.42s/it]

25646


 42%|████▏     | 48/113 [23:35<25:09, 23.23s/it]

23813


 43%|████▎     | 49/113 [23:57<24:30, 22.98s/it]

34946


 44%|████▍     | 50/113 [24:20<23:59, 22.85s/it]

29466


 45%|████▌     | 51/113 [24:41<23:07, 22.38s/it]

20172


 46%|████▌     | 52/113 [25:02<22:14, 21.88s/it]

24677


 47%|████▋     | 53/113 [25:22<21:27, 21.46s/it]

19780


 48%|████▊     | 54/113 [25:42<20:36, 20.95s/it]

16923


 49%|████▊     | 55/113 [26:03<20:18, 21.01s/it]

24674


 50%|████▉     | 56/113 [26:24<19:56, 21.00s/it]

31935


 50%|█████     | 57/113 [26:44<19:20, 20.73s/it]

20644


 51%|█████▏    | 58/113 [27:04<18:40, 20.37s/it]

23380


 52%|█████▏    | 59/113 [27:23<18:02, 20.05s/it]

24045


 53%|█████▎    | 60/113 [27:43<17:38, 19.98s/it]

18588


 54%|█████▍    | 61/113 [28:01<16:48, 19.40s/it]

15035


 55%|█████▍    | 62/113 [28:19<16:08, 18.99s/it]

16224


 56%|█████▌    | 63/113 [28:37<15:37, 18.75s/it]

17004


 57%|█████▋    | 64/113 [28:55<15:05, 18.48s/it]

19710


 58%|█████▊    | 65/113 [29:12<14:26, 18.06s/it]

13449


 58%|█████▊    | 66/113 [29:29<13:51, 17.69s/it]

17625


 59%|█████▉    | 67/113 [29:46<13:25, 17.50s/it]

19013


 60%|██████    | 68/113 [30:03<12:53, 17.19s/it]

19380


 61%|██████    | 69/113 [30:18<12:18, 16.78s/it]

17362


 62%|██████▏   | 70/113 [30:35<12:01, 16.77s/it]

20274


 63%|██████▎   | 71/113 [30:52<11:43, 16.76s/it]

17909


 64%|██████▎   | 72/113 [31:08<11:23, 16.67s/it]

16248


 65%|██████▍   | 73/113 [31:24<10:54, 16.37s/it]

16443


 65%|██████▌   | 74/113 [31:39<10:25, 16.03s/it]

18354


 66%|██████▋   | 75/113 [31:54<09:51, 15.57s/it]

15812


 67%|██████▋   | 76/113 [32:08<09:24, 15.25s/it]

16348


 68%|██████▊   | 77/113 [32:23<08:57, 14.93s/it]

16748


 69%|██████▉   | 78/113 [32:36<08:30, 14.59s/it]

15821


 70%|██████▉   | 79/113 [32:50<08:10, 14.43s/it]

10020


 71%|███████   | 80/113 [33:04<07:44, 14.08s/it]

17250


 72%|███████▏  | 81/113 [33:17<07:20, 13.78s/it]

18093


 73%|███████▎  | 82/113 [33:29<06:55, 13.39s/it]

11771


 73%|███████▎  | 83/113 [33:41<06:28, 12.94s/it]

8306


 74%|███████▍  | 84/113 [33:52<06:01, 12.48s/it]

6904


 75%|███████▌  | 85/113 [34:04<05:43, 12.27s/it]

15026


 76%|███████▌  | 86/113 [34:15<05:21, 11.90s/it]

5811


 77%|███████▋  | 87/113 [34:26<05:03, 11.68s/it]

13955


 78%|███████▊  | 88/113 [34:37<04:45, 11.42s/it]

10924


 79%|███████▉  | 89/113 [34:48<04:26, 11.10s/it]

7835


 80%|███████▉  | 90/113 [34:58<04:07, 10.77s/it]

8266


 81%|████████  | 91/113 [35:07<03:49, 10.45s/it]

8448


 81%|████████▏ | 92/113 [35:17<03:33, 10.16s/it]

9619


 82%|████████▏ | 93/113 [35:26<03:16,  9.84s/it]

8662


 83%|████████▎ | 94/113 [35:35<03:02,  9.61s/it]

9079


 84%|████████▍ | 95/113 [35:44<02:48,  9.35s/it]

5355


 85%|████████▍ | 96/113 [35:52<02:34,  9.06s/it]

6371


 86%|████████▌ | 97/113 [36:00<02:20,  8.75s/it]

6046


 87%|████████▋ | 98/113 [36:08<02:07,  8.47s/it]

3804


 88%|████████▊ | 99/113 [36:16<01:55,  8.27s/it]

7187


 88%|████████▊ | 100/113 [36:23<01:44,  8.03s/it]

4179


 89%|████████▉ | 101/113 [36:31<01:33,  7.81s/it]

4947


 90%|█████████ | 102/113 [36:38<01:23,  7.59s/it]

4600


 91%|█████████ | 103/113 [36:44<01:13,  7.30s/it]

2930


 92%|█████████▏| 104/113 [36:51<01:03,  7.04s/it]

3616


 93%|█████████▎| 105/113 [36:57<00:53,  6.70s/it]

2887


 94%|█████████▍| 106/113 [37:02<00:44,  6.41s/it]

2529


 95%|█████████▍| 107/113 [37:08<00:36,  6.12s/it]

1856


 96%|█████████▌| 108/113 [37:13<00:29,  5.81s/it]

1670


 96%|█████████▋| 109/113 [37:18<00:22,  5.51s/it]

1580


 97%|█████████▋| 110/113 [37:22<00:15,  5.22s/it]

1108


 98%|█████████▊| 111/113 [37:26<00:09,  4.92s/it]

859


 99%|█████████▉| 112/113 [37:30<00:04,  4.63s/it]

218


100%|██████████| 113/113 [37:31<00:00, 19.93s/it]

1





## Print duplicates

In [77]:
duplicates = pd.DataFrame(duplicates)
duplicates.sort_values(by=['id1', 'id2'], inplace=True)
duplicates

Unnamed: 0,id1,id2,type
357497,1,29699,TEMPORAL
357498,1,72192,SEMANTIC
53379,3,86,FULL
53380,3,11168,FULL
357499,3,27623,SEMANTIC
...,...,...,...
2871056,111777,111956,TEMPORAL
2871057,111777,111975,TEMPORAL
290641,111869,112054,FULL
2871058,111956,111975,TEMPORAL


In [78]:
duplicates.groupby('type').count()

Unnamed: 0_level_0,id1,id2
type,Unnamed: 1_level_1,Unnamed: 2_level_1
FULL,357497,357497
PARTIAL,1238,1238
SEMANTIC,212506,212506
TEMPORAL,2299819,2299819


In [81]:
duplicates.to_csv('duplicates.csv', index=False, header=False)

In [82]:
os.system(f"mc cp duplicates.csv s3/apalazzolo/Deduplication/duplicates.csv")

`/home/onyxia/work/deduplication/duplicates.csv` -> `s3/apalazzolo/Deduplication/duplicates.csv`
Total: 0 B, Transferred: 56.15 MiB, Speed: 144.77 MiB/s


0