# OJA Deduplication Challenge

## Imports

### Packages

In [1]:
import os
import warnings

import pandas as pd
from tqdm import tqdm

from Levenshtein import distance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
warnings.filterwarnings('ignore')

### Data

In [3]:
# Download the raw dataset from the S3 bucket with the `mc` command-line client.
status = os.system("mc cp s3/apalazzolo/Deduplication/wi_dataset.csv wi_dataset.csv")
# Stop here on failure instead of letting a later cell read a missing or stale file.
if status != 0:
    raise RuntimeError(f"'mc cp' download failed (status {status})")
status

`s3/apalazzolo/Deduplication/wi_dataset.csv` -> `wi_dataset.csv`
Total: 0 B, Transferred: 203.23 MiB, Speed: 156.32 MiB/s


0

In [4]:
# Parse the downloaded CSV with '\n' set explicitly as the line terminator.
original_data = pd.read_csv(filepath_or_buffer='wi_dataset.csv', lineterminator='\n')

In [5]:
# Optional cap on the number of ads for fast iteration; None keeps the full dataset.
N_ADS_SAMPLE = None

# Work on an independent copy: the cleaning and sorting cells modify `data` in place,
# and a copy keeps `original_data` intact and avoids writing into a view of it.
data = (original_data if N_ADS_SAMPLE is None else original_data.head(N_ADS_SAMPLE)).copy()

In [6]:
# Number of ads under study (row count of `data`).
n_ads = data.shape[0]
n_ads

10000

In [7]:
# Preview the first three ads to check the columns and their content.
data.head(3)

Unnamed: 0,id,title,description,location,country_id,company_name,retrieval_date
0,1,Traineeship Pensioen & Leven in Utrecht,We sturen je door naar je toekomstige opdracht...,Utrecht,NL,,2021-01-19
1,2,DEPOSITARY OFFICER (M/F),DEPOSITARY OFFICER (M/F) DO Recruitment Adviso...,,FR,DO Recruitment Advisors,2021-09-30
2,3,Cautam colegi manipulanti marfa,Descriere Angajam manipulanti marfa din Pitest...,Pitesti,RO,,2021-06-18


## Data preprocessing

In [8]:
# Basic cleaning: trim surrounding whitespace and lower-case the free-text columns.
text_columns = ['title', 'description', 'location', 'company_name']
for column in text_columns:
    data[column] = data[column].str.strip().str.lower()

# Replace every missing value (in all columns) with an empty string.
data.fillna("", inplace=True)

## Naive deduplication

In [9]:
# Accumulator for detected duplicate pairs; later cells append dicts
# with the keys 'id1', 'id2' and 'type'.
duplicates = list()

### Add the full duplicates

In [10]:
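# Far too slow: this brute-force scan compares every pair of ads with per-cell .iloc lookups.
# Caution: it derives ids from row positions (i+1, j+1), so it breaks as soon as the table is sorted!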

# for i in tqdm(range(n_ads)):
#     for j in range(i+1,n_ads):
#         if (data.iloc[i, 1] == data.iloc[j, 1]) and (data.iloc[i, 2] == data.iloc[j, 2]):
#             duplicates.append({'id1': i+1, 'id2': j+1, 'type': 'FULL'})

In [11]:
# Sort so that ads sharing title and description are contiguous, ordered by id within each group.
data.sort_values(by=['title', 'description', 'id'], inplace=True)

# Only rows whose (title, description) occurs more than once can form a FULL pair.
key_columns = ['title', 'description']
repeated_ads = data[data.duplicated(subset=key_columns, keep=False)]

# Emit every pair inside each group of identical ads. Grouping replaces the former
# two-pointer scan, which stopped as soon as its second pointer reached the end of
# the table and so missed pairs among the last rows.
for _, group_ids in repeated_ads.groupby(key_columns, sort=False)['id']:
    ids = group_ids.tolist()  # ascending by id thanks to the sort above
    for position, first_id in enumerate(ids):
        for second_id in ids[position + 1:]:
            duplicates.append({'id1': first_id, 'id2': second_id, 'type': 'FULL'})

0


In [12]:
# Number of duplicate pairs collected so far.
len(duplicates)

2753

In [13]:
# Inspect the first collected pair to check the record layout (id1, id2, type).
duplicates[0]

{'id1': 236, 'id2': 1598, 'type': 'FULL'}

### Add the semantic, partial and temporal duplicates

In [14]:
# Put the ads back in ascending id order after the title/description sort.
data.sort_values(by='id', inplace=True)

In [15]:
# Build one space-separated document per ad from its four text fields.
text_fields = ['title', 'description', 'location', 'company_name']
combined_text = data[text_fields[0]]
for field in text_fields[1:]:
    combined_text = combined_text + ' ' + data[field]
data['text'] = combined_text

# Vectorize the documents with TF-IDF
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(data['text'])

In [16]:
# Compare the ads with cosine similarity: with Y=None every TF-IDF row is compared
# against every other row, and the result is returned as a dense array.
# NOTE(review): that array has one entry per pair of ads -- check memory before a full-dataset run.
similarity_matrix = cosine_similarity(X=tfidf, Y=None, dense_output=True)

In [17]:
# Identify pairs of similar ads and label each pair TEMPORAL, SEMANTIC or PARTIAL.

SIMILARITY_THRESHOLD = 0.8  # a pair is kept only above this cosine similarity
LENGTH_TOLERANCE = 0.2      # relative description-length gap below which a pair is SEMANTIC

# Pull the needed columns out once; per-cell .iloc lookups inside the loop are slow.
ad_ids = data['id'].tolist()
titles = data['title'].tolist()
descriptions = data['description'].tolist()
retrieval_dates = data['retrieval_date'].tolist()

for i in tqdm(range(similarity_matrix.shape[0])):
    # Vectorized scan of row i, restricted to j > i so each pair is visited once.
    candidates = (similarity_matrix[i, i + 1:] > SIMILARITY_THRESHOLD).nonzero()[0] + (i + 1)
    for j in candidates:
        # Ads with identical title and description are skipped here.
        if titles[i] == titles[j] and descriptions[i] == descriptions[j]:
            continue
        if retrieval_dates[i] != retrieval_dates[j]:
            pair_type = 'TEMPORAL'
        else:
            len_i, len_j = len(descriptions[i]), len(descriptions[j])
            longest = max(len_i, len_j)
            # Two empty descriptions have no length gap; this also avoids dividing by zero.
            length_gap = abs(len_i - len_j) / longest if longest else 0.0
            pair_type = 'SEMANTIC' if length_gap < LENGTH_TOLERANCE else 'PARTIAL'
        duplicates.append({'id1': ad_ids[i], 'id2': ad_ids[j], 'type': pair_type})

100%|██████████| 10000/10000 [00:16<00:00, 601.92it/s]


## Print and export duplicates

In [21]:
# Tabulate the collected pairs. Naming the columns keeps the sort below working
# even when no duplicate was found (an empty list would yield a column-less frame).
duplicates = pd.DataFrame(duplicates, columns=['id1', 'id2', 'type'])
duplicates = duplicates.sort_values(by=['id1', 'id2'])
duplicates

Unnamed: 0,id1,id2,type
445,3,86,FULL
469,5,1443,FULL
2753,5,6352,SEMANTIC
1887,9,2949,FULL
1888,9,3675,FULL
...,...,...,...
31213,9874,9987,TEMPORAL
31214,9891,9919,TEMPORAL
31215,9950,9958,TEMPORAL
31216,9950,9987,SEMANTIC


In [19]:
# Write the pairs to a local CSV file, leaving out the DataFrame index.
duplicates.to_csv(path_or_buf='duplicates.csv', index=False)

In [20]:
# Upload the result file to the S3 bucket with the `mc` command-line client.
status = os.system("mc cp duplicates.csv s3/apalazzolo/Deduplication/duplicates.csv")
# Fail loudly if the copy did not succeed, so a missing upload is not overlooked.
if status != 0:
    raise RuntimeError(f"'mc cp' upload failed (status {status})")
status

`/home/onyxia/work/deduplication/duplicates.csv` -> `s3/apalazzolo/Deduplication/duplicates.csv`
Total: 0 B, Transferred: 561.82 KiB, Speed: 4.57 MiB/s


0