# OJA Deduplication Challenge

## Imports

### Packages

In [2]:
import os
import warnings

import pandas as pd
import re
import string
from unidecode import unidecode
from tqdm import tqdm

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from transformers import BertTokenizer, BertModel
import torch

from sklearn.metrics.pairwise import cosine_similarity
# from Levenshtein import distance

[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

### Data

In [4]:
# Import from s3: copy the dataset into the working directory with the `mc` client.
# os.system returns the command's exit status; stop immediately on a non-zero
# status so a failed download is not mistaken for a (possibly stale) local file.
download_status = os.system("mc cp s3/apalazzolo/Deduplication/wi_dataset.csv wi_dataset.csv")
if download_status != 0:
    raise RuntimeError(f"mc cp (download) failed with exit status {download_status}")
download_status

`s3/apalazzolo/Deduplication/wi_dataset.csv` -> `wi_dataset.csv`
Total: 0 B, Transferred: 203.23 MiB, Speed: 181.20 MiB/s


0

In [5]:
# Load the downloaded CSV; only '\n' is treated as the row terminator.
dataset_path = 'wi_dataset.csv'
data = pd.read_csv(dataset_path, lineterminator='\n')

In [6]:
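# Optional: work with a smaller extract while prototyping.
# Currently disabled, so every later cell runs on the full dataset.
# NOTE: `original_data` is not defined in this notebook; use `data.head(10000)` if re-enabled.

# data = original_data.head(10000)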

In [7]:
# Number of rows (ads) in `data`; shown as the cell output.
n_ads = len(data)
n_ads

112056

In [8]:
# Preview the first rows of the dataset as loaded.
data.head()

Unnamed: 0,id,title,description,location,country_id,company_name,retrieval_date
0,1,Traineeship Pensioen & Leven in Utrecht,We sturen je door naar je toekomstige opdracht...,Utrecht,NL,,2021-01-19
1,2,DEPOSITARY OFFICER (M/F),DEPOSITARY OFFICER (M/F) DO Recruitment Adviso...,,FR,DO Recruitment Advisors,2021-09-30
2,3,Cautam colegi manipulanti marfa,Descriere Angajam manipulanti marfa din Pitest...,Pitesti,RO,,2021-06-18
3,4,Home Care Assistant,Are you looking for a role as a Care Assistant...,,UK,Cera Care,2021-02-09
4,5,Chef de chantier espaces verts h/f,LE POSTE Vous êtes passionné par les Espaces V...,,FR,ALPHEA CONSEIL,2021-06-08


In [9]:
# Preview the last rows of the dataset as loaded.
data.tail()

Unnamed: 0,id,title,description,location,country_id,company_name,retrieval_date
112051,112052,OPERATORE IMPIANTI AUTOMATICI,OPERATORE IMPIANTI AUTOMATICI SBE VARVIT - Mon...,"Monfalcone, Friuli-Venezia Giulia",IT,S.B.E. VARVIT,2021-01-31
112052,112053,Tarvittaessa töihin kutsuttavia siivoojia,Etsimme useampaa MATALAPAINEPESIJÄÄ tiimiimme ...,,FI,ISS Palvelut Oy,2021-04-01
112053,112054,Timanställning - Lager - Start Augusti,Om Jobbet: Just nu söker vi efter nya medarbet...,Jönköping,SE,Lyreco Sverige AB,2021-08-03
112054,112055,Opérateur de saisie bancaire H/F,Venez nous découvrir sur : https://www.tessi.e...,Fontenay-sous-Bois (94),FR,tessi,2021-05-31
112055,112056,Senior Java developer,Do you want to know what is means to be BOLD? ...,,PT,BOLD International,2021-01-30


## Data preprocessing

In [10]:
# Basic cleaning

# Replace missing values with empty strings so the .str accessors below apply
# to every cell (note: this fills every column, not only the text ones).
data.fillna("", inplace=True)

text_columns = ['title', 'description', 'location', 'country_id', 'company_name']


def clean_text_column(column):
    """Normalize one text column of the dataset.

    Steps, in order: replace every non-word character with a space, collapse
    runs of spaces, transliterate to ASCII with unidecode, strip leading and
    trailing whitespace, and lowercase.

    Parameters
    ----------
    column : pandas.Series of str

    Returns
    -------
    pandas.Series of str with the same index.
    """
    return (
        column
        # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would leave punctuation untouched.
        .str.replace(r'\W', ' ', regex=True)
        .apply(lambda value: unidecode(re.sub(' +', ' ', value)))
        .str.strip()
        .str.lower()
    )


data[text_columns] = data[text_columns].apply(clean_text_column)

In [11]:
# Preview the first rows after cleaning.
data.head()

Unnamed: 0,id,title,description,location,country_id,company_name,retrieval_date
0,1,traineeship pensioen leven in utrecht,we sturen je door naar je toekomstige opdracht...,utrecht,nl,,2021-01-19
1,2,depositary officer m f,depositary officer m f do recruitment advisors...,,fr,do recruitment advisors,2021-09-30
2,3,cautam colegi manipulanti marfa,descriere angajam manipulanti marfa din pitest...,pitesti,ro,,2021-06-18
3,4,home care assistant,are you looking for a role as a care assistant...,,uk,cera care,2021-02-09
4,5,chef de chantier espaces verts h f,le poste vous etes passionne par les espaces v...,,fr,alphea conseil,2021-06-08


In [12]:
# Preview the last rows after cleaning.
data.tail()

Unnamed: 0,id,title,description,location,country_id,company_name,retrieval_date
112051,112052,operatore impianti automatici,operatore impianti automatici sbe varvit monfa...,monfalcone friuli venezia giulia,it,s b e varvit,2021-01-31
112052,112053,tarvittaessa toihin kutsuttavia siivoojia,etsimme useampaa matalapainepesijaa tiimiimme ...,,fi,iss palvelut oy,2021-04-01
112053,112054,timanstallning lager start augusti,om jobbet just nu soker vi efter nya medarbeta...,jonkoping,se,lyreco sverige ab,2021-08-03
112054,112055,operateur de saisie bancaire h f,venez nous decouvrir sur https www tessi eu fr...,fontenay sous bois 94,fr,tessi,2021-05-31
112055,112056,senior java developer,do you want to know what is means to be bold b...,,pt,bold international,2021-01-30


## Naive deduplication

In [13]:
# Accumulator for detected pairs; later cells append dicts with keys 'id1', 'id2' and 'type'.
duplicates = []

### Add the full duplicates

In [14]:
# Order the ads so that rows sharing the same title and description are
# contiguous; inside such a run the rows are ordered by id.
full_duplicate_sort_keys = ['title', 'description', 'id', 'company_name', 'location']
data.sort_values(by=full_duplicate_sort_keys, inplace=True)

In [15]:
# Single-cell .iloc lookups are slow and silently depend on the column order,
# so pull the needed columns out once as arrays addressed by name.
# The scan expects `data` to be sorted so that rows with the same title and
# description are contiguous.
ad_ids = data['id'].to_numpy()
titles = data['title'].to_numpy()
descriptions = data['description'].to_numpy()
locations = data['location'].to_numpy()
company_names = data['company_name'].to_numpy()
# Use the frame's own length so the loop stays valid if `data` was subset.
n_rows = len(data)


def _fields_compatible(left, right):
    """Return True when two field values are equal or at least one is empty."""
    return left == right or len(left) == 0 or len(right) == 0


for i in tqdm(range(n_rows - 1)):
    j = i + 1
    # Walk forward through the run of rows sharing row i's title and description.
    while j < n_rows and titles[j] == titles[i] and descriptions[j] == descriptions[i]:
        # A pair is a FULL duplicate when company and location do not conflict.
        if _fields_compatible(company_names[i], company_names[j]) and _fields_compatible(locations[i], locations[j]):
            duplicates.append({'id1': ad_ids[i], 'id2': ad_ids[j], 'type': 'FULL'})
        j += 1

len(duplicates)

100%|██████████| 112055/112055 [01:09<00:00, 1603.96it/s]


357595

In [16]:
# Number of pairs collected so far.
len(duplicates)

357595

In [17]:
# Inspect the first collected pair.
duplicates[0]

{'id1': 16097, 'id2': 23753, 'type': 'FULL'}

### Add the semantic duplicates

In [18]:
# Restore id order; the positional (.iloc) lookups in the later cells follow this row order.
data.sort_values(by=['id'], inplace=True)

#### Lemmatization

In [19]:
# NLTK stopword lists for each language below, concatenated in this order.
stopword_languages = [
    'danish', 'dutch', 'english', 'finnish', 'french', 'german',
    'hungarian', 'portuguese', 'romanian', 'russian', 'spanish',
]
final_stopwords_list = [
    word
    for language in stopword_languages
    for word in stopwords.words(language)
]

In [None]:
# Concatenate every descriptive field into a single string per ad.
data['text'] = data['title'] + ' ' + data['description'] + ' ' + data['location'] + ' ' + data['country_id'] + ' ' + data['company_name']

# Lemmatize the words and drop stopwords.
# The filter has to test each *word*: testing the whole text `x` against the
# stopword list never removes anything from a multi-word text.
# A set gives constant-time membership tests instead of scanning the list.
# NOTE(review): the NLTK stopwords keep their diacritics while the text was
# ASCII-folded during cleaning, so accented stopwords may still not match — confirm.
stopword_set = set(final_stopwords_list)
lem = WordNetLemmatizer()
data['filtered_text'] = data['text'].apply(
    lambda x: ' '.join(lem.lemmatize(word) for word in x.split() if word not in stopword_set)
)

#### Tokenizer and model

In [None]:
# Load the BERT tokenizer and model from the same pretrained checkpoint.
bert_checkpoint = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

In [None]:
# Encode texts with BERT
def encode_text(text, max_length=512):
    """Return the embedding of the first token of `text` as a NumPy vector.

    Parameters
    ----------
    text : str
        Text to encode with the module-level `tokenizer` and `model`.
    max_length : int, default 512
        Maximum number of tokens (special tokens included). Longer inputs are
        truncated; without truncation, a text longer than the model's position
        limit makes the forward pass fail.

    Returns
    -------
    numpy.ndarray
        The last hidden state at position 0 of the single encoded sequence.
    """
    token_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
    )
    # Add the batch dimension expected by the model: shape (1, n_tokens).
    input_ids = torch.tensor(token_ids).unsqueeze(0)
    # Inference only: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        outputs = model(input_ids)
    return outputs.last_hidden_state[0][0].detach().numpy()

In [None]:
# Compute one embedding per ad by calling encode_text on each row's filtered text.
data['encoded_text'] = data['filtered_text'].apply(encode_text)

In [None]:
def cosine_similarity_by_chunk(start, end, embeddings=None):
    """Cosine similarity between rows [start, end) and every row.

    Parameters
    ----------
    start, end : int
        Half-open row range of the chunk; `end` is clamped to the number of rows.
    embeddings : 2-D array, optional
        One feature vector per row. When omitted, the vectors stored in
        data['encoded_text'] are stacked into a matrix, in the current row
        order of `data`.

    Returns
    -------
    Array of shape (end - start, n_rows) as returned by
    sklearn's cosine_similarity.
    """
    if embeddings is None:
        # Local import: numpy is not imported at file level.
        import numpy as np
        # The previous body referenced `tfidf`, which is never defined in this
        # notebook; the features available here are the per-ad BERT vectors.
        embeddings = np.vstack(data['encoded_text'].tolist())
    end = min(end, embeddings.shape[0])
    return cosine_similarity(X=embeddings[start:end], Y=embeddings)

In [None]:
chunk_size = 10000
similarity_threshold = 0.9    # minimum cosine similarity for a pair to be recorded
length_gap_threshold = 0.1    # relative description-length gap above which a pair is PARTIAL

# Extract the columns needed for classification once, by name, instead of
# doing positional single-cell .iloc lookups inside the pair loop.
ad_ids = data['id'].to_numpy()
description_lengths = data['description'].str.len().to_numpy()
retrieval_dates = data['retrieval_date'].to_numpy()
n_rows = len(data)

for chunk_start in range(0, n_rows, chunk_size):
    similarity_matrix_chunk = cosine_similarity_by_chunk(chunk_start, chunk_start+chunk_size)
    count_before = len(duplicates)
    # Iterate only over the rows present in this chunk (the last one may be shorter).
    for offset in tqdm(range(similarity_matrix_chunk.shape[0])):
        i = chunk_start + offset
        first_candidate = i + 1
        # Only columns j > i are examined so each unordered pair is reported
        # once. The threshold test is vectorized over the row, so the Python
        # loop below runs only over actual matches rather than every pair.
        candidate_scores = similarity_matrix_chunk[offset, first_candidate:]
        matching_rows = (candidate_scores > similarity_threshold).nonzero()[0] + first_candidate
        for j in matching_rows:
            shorter = min(description_lengths[i], description_lengths[j])
            length_gap = abs(description_lengths[i] - description_lengths[j]) / (1 + shorter)
            if length_gap > length_gap_threshold:
                # Descriptions differ noticeably in length.
                pair_type = 'PARTIAL'
            elif retrieval_dates[i] != retrieval_dates[j]:
                # Similar length but retrieved on different dates.
                pair_type = 'TEMPORAL'
            else:
                pair_type = 'SEMANTIC'
            duplicates.append({'id1': ad_ids[i], 'id2': ad_ids[j], 'type': pair_type})
    # Number of pairs added by this chunk.
    print(len(duplicates) - count_before)

## Print duplicates

In [None]:
# Build the result table. Sorting by type before dropping repeated
# (id1, id2) pairs decides which type label is kept for a pair (the first
# one in type order); the final sort orders the table by the two ids.
duplicates = (
    pd.DataFrame(duplicates)
    .sort_values(by=['type'])
    .drop_duplicates(subset=['id1', 'id2'])
    .sort_values(by=['id1', 'id2'])
)
duplicates

In [None]:
# Sanity check: number of rows that still repeat an (id1, id2) pair.
len(duplicates) - len(duplicates.drop_duplicates(subset=['id1', 'id2']))

In [None]:
# Sanity check: rows where id1 is greater than id2.
duplicates[duplicates['id1'] > duplicates['id2']]

In [None]:
# Row counts per duplicate type.
duplicates.groupby('type').count()

In [None]:
# Write the pairs to duplicates.csv without the index and without a header row.
duplicates.to_csv('duplicates.csv', index=False, header=False)

In [None]:
# Upload the result file to S3 with the `mc` client.
# os.system returns the command's exit status; raise on a non-zero status so a
# failed upload does not go unnoticed.
upload_status = os.system("mc cp duplicates.csv s3/apalazzolo/Deduplication/duplicates_bert.csv")
if upload_status != 0:
    raise RuntimeError(f"mc cp (upload) failed with exit status {upload_status}")
upload_status