# OJA Deduplication Challenge

## Imports

### Packages

In [None]:
import os
import warnings

import pandas as pd
import numpy as np
import re
import string
from unidecode import unidecode
from tqdm import tqdm
tqdm.pandas()

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from transformers import BertTokenizer, BertModel
import torch

from sklearn.metrics.pairwise import cosine_similarity
# from Levenshtein import distance

[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider restricting the filter to specific categories.
warnings.filterwarnings('ignore')

### Data

In [None]:
# Copy the dataset from the S3 bucket into the working directory with the
# `mc` command-line client.
download_command = "mc cp s3/apalazzolo/Deduplication/wi_dataset.csv wi_dataset.csv"
os.system(download_command)

In [None]:
# Load the ads; only '\n' is treated as the end of a record.
csv_path = 'wi_dataset.csv'
data = pd.read_csv(csv_path, lineterminator='\n')

In [None]:
# Optional: uncomment the line below to work with a smaller extract while iterating

# data = data.head(1000)

In [None]:
# Number of ads, captured once; later cells use it as a loop bound.
n_ads = len(data)
n_ads

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
# Preview the last rows of the dataset.
data.tail()

## Data preprocessing

In [None]:
# Basic cleaning: replace every missing value with an empty string, in place.
data.fillna(value="", inplace=True)

## Naive deduplication

In [None]:
# Accumulates one {'id1', 'id2', 'type'} record per detected duplicate pair.
duplicates = []

### Add the full duplicates

In [None]:
# Order the ads so that rows sharing a title and a description end up adjacent.
full_duplicate_sort_keys = ['title', 'description', 'id', 'company_name', 'location']
data.sort_values(by=full_duplicate_sort_keys, inplace=True)

In [None]:
# Detect FULL duplicates: the frame has just been sorted, so rows whose
# columns 1 and 2 are both equal sit next to each other and can be found by
# scanning forward from each row.
# NOTE(review): columns 0/1/2 are presumably id/title/description — confirm
# against the CSV schema.
# NOTE: a stricter variant that additionally required columns 5 and 3 to match
# (or be empty) was previously disabled here.
#
# The columns are pulled into arrays once so that the loop does not pay for a
# `.iloc` scalar lookup on every comparison.
sorted_ids = data.iloc[:, 0].to_numpy()
sorted_titles = data.iloc[:, 1].to_numpy()
sorted_descriptions = data.iloc[:, 2].to_numpy()

for i in tqdm(range(n_ads-1)):
    j = i + 1
    # Every following row of the same run forms a pair with row i.
    while (
        j < n_ads
        and sorted_titles[j] == sorted_titles[i]
        and sorted_descriptions[j] == sorted_descriptions[i]
    ):
        duplicates.append({'id1': sorted_ids[i], 'id2': sorted_ids[j], 'type': 'FULL'})
        j += 1

len(duplicates)

In [None]:
# Number of duplicate pairs recorded so far.
len(duplicates)

In [None]:
# Inspect the first recorded pair (raises IndexError if none was found).
duplicates[0]

### Add the semantic duplicates

In [None]:
# Normalise the text columns used for semantic matching: replace every
# non-word character with a space, collapse runs of spaces, transliterate to
# ASCII, trim and lowercase.
text_columns = ['title', 'description', 'location', 'country_id', 'company_name']


def _normalize_column(column):
    """Return the string Series ``column`` with punctuation removed and text normalised."""
    # regex=True must be explicit: since pandas 2.0 `str.replace` treats the
    # pattern as a literal string by default, so r'\W' would never match and
    # punctuation would silently be left in place.
    without_punctuation = column.str.replace(r'\W', ' ', regex=True)
    ascii_text = without_punctuation.apply(
        lambda text: unidecode(re.sub(' +', ' ', text))
    )
    return ascii_text.str.strip().str.lower()


data[text_columns] = data[text_columns].progress_apply(_normalize_column)

In [None]:
# Restore id order: the embeddings computed below follow this row order and
# are later matched back to rows by position.
data.sort_values(by=['id'], inplace=True)

#### Lemmatization

In [None]:
# Merge the NLTK stop-word lists of every language handled, in a fixed order.
stopword_languages = [
    'danish',
    'dutch',
    'english',
    'finnish',
    'french',
    'german',
    'hungarian',
    'portuguese',
    'romanian',
    'russian',
    'spanish',
]
final_stopwords_list = [
    word
    for language in stopword_languages
    for word in stopwords.words(language)
]

In [None]:
# Concatenate the descriptive fields into a single text per ad.
data['text'] = data['title'] + ' ' + data['location'] + ' ' + data['country_id'] + ' ' + data['company_name'] + ' ' + data['description']

# Lemmatise the words and drop the stop words.
# A set gives constant-time membership tests; scanning the merged
# multi-language list for every single word is needlessly slow.
stopword_set = set(final_stopwords_list)
lem = WordNetLemmatizer()
data['filtered_text'] = data['text'].progress_apply(
    lambda text: ' '.join(
        [lem.lemmatize(word) for word in text.split() if word not in stopword_set]
    )
)

#### Tokenizer and model

In [None]:
# Load the BERT tokenizer and model from the same pretrained checkpoint.
bert_checkpoint = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

In [None]:
# Encode texts with BERT
def encode_text(text):
    """Return the BERT embedding of the first token of ``text``.

    The text is tokenised with the module-level ``tokenizer`` (special tokens
    added, truncation enabled) and fed to the module-level ``model``.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array holding the last hidden state at the first position of
        the single sequence in the batch.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=True, truncation=True)
    input_ids = torch.tensor(token_ids).unsqueeze(0)  # add the batch dimension
    # Inference only: disabling autograd avoids building a graph for every
    # ad, which costs memory and time without changing the returned values.
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_state = outputs.last_hidden_state
    return last_hidden_state[0][0].detach().numpy()

In [None]:
bert = data['filtered_text'].progress_apply(encode_text)

In [None]:
# Stack the per-ad embeddings into one 2-D array (one row per ad).
# Boxing every component into a Python list multiplies memory use and forces
# a fresh list-to-array conversion on every similarity call; a single ndarray
# supports the same slicing and is accepted by `cosine_similarity` as is.
matrix_bert = np.stack(bert.to_list())

In [None]:
def cosine_similarity_by_chunk(start, end):
    """Return the cosine similarities between rows ``start:end`` and every row.

    ``end`` is capped at ``n_ads`` before slicing ``matrix_bert``.
    """
    stop = min(end, n_ads)
    chunk_rows = matrix_bert[start:stop]
    return cosine_similarity(X=chunk_rows, Y=matrix_bert)

In [None]:
# Flag near-duplicate pairs from the BERT embeddings, one chunk of rows at a
# time so that only a (chunk_size x n_ads) similarity matrix is held in memory.
chunk_size = 10000
similarity_threshold = 0.996  # cosine similarity above which two ads are paired
length_gap_threshold = 0.1    # relative length gap above which a pair is PARTIAL

# Pull the columns used below into arrays once instead of doing a `.iloc`
# scalar lookup for every candidate pair.
# NOTE(review): columns 0/2/6 are presumably id/description/retrieval date —
# confirm against the CSV schema.
ad_ids = data.iloc[:, 0].to_numpy()
text_lengths = data.iloc[:, 2].map(len).to_numpy()
ad_dates = data.iloc[:, 6].to_numpy()

for chunk_start in range(0, n_ads, chunk_size):
    similarity_matrix_chunk = np.asarray(
        cosine_similarity_by_chunk(chunk_start, chunk_start + chunk_size)
    )
    count_before = len(duplicates)
    # Iterate over the rows actually present in this chunk (the last chunk
    # may be shorter than chunk_size).
    for i in tqdm(range(similarity_matrix_chunk.shape[0])):
        row = chunk_start + i
        first_candidate = row + 1  # only pair each ad with the ads after it
        # Vectorised threshold test; indices come back in ascending order,
        # so pairs are recorded in the same order as a plain scan.
        matches = np.flatnonzero(
            similarity_matrix_chunk[i, first_candidate:] > similarity_threshold
        )
        for j in matches + first_candidate:
            shorter = min(text_lengths[row], text_lengths[j])
            length_gap = abs(text_lengths[row] - text_lengths[j]) / (1 + shorter)
            if length_gap > length_gap_threshold:
                pair_type = 'PARTIAL'
            elif ad_dates[row] != ad_dates[j]:
                pair_type = 'TEMPORAL'
            else:
                pair_type = 'SEMANTIC'
            duplicates.append({'id1': ad_ids[row], 'id2': ad_ids[j], 'type': pair_type})
    # Number of pairs added by this chunk.
    print(len(duplicates) - count_before)

## Print duplicates

In [None]:
# Turn the accumulated pairs into a frame. Naming the columns explicitly keeps
# the steps below working even when no duplicate pair was recorded (an empty
# list would otherwise yield a frame without a 'type' column).
duplicates = pd.DataFrame(duplicates, columns=['id1', 'id2', 'type'])
# Sort by type first so that, when a pair was recorded under several labels,
# the alphabetically first label is the one kept by drop_duplicates.
duplicates.sort_values(by=['type'], inplace=True)
duplicates.drop_duplicates(subset=['id1', 'id2'], inplace=True)
duplicates.sort_values(by=['id1', 'id2'], inplace=True)
duplicates

In [None]:
# Sanity check: number of rows that repeat an (id1, id2) pair.
len(duplicates) - len(duplicates.drop_duplicates(subset=['id1', 'id2']))

In [None]:
# Sanity check: list the pairs whose id1 is greater than id2.
duplicates[duplicates['id1'] > duplicates['id2']]

In [None]:
# Number of pairs per duplicate type.
duplicates.groupby('type').count()

In [None]:
# Write the pairs to disk without the index and without a header row.
output_path = 'duplicates.csv'
duplicates.to_csv(output_path, index=False, header=False)

In [None]:
# Copy the result file to the S3 bucket with the `mc` command-line client.
upload_command = "mc cp duplicates.csv s3/apalazzolo/Deduplication/duplicates_bert.csv"
os.system(upload_command)