# OJA Deduplication Challenge

## Imports

### Packages

In [57]:
import os
import warnings

import pandas as pd
from tqdm import tqdm

from Levenshtein import distance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [52]:
warnings.filterwarnings('ignore')

### Data

In [6]:
# Import from s3
os.system(f"mc cp s3/apalazzolo/Deduplication/wi_dataset.csv wi_dataset.csv")

`s3/apalazzolo/Deduplication/wi_dataset.csv` -> `wi_dataset.csv`
Total: 0 B, Transferred: 203.23 MiB, Speed: 149.45 MiB/s


0

In [73]:
original_data = pd.read_csv('wi_dataset.csv',
                            lineterminator='\n')

In [95]:
# For now let's work with a smaller extract
data = original_data#.head(1000)

In [96]:
n_ads = len(data)
n_ads

10000

In [97]:
data.head(3)

Unnamed: 0,id,title,description,location,country_id,company_name,retrieval_date
0,1,Traineeship Pensioen & Leven in Utrecht,We sturen je door naar je toekomstige opdracht...,Utrecht,NL,,2021-01-19
1,2,DEPOSITARY OFFICER (M/F),DEPOSITARY OFFICER (M/F) DO Recruitment Adviso...,,FR,DO Recruitment Advisors,2021-09-30
2,3,Cautam colegi manipulanti marfa,Descriere Angajam manipulanti marfa din Pitest...,Pitesti,RO,,2021-06-18


## Data preprocessing

In [98]:
# Cleaning basique

data[
    ['title', 'description', 'location', 'company_name']
] = data[
    ['title', 'description', 'location', 'company_name']
].apply(lambda x: x.str.strip().str.lower())

data.fillna("", inplace=True)

## Naive deduplication

In [99]:
duplicates = []

### Add the full duplicates

In [100]:
for i in tqdm(range(n_ads)):
    for j in range(i+1,n_ads):
        if (data.iloc[i, 1] == data.iloc[j, 1]) and (data.iloc[i, 2] == data.iloc[j, 2]):
            duplicates.append({'id1': i+1, 'id2': j+1, 'type': 'FULL'})

len(duplicates)

  7%|▋         | 659/10000 [03:02<43:03,  3.62it/s]


KeyboardInterrupt: 

### Add the semantic duplicates

In [None]:
data['text'] = data['title'] + ' ' + data['description'] + ' ' + data['location'] + ' ' + data['company_name']

# Utiliser TF-IDF pour vectoriser les textes
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(data['text'])

# Utiliser la similarité cosinus pour comparer les textes
similarity_matrix = cosine_similarity(tfidf)

In [None]:
# Identifier les paires d'annonces similaires

for i in tqdm(range(similarity_matrix.shape[0])):
    for j in range(i+1, similarity_matrix.shape[1]):
        if similarity_matrix[i][j] > 0.8 and (
            (data.iloc[i, 1] != data.iloc[j, 1]) or (data.iloc[i, 2] != data.iloc[j, 2])
        ):
            if data.iloc[i, 6] != data.iloc[j, 6]:
                duplicates.append({'id1': i+1, 'id2': j+1, 'type': 'TEMPORAL'})
            else:
                if abs(
                    len(data.iloc[i, 2]) - len(data.iloc[j, 2])
                ) / max(
                    len(data.iloc[i, 2]), len(data.iloc[j, 2])
                ) < 0.2:
                    duplicates.append({'id1': i+1, 'id2': j+1, 'type': 'SEMANTIC'})
                else:
                    duplicates.append({'id1': i+1, 'id2': j+1, 'type': 'PARTIAL'})

## Print duplicates

In [None]:
duplicates = pd.DataFrame(duplicates)
duplicates