# OJA Deduplication Challenge

## Imports

### Packages

In [172]:
import os
import warnings

import pandas as pd
import re
from unidecode import unidecode
from tqdm import tqdm

# from Levenshtein import distance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [173]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation/future-behaviour notices raised by
# the libraries used below — confirm that this is intended.
warnings.filterwarnings('ignore')

### Data

In [None]:
# Import from s3: copy the raw dataset into the working directory with the
# `mc` command-line client. `os.system` returns the command's exit status,
# which the notebook displays as the cell output.
os.system("mc cp s3/apalazzolo/Deduplication/wi_dataset.csv wi_dataset.csv")

`s3/apalazzolo/Deduplication/wi_dataset.csv` -> `wi_dataset.csv`
Total: 0 B, Transferred: 203.23 MiB, Speed: 154.19 MiB/s


0

In [None]:
# Load the dataset that was copied into the working directory.
# Only '\n' is treated as a row terminator.
# NOTE(review): presumably so that stray '\r' characters inside free-text
# fields do not split rows — confirm against the raw file.
data = pd.read_csv('wi_dataset.csv',
                   lineterminator='\n')

In [None]:
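# Optional while prototyping: work on a smaller extract.
# Currently disabled — `original_data` is not defined in this notebook, so the
# CSV would have to be loaded into that name before re-enabling the line below.

# data = original_data.head(10000)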

In [None]:
# Number of ads (rows in the table); reused below as the bound of the
# pairwise comparison loops.
n_ads = len(data)
n_ads

In [None]:
# Preview the first rows to check how the file was parsed.
data.head()

In [None]:
# Preview the last rows as well (e.g. to spot a truncated or malformed end of file).
data.tail()

## Data preprocessing

In [None]:
# Basic cleaning

# Replace missing values with empty strings so the string operations below
# (and the later text concatenation) never meet NaN.
data.fillna("", inplace=True)

text_columns = ['title', 'description', 'location', 'country_id', 'company_name']

# Normalise every text column:
#   1. replace each non-word character with a space -- `regex=True` is passed
#      explicitly because, from pandas 2.0 on, `Series.str.replace` treats the
#      pattern as a literal string by default, which would leave the text
#      untouched;
#   2. collapse runs of spaces, then transliterate to ASCII with `unidecode`;
#   3. strip leading/trailing whitespace and lower-case the result.
data[text_columns] = data[text_columns].apply(
    lambda column: (
        column.str.replace(r'\W', ' ', regex=True)
        .apply(lambda text: unidecode(re.sub(' +', ' ', text)))
        .str.strip()
        .str.lower()
    )
)

## Naive deduplication

In [None]:
# Accumulator for the detected pairs: one {'id1', 'id2', 'type'} record per
# pair of duplicate ads, filled by the cells below.
duplicates = []

### Add the full duplicates

In [None]:
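# Brute-force pairwise scan, kept for reference only: far too slow to run on the full dataset.
# It also derives the ids from row positions (i+1, j+1), so it reports wrong ids once the table is sorted.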

# for i in tqdm(range(n_ads)):
#     for j in range(i+1,n_ads):
#         if (data.iloc[i, 1] == data.iloc[j, 1]) and (data.iloc[i, 2] == data.iloc[j, 2]):
#             duplicates.append({'id1': i+1, 'id2': j+1, 'type': 'FULL'})

In [None]:
from itertools import combinations

# Sort so that, inside each (title, description) group, rows are in ascending
# id order: every emitted pair then satisfies id1 < id2, and groups are
# visited in the same order as a scan of the sorted table.
data.sort_values(by=['title', 'description', 'id'], inplace=True)

# Two ads are FULL duplicates when their cleaned title and description are
# both identical.
# NOTE(review): columns are addressed by name here; this assumes 'id',
# 'title' and 'description' are the columns at positions 0, 1 and 2 that the
# rest of the notebook reads through `.iloc` -- confirm.
#
# Keep only the rows whose (title, description) occurs more than once, then
# emit every pair inside each group. A two-pointer scan that stops as soon as
# its second pointer runs past the last row misses the pairs lying entirely
# inside the final group of the sorted table; grouping reports them too and
# avoids one scalar `.iloc` lookup per comparison.
repeated_ads = data[data.duplicated(subset=['title', 'description'], keep=False)]

for _, group_ids in repeated_ads.groupby(['title', 'description'], sort=True)['id']:
    duplicates.extend(
        {'id1': id1, 'id2': id2, 'type': 'FULL'}
        for id1, id2 in combinations(group_ids.tolist(), 2)
    )

In [None]:
# Number of pairs recorded so far (only FULL duplicates at this point).
len(duplicates)

In [None]:
# Inspect the first recorded pair as a sanity check.
duplicates[0]

### Add the semantic duplicates

In [None]:
# Put the rows back in id order: the TF-IDF matrix built next takes its row
# order from `data`, and the pairwise loop further down reads `data` by
# position (`.iloc`) using the similarity-matrix indices.
data.sort_values(by=['id'], inplace=True)

In [None]:
# One document per ad: the cleaned text fields joined with single spaces,
# in this fixed order.
text_fields = ['title', 'description', 'location', 'country_id', 'company_name']
combined_text = data[text_fields[0]]
for field in text_fields[1:]:
    combined_text = combined_text + ' ' + data[field]
data['text'] = combined_text

# Fit a TF-IDF model on the whole corpus and keep the resulting
# document-term matrix (one row per ad, in the current row order of `data`).
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(data['text'])
tfidf

In [None]:
# Use cosine similarity to compare the texts.
# The full n_ads x n_ads similarity matrix is far too large to compute in one go, so this call stays disabled.

# similarity_matrix = cosine_similarity(tfidf)

In [None]:
# Identify similar ad pairs
# Needs to be done in steps (chunk by chunk) rather than on the full matrix at once

# for i in tqdm(range(similarity_matrix.shape[0])):
#     for j in range(i+1, similarity_matrix.shape[1]):
#         if similarity_matrix[i][j] > 0.8 and (
#             (data.iloc[i, 1] != data.iloc[j, 1]) or (data.iloc[i, 2] != data.iloc[j, 2])
#         ):
#             if data.iloc[i, 6] != data.iloc[j, 6]:
#                 duplicates.append({'id1': data.iloc[i, 0], 'id2': data.iloc[j, 0], 'type': 'TEMPORAL'})
#             else:
#                 if abs(
#                     len(data.iloc[i, 2]) - len(data.iloc[j, 2])
#                 ) / max(
#                     len(data.iloc[i, 2]), len(data.iloc[j, 2])
#                 ) < 0.08:
#                     duplicates.append({'id1': data.iloc[i, 0], 'id2': data.iloc[j, 0], 'type': 'SEMANTIC'})
#                 else:
#                     duplicates.append({'id1': data.iloc[i, 0], 'id2': data.iloc[j, 0], 'type': 'PARTIAL'})

In [None]:
def cosine_similarity_by_chunk(start, end):
    """Return the cosine similarities between a slice of ads and every ad.

    Rows ``start`` (inclusive) to ``end`` (exclusive) of the module-level
    ``tfidf`` matrix are compared against the whole matrix. ``end`` is capped
    at the module-level ``n_ads``.

    Args:
        start: Index of the first row of the slice.
        end: Index one past the last row of the slice.

    Returns:
        The result of ``cosine_similarity``: one row per ad of the slice and
        one column per row of ``tfidf``.
    """
    stop = min(end, n_ads)
    chunk_rows = tfidf[start:stop]
    return cosine_similarity(X=chunk_rows, Y=tfidf)

In [None]:
chunk_size = 8000
similarity_threshold = 0.8  # minimum cosine similarity for a candidate pair
length_tolerance = 0.08     # max relative description-length gap for SEMANTIC

# Extract the compared columns once: scalar `.iloc` lookups inside the pair
# loop are very slow. Positions follow the rest of the notebook
# (0 = id, 1 = title, 2 = description).
ad_ids = data.iloc[:, 0].tolist()
ad_titles = data.iloc[:, 1].tolist()
ad_descriptions = data.iloc[:, 2].tolist()
# NOTE(review): column 6 is presumably the retrieval date, since a mismatch
# is labelled TEMPORAL -- confirm against the dataset schema.
ad_dates = data.iloc[:, 6].tolist()

for chunk_start in tqdm(range(0, n_ads, chunk_size)):
    similarity_matrix_chunk = cosine_similarity_by_chunk(chunk_start, chunk_start+chunk_size)
    compteur_init = len(duplicates)
    # Iterate over the rows actually present: the last chunk may hold fewer
    # than `chunk_size` ads.
    for i in range(similarity_matrix_chunk.shape[0]):
        first = chunk_start + i
        # Only columns to the right of the diagonal are inspected, so each
        # pair is visited once. The threshold is applied on the whole row
        # slice instead of testing every cell in Python.
        hits = (similarity_matrix_chunk[i, first+1:] > similarity_threshold).nonzero()[0]
        for offset in hits.tolist():
            second = first + 1 + offset
            if (
                ad_titles[first] == ad_titles[second]
                and ad_descriptions[first] == ad_descriptions[second]
            ):
                # Same title and description: this pair is a FULL duplicate
                # and is not reported again here.
                continue
            if ad_dates[first] != ad_dates[second]:
                kind = 'TEMPORAL'
            else:
                len_first = len(ad_descriptions[first])
                len_second = len(ad_descriptions[second])
                longest = max(len_first, len_second)
                # Two empty descriptions have no length gap; guarding here
                # avoids a ZeroDivisionError when only the titles differ.
                relative_gap = abs(len_first - len_second) / longest if longest else 0.0
                kind = 'SEMANTIC' if relative_gap < length_tolerance else 'PARTIAL'
            duplicates.append({'id1': ad_ids[first], 'id2': ad_ids[second], 'type': kind})
    compteur_end = len(duplicates)
    # Number of pairs contributed by this chunk.
    print(compteur_end-compteur_init)

## Print duplicates

In [None]:
# Turn the collected pair records into a table. The columns are named
# explicitly so the frame has 'id1', 'id2' and 'type' even when no pair was
# found; otherwise sorting an empty, column-less frame raises KeyError.
duplicates = pd.DataFrame(duplicates, columns=['id1', 'id2', 'type'])
duplicates.sort_values(by=['id1', 'id2'], inplace=True)

duplicates

In [None]:
# Number of rows whose (id1, id2) pair was already reported by an earlier row.
# The columns are named 'id1' and 'id2' (no underscore), matching the keys of
# the records appended above.
len(duplicates) - len(duplicates.drop_duplicates(subset=['id1', 'id2']))

In [None]:
# Number of reported pairs per duplicate type.
duplicates.groupby('type').count()

In [None]:
# Write the pairs as bare rows: no index column and no header line.
# NOTE(review): presumably the format expected for submission — confirm.
duplicates.to_csv('duplicates.csv', index=False, header=False)

In [None]:
# Copy the result file back to the bucket with the `mc` command-line client.
os.system("mc cp duplicates.csv s3/apalazzolo/Deduplication/duplicates.csv")