In [259]:
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.cluster import DBSCAN
import re
from functools import reduce
from collections import Counter
import numpy as np
import pickle as pkl
import preprocess_funcs
from preprocess_funcs import run_preprocess, text_edit, do_replacements
import time

In [260]:
data_path = "data/df_preprocessed.pkl"

In [261]:
df_main = pd.read_pickle(data_path)

In [295]:
df_small = df_main.iloc[:2]
fun = lambda word: word + " hahaha"
columns_to_process = ["Mahalle", "Adres"]

df_small[columns_to_process] = df_small[columns_to_process].apply(fun)
display(df_small)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small[columns_to_process] = df_small[columns_to_process].apply(fun)


Unnamed: 0,İl,İlçe,Mahalle,Adres,Bulvar/Cadde/Sokak/Yol/Yanyol,Bina Adı,Dış Kapı/ Blok/Apartman No,Kat,İç Kapı,Ad-Soyad,Kaynak,Telefon,Oluşturulma Tarihi,Güncellenme Tarihi,id,group,new_adres,merged_address
0,gaziantep,islahiye,cevdetpasa hahaha,308 sokak hahaha,308 Sokak,,,,,,,,,,0.0,GAZİANTEP_İSLAHİYE_CEVDETPAŞA Mahallesi__308 S...,308 sokak,308 Sokak 308 sokak
1,hatay,antakya,cebrail hahaha,4 bahceli sokak meltem apartman nu 28 4 antaky...,Bahçeli Sokak,Meltem,No: 28,,4.0,dilara yilmaz,,5424098447.0,,,1.0,HATAY_ANTAKYA_CEBRAİL Mahallesi_Meltem_Bahçeli...,4 bahceli sokak meltem apartman nu 28 4,Meltem No: 28 Bahçeli Sokak 4 bahceli sokak me...


In [262]:
df_main["merged_address"] = df_main['Bina Adı'] + " " + df_main['Dış Kapı/ Blok/Apartman No'] \
    + " " + df_main["Bulvar/Cadde/Sokak/Yol/Yanyol"] + " " + df_main["new_adres"]

In [270]:
def cluster_by_column(
    df: pd.DataFrame,
    key_column_name: str,
    duplicate_max_distance_threshold: float = 0.1,
    tfidf_ngram_range: tuple = (1, 1),
    tfidf_min_df: int = 1,
    tfidf_use_char_ngrams: bool = False,
    df_mask = None,
) -> pd.DataFrame:
    df.loc[:, "group_index"] = -1
    df.loc[df_mask, "group_index"] = list(range(df.loc[df_mask].shape[0]))
    cluster_column_name = f"{key_column_name}-cluster"
    duplicate_info_column_name = f"{key_column_name}-duplicate"
    similarity_column_name = f"{key_column_name}-duplicate-similarity"
    duplicate_original_column_name = f"{key_column_name}-duplicate-original-id"
    if tfidf_use_char_ngrams:
        analyzer = "char_wb"
    else:
        analyzer = "word"
    name_vectorizer = TfidfVectorizer(analyzer=analyzer, ngram_range=tfidf_ngram_range, min_df=tfidf_min_df)
    name_tfidf_vectors = name_vectorizer.fit_transform(df.loc[df_mask, key_column_name])
    distance_matrix = pairwise_distances(name_tfidf_vectors, name_tfidf_vectors, metric="cosine")
    dbscan = DBSCAN(eps=duplicate_max_distance_threshold, min_samples=2, metric="precomputed").fit(distance_matrix)
    df.loc[df_mask, cluster_column_name] = dbscan.labels_
    for cluster in np.unique(dbscan.labels_):
        if cluster == "-1":
            continue
        # get a data frame of all the rows in the current cluster
        cluster_mask = df_mask & (df[cluster_column_name] == cluster)
        # find cluster "original" (the row with minimum id)
        original_row_mask = df.index[cluster_mask][0]
        non_original_row_mask = df.index[cluster_mask][1:]
        original_row = df.loc[original_row_mask]

        original_row_distance_matrix_idx = original_row["group_index"]
        original_row_similarities = 1.0 - distance_matrix[original_row_distance_matrix_idx, :]

        #non_original_row_mask = (cluster_mask & (cluster_df["id"] != original_row_id))
        # mark cluster original with "O"
        df.loc[original_row_mask, duplicate_info_column_name] = "O"
        # mark other rows with "D"
        df.loc[non_original_row_mask, duplicate_info_column_name] = "D"
        # mark every original id column with original's id
        df.loc[cluster_mask,  duplicate_original_column_name] = original_row["id"]
        # get the pairwise similarity between each row and the original row
        df.loc[cluster_mask, similarity_column_name] = original_row_similarities[df.loc[cluster_mask, "group_index"]]
    return df

def cluster_data(
    df: pd.DataFrame,
    name_duplicate_max_distance_threshold: float = 0.1,
    address_duplicate_max_distance_threshold: float = 0.1,
    tfidf_ngram_range: tuple = (1, 1),
    tfidf_min_df: int = 1,
    tfidf_use_char_ngrams: bool = False,
) -> pd.DataFrame:
    def cluster_group(group_df):
        try:
            trivial_mask = (group_df["Ad-Soyad"] != "")
            group_df = cluster_by_column(
                df=group_df, 
                key_column_name="Ad-Soyad", 
                duplicate_max_distance_threshold=name_duplicate_max_distance_threshold, 
                tfidf_ngram_range=tfidf_ngram_range, 
                tfidf_min_df=tfidf_min_df, 
                tfidf_use_char_ngrams=tfidf_use_char_ngrams, 
                df_mask = trivial_mask)
            group_df.loc[group_df["Ad-Soyad"] == "", "Ad-Soyad-cluster"] = -1
        except ValueError as e:
            group_df["Ad-Soyad-cluster"] = -1
        name_clusters = group_df["Ad-Soyad-cluster"].unique()
        for name_cluster in name_clusters:
            cluster_df_mask = (group_df["Ad-Soyad-cluster"] == name_cluster)
            if name_cluster == -1:
                group_df.loc[cluster_df_mask, 'merged_address-cluster'] = -1
                continue
            try:
                group_df = cluster_by_column(
                    df=group_df, 
                    key_column_name="merged_address", 
                    duplicate_max_distance_threshold=address_duplicate_max_distance_threshold, 
                    tfidf_ngram_range=tfidf_ngram_range, 
                    tfidf_min_df=tfidf_min_df, 
                    tfidf_use_char_ngrams=tfidf_use_char_ngrams, 
                    df_mask=cluster_df_mask)
            except ValueError as e:
                group_df.loc[cluster_df_mask, 'merged_address-cluster'] = -1
        return group_df
    df = df.groupby(["İl", "İlçe", "Mahalle"]).apply(cluster_group)
    df.drop(columns=["group_index"], inplace=True)
    return df

In [271]:
grouped = cluster_data(
    df_main, 
    name_duplicate_max_distance_threshold=0.2, 
    address_duplicate_max_distance_threshold=0.3, 
    tfidf_min_df=1, 
    tfidf_ngram_range=(2, 4), 
    tfidf_use_char_ngrams=True
)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(["İl", "İlçe", "Mahalle"]).apply(cluster_group)


In [272]:
duplicate_cluster_idx = (grouped["Ad-Soyad-cluster"] != -1) & (grouped["merged_address-cluster"] != -1)
grouped_duplicates = grouped[duplicate_cluster_idx]
c_grouped = grouped_duplicates.groupby(["İl", "İlçe", "Mahalle", "Ad-Soyad-cluster", "merged_address-cluster"])
print(len(c_grouped.groups))
grouped_duplicates.sort_values(["İl", "İlçe", "Mahalle", "Ad-Soyad-cluster", "merged_address-cluster"])

3234


Unnamed: 0,İl,İlçe,Mahalle,Adres,Bulvar/Cadde/Sokak/Yol/Yanyol,Bina Adı,Dış Kapı/ Blok/Apartman No,Kat,İç Kapı,Ad-Soyad,...,new_adres,merged_address,Ad-Soyad-cluster,Ad-Soyad-duplicate,Ad-Soyad-duplicate-original-id,Ad-Soyad-duplicate-similarity,merged_address-cluster,merged_address-duplicate,merged_address-duplicate-original-id,merged_address-duplicate-similarity
30543,adana,aladag,akoren,eksik bilgi kirmizi test test,test,,test,test,,test,...,,test test,0.0,O,30541.0,1.0,0.0,O,30541.0,1.000000
32707,adana,aladag,akoren,eksik bilgi kirmizi test,TEST,TEST,TEST,11111111,,test,...,,TEST TEST TEST,0.0,D,30541.0,1.0,0.0,D,30541.0,1.000000
4180,adana,cukurova,beyazevler,mirac apartman,,,,,,masoud darabi,...,mirac apartman,mirac apartman,0.0,O,4180.0,1.0,0.0,O,4180.0,1.000000
9680,adana,cukurova,beyazevler,mirac apartman,,Mirac Apartmanı,,,,masoud darabi,...,mirac apartman,Mirac Apartmanı mirac apartman,0.0,D,4180.0,1.0,0.0,D,4180.0,0.952868
4117,adana,cukurova,guzelyali,suleyman demirel bulvari guzelyali mahallesi p...,süleyman demirel bulvarı,,,,,damla unvar,...,,süleyman demirel bulvarı,0.0,O,4117.0,1.0,0.0,O,4117.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7527,sanliurfa,hilvan,bahcelievler,219sok no 14,219.sok,,No: 14,,,songul asar,...,219sok no 14,No: 14 219.sok 219sok no 14,0.0,D,7363.0,1.0,0.0,D,7363.0,0.813330
7562,sanliurfa,hilvan,bahcelievler,219 sokak no14,219 Sokak,,No: 14,,,songul asar,...,219 sokak no14,No: 14 219 Sokak 219 sokak no14,0.0,D,7363.0,1.0,0.0,D,7363.0,0.732206
16837,sanliurfa,hilvan,bahcelievler,219 sokak no14,219. Sokak,,,,,songul asar,...,219 sokak no14,219. Sokak 219 sokak no14,0.0,D,7363.0,1.0,0.0,D,7363.0,0.541418
16486,sanliurfa,karakopru,dogukent,burak apartman,,Burak Apartmanı,,,,fatma tok,...,burak apartman,Burak Apartmanı burak apartman,0.0,O,16486.0,1.0,0.0,O,16486.0,1.000000
