In [None]:
from epitran import Epitran
import json
import re
import pandas as pd
from tqdm.auto import tqdm
import os

from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance
import textdistance
from monge_eklan import MongeEklan

# Перевод латиницы в кириллицу на основе Epitran

In [1]:
def translate(string):
    """Transliterate a Latin-script token into Cyrillic through its IPA form.

    The token is first converted to IPA by the module-level ``epi`` object
    (``epi.transliterate``), then every IPA symbol is rewritten to Cyrillic:
    a handful of context-sensitive rules are tried first, and the
    context-free table ``regular_dict`` is the fallback. Symbols missing
    from the table are copied through unchanged.

    Args:
        string: The original Latin-script token. Besides being fed to
            ``epi``, its spelling is consulted by two rules (word-initial
            schwa and the "-er" ending).

    Returns:
        The Cyrillic rendering with surrounding whitespace stripped.
    """
    res = epi.transliterate(string)

    # Context-free IPA symbol -> Cyrillic rendering.
    regular_dict = {
        "m": "м",
        "e": "е",
        "\u0329": "",  # combining mark attached to the previous symbol: dropped
        "l": "л",
        'ɪ': "и",
        "s": "с",
        "f": "ф",
        "n": "н",
        "t": "т",
        "i": "и",
        "p": "п",
        "k": "к",
        "ɔ": "о",
        "o": "о",
        "ɑ": "о",
        "d": "д",
        "ɡ": "г",
        "ɛ": "е",
        "z": "з",
        "ŋ": "нг",
        "v": "в",
        "b": "б",
        'ʌ': "а",
        'ʤ': "дж",
        'ʒ': "дж",
        'u': "у",
        "ʃ": "ш",
        'æ': "е",
        'j': "й",
        "ə": "е",
        "w": "у",
        "h": "х",
        "ʧ": "ч",
        'a': "а",
        'ʊ': "ью",
        "θ": 'з',
        # The table used to list 'ɹ' twice ("ер", then "р"); the later entry
        # won at runtime, so only that effective value is kept.
        'ɹ': "р",
        'ð': "с",
    }

    # A word-initial schwa is rendered as the vowel the word is spelled with.
    extra_dict = {
        "o": "о",
        "a": "а",
        "e": "е",
    }

    # Lower-cased so capitalised tokens ("About") hit extra_dict; "" for an
    # empty token so no rule can index past the end of the string.
    first_letter = string[:1].lower()

    lst = list(res)
    last = len(lst) - 1
    pieces = []

    for i, letter in enumerate(lst):
        prev = lst[i - 1] if i > 0 else None
        nxt = lst[i + 1] if i < last else None

        if letter == 'æ' and i == 0:
            pieces.append('э')
        elif letter == 'ɑ' and i == last:
            pieces.append("а")
        elif letter == 'j' and nxt in ("u", 'ʊ'):
            # The following vowel is emitted as "ью", which already carries
            # the glide, so the 'j' itself produces nothing.
            pass
        elif letter == 'ɹ' and i == last - 1 and string.endswith("er"):
            pieces.append("ер")
        elif letter == 'ɹ' and prev == "s":
            pieces.append("ер")
        elif letter == "ə" and i == 0 and first_letter in extra_dict:
            # Previously this indexed extra_dict directly and raised KeyError
            # for any first letter other than o/a/e; such tokens now fall
            # through to the generic rules below, as "u" always did.
            pieces.append(extra_dict[first_letter])
        elif letter == "ə" and i == last and res.endswith("iə"):
            pieces.append("я")
        elif letter == "ə" and i == last:
            pieces.append("а")
        elif letter == "w" and prev == "o":
            # "ow" collapses to the single vowel already emitted for "o".
            pass
        elif letter == "u" and prev == "j":
            pieces.append("ью")
        elif letter == "θ" and i == last:
            pieces.append("с")
        else:
            pieces.append(regular_dict.get(letter, letter))

    return "".join(pieces).strip()

In [None]:
# Module-level Epitran instance that translate() reaches through the global
# name `epi`; this cell must have run before translate() is first called.
# NOTE(review): "eng-Latn" presumably selects English in Latin script, and
# ligatures=True presumably makes Epitran emit single-character affricates
# (translate() has table entries for 'ʤ' and 'ʧ') — confirm against the Epitran docs.
epi = Epitran("eng-Latn", ligatures=True)

# Посчитаем символьные и лексические расстояния для пар запросов латиницей, переведенной в кириллицу

In [None]:
def read_df(path):
    """Load a tab-separated file into a DataFrame.

    Args:
        path: Anything ``pd.read_csv`` accepts as its first argument.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(path, sep="\t")

def make_transliteration(df):
    """Add Cyrillic transliterations of ``query`` / ``prev_query`` to ``df``.

    Latin-script tokens are collected from rows with ``tag == 0`` whose
    ``agg_types`` contains "L", each distinct token is transliterated once
    with ``translate``, and the resulting token map is applied to *every*
    row. Tokens that are not in the map are left as they are.

    Args:
        df: Frame with ``tag``, ``agg_types``, ``query`` and ``prev_query``
            columns. It is modified in place.

    Returns:
        The same frame with ``q_trans`` and ``prev_q_trans`` columns added.
    """
    latin_re = re.compile("[A-Za-z]+")

    df_eng = df[(df.tag == 0) & df.agg_types.str.contains("L", na=False)]

    # Keep only the tokens that contain Latin letters.
    eng_tokens = set()
    qs = df_eng[["query", "prev_query"]].values.tolist()
    for pair in qs:
        for query in pair:
            if isinstance(query, str) and latin_re.search(query):
                eng_tokens.update(w for w in query.split() if latin_re.search(w))

    # Guarded so an empty input frame reports 0 instead of dividing by zero.
    print("Proportion of pairs with Latin: ", len(qs) / len(df) if len(df) else 0.0)

    print("Transliterating Latin tokens")
    token_map = {}
    for token in tqdm(list(eng_tokens)):
        try:
            token_map[token] = translate(token)
        except Exception:
            # Best effort: a token that cannot be transliterated is reported
            # and later passes through translate_queries unchanged.
            print(token)

    # The map is handed over explicitly. It used to be a local named `d` that
    # translate_queries looked up as a global, which raised NameError unless
    # some earlier cell happened to define `d`.
    df["q_trans"] = df["query"].apply(lambda q: translate_queries(q, token_map))
    df["prev_q_trans"] = df["prev_query"].apply(lambda q: translate_queries(q, token_map))
    return df


def translate_queries(query, mapping=None):
    """Replace the tokens of ``query`` that have a transliteration.

    Args:
        query: The query text. Non-string values (e.g. NaN) and strings
            without any Latin letter are returned untouched.
        mapping: Token -> transliteration dict. When omitted, the
            module-level ``d`` is used, which keeps one-argument calls
            working (and raises NameError if no such global exists).

    Returns:
        The query with mapped tokens replaced, tokens joined by single
        spaces and no trailing space.
    """
    if not isinstance(query, str):
        return query

    if not re.search("[A-Za-z]+", query):
        return query

    if mapping is None:
        mapping = d

    # join() cannot leave the trailing space the manual concatenation added
    # whenever the last token was a mapped one; that extra character leaked
    # into the character-level distances computed afterwards.
    return " ".join(mapping.get(w, w) for w in query.split())

def apply_damerau_levenshtein(pairs):
    """Compute the normalized Damerau-Levenshtein distance for each pair.

    Args:
        pairs: Sequence of two-item sequences.

    Returns:
        Dict mapping each pair's position in ``pairs`` to its distance, or
        to -1 when either item of the pair is not a string.
    """
    print("Counting Damerau-Levenstein distance")
    distances = {}
    for position, pair in enumerate(tqdm(pairs)):
        if isinstance(pair[0], str) and isinstance(pair[1], str):
            distances[position] = normalized_damerau_levenshtein_distance(pair[0], pair[1])
        else:
            # Marker for "distance cannot be computed".
            distances[position] = -1
    return distances

def levenstein(df):
    """Add the ``trans_d_lev`` column to ``df``.

    The value is the normalized Damerau-Levenshtein distance between
    ``q_trans`` and ``prev_q_trans`` as returned by
    ``apply_damerau_levenshtein`` (-1 where a value is not a string).

    Args:
        df: Frame with ``q_trans`` and ``prev_q_trans`` columns; modified
            in place.

    Returns:
        The same frame with ``trans_d_lev`` added.
    """
    pairs = df[["q_trans", "prev_q_trans"]].values.tolist()
    lev_dists = apply_damerau_levenshtein(pairs)
    # The distances are keyed by row *position*. Looking them up with
    # df.index labels only worked for a default 0..n-1 index and raised
    # KeyError (or misaligned rows) on a filtered or re-indexed frame.
    df["trans_d_lev"] = [lev_dists[position] for position in range(len(pairs))]
    return df

def count_overlap(s1, s2):
    """Return a token-set dissimilarity between two strings.

    Both strings are split on whitespace into sets of distinct tokens. The
    result is ``1 - shared / mean_size``, where ``shared`` is the number of
    common tokens and ``mean_size`` the average size of the two sets.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        0.0 for identical token sets, 1.0 for disjoint ones, and 1.0 when
        neither string has any token.
    """
    first_tokens, second_tokens = set(s1.split()), set(s2.split())
    mean_size = (len(first_tokens) + len(second_tokens)) / 2
    if mean_size == 0:
        return 1.0
    shared = first_tokens.intersection(second_tokens)
    return 1 - (len(shared) / mean_size)


def overlap(df):
    """Add the ``trans_overlap`` column to ``df``.

    The value is ``count_overlap(q_trans, prev_q_trans)`` for each row, or
    -1 when either value is not a string.

    Args:
        df: Frame with ``q_trans`` and ``prev_q_trans`` columns; modified
            in place.

    Returns:
        The same frame with ``trans_overlap`` added.
    """
    pairs = df[["q_trans", "prev_q_trans"]].values.tolist()

    print("Counting overlap score")
    # Scores are collected in row order and assigned positionally. The old
    # position-keyed dict was read back with df.index labels, which fails on
    # any frame whose index is not the default 0..n-1.
    scores = []
    for pair in tqdm(pairs):
        if isinstance(pair[0], str) and isinstance(pair[1], str):
            scores.append(count_overlap(pair[0], pair[1]))
        else:
            scores.append(-1)
    df["trans_overlap"] = scores
    return df

def monge_eklan(df):
    """Add the ``trans_monge_eklan`` column to ``df``.

    The value is ``1 - MongeEklan().score(q_trans, prev_q_trans, m=2)`` for
    each row, or -1 when either value is not a string.

    Args:
        df: Frame with ``q_trans`` and ``prev_q_trans`` columns; modified
            in place.

    Returns:
        The same frame with ``trans_monge_eklan`` added.
    """
    pairs = df[["q_trans", "prev_q_trans"]].values.tolist()
    monge = MongeEklan()

    print("Counting Monge-Eklan score")
    # Collected in row order and assigned positionally, so the result does
    # not depend on df.index being the default 0..n-1 range (the previous
    # label-based lookup into a position-keyed dict did).
    monge_diffs = []
    for pair in tqdm(pairs):
        if isinstance(pair[0], str) and isinstance(pair[1], str):
            monge_diffs.append(1 - monge.score(pair[0], pair[1], m=2))
        else:
            monge_diffs.append(-1)
    df["trans_monge_eklan"] = monge_diffs

    return df

def main():
    """Compute transliteration-based distances for every ``*edit.tsv`` file.

    Each file in the current directory whose name ends with "edit.tsv" is
    read, gets the transliterated columns and the three distance columns
    added, and is written next to the input as ``<name>-trans.tsv``.
    """
    paths = [name for name in os.listdir(".") if name.endswith("edit.tsv")]
    for path in paths:
        print(path)
        df = read_df(path)
        print("Df processed")
        df = make_transliteration(df)
        df = levenstein(df)
        df = overlap(df)
        df = monge_eklan(df)
        # splitext drops only the final extension. Cutting at the first dot
        # turned "a.b-edit.tsv" into "a-trans.tsv", so differently named
        # inputs could overwrite each other's output.
        new_path = os.path.splitext(path)[0] + "-trans.tsv"
        df.to_csv(new_path, sep="\t", index=False)

In [None]:
# main()  # disabled: uncomment to (re)build the *-trans.tsv files that the tagging stage below reads

# Проставим теги на основе новых расстояний 

In [None]:
import pandas as pd
import os
from tqdm.auto import tqdm

def read_df(path):
    """Read the tab-separated file at ``path`` and return it as a DataFrame."""
    return pd.read_csv(path, sep="\t")

def make_tags(df, indices):
    """Set ``tag`` to 1 for the rows whose index label is in ``indices``.

    Args:
        df: Frame with a ``tag`` column; modified in place.
        indices: Iterable of index labels to mark. Labels that do not occur
            in ``df.index`` are ignored.

    Returns:
        The same frame with the selected rows tagged.
    """
    # Rows are selected by label. The previous loop used each label as a
    # position in a plain list, which raised IndexError or tagged the wrong
    # row whenever the index was not the default 0..n-1 range.
    selected = df.index.isin(list(indices))
    df.loc[selected, "tag"] = 1
    return df

def tag(df):
    """Set ``tag`` to 1 for pairs whose transliterated queries are close.

    Three rule groups run in order, each through ``make_tags``:

    1. Monge-Elkan: untagged rows whose ``agg_types`` contains "L", both
       queries longer than one token, ``trans_monge_eklan`` <= 0.15.
    2. Word overlap: untagged rows whose ``agg_types`` contains "L",
       ``trans_overlap`` <= 0.25.
    3. Damerau-Levenshtein on ``trans_d_lev``, with a threshold that depends
       on the token counts (0.2 for 2-3 tokens, 0.3 for more than 3, 0.4
       for single tokens), only for ``n_char`` > 5. These rules are not
       restricted to untagged or "L" rows.

    Distances equal to -1 mark pairs for which no distance was computed
    (a query was not a string). They are excluded from every rule; before
    this, -1 satisfied each ``<= threshold`` test and such pairs were
    tagged as matches.

    Args:
        df: Frame with the ``tag``, ``agg_types``, ``n_tokens``,
            ``prev_n_tokens``, ``n_char``, ``query``, ``prev_query``,
            ``type``, ``prev_type`` and the three ``trans_*`` columns.

    Returns:
        The tagged frame, with missing values replaced by "" (a side effect
        of the Damerau-Levenshtein stage).
    """
    def tagged_share(frame):
        return sum(frame.tag) / len(frame)

    def within(distances, threshold):
        # Inclusive on both ends: 0 <= distance <= threshold, so the -1
        # sentinel never qualifies.
        return distances.between(0, threshold)

    # Rule group 1: Monge-Elkan.
    indices = df[(df.tag == 0)
                 & (df.agg_types.str.contains("L"))
                 & within(df.trans_monge_eklan, 0.15)
                 & (df.n_tokens > 1)
                 & (df.prev_n_tokens > 1)].index.tolist()
    print("Tag 1 proportion before Monge Eklan", tagged_share(df))
    df = make_tags(df, indices)
    print("Tag 1 proportion after Monge Eklan", tagged_share(df))

    # Rule group 2: word overlap (tag == 0 is re-evaluated after group 1).
    indices = df[(df.tag == 0)
                 & (df.agg_types.str.contains("L"))
                 & within(df.trans_overlap, 0.25)].index.tolist()
    print("Tag 1 proportion before overlap", tagged_share(df))
    df = make_tags(df, indices)
    print("Tag 1 proportion after overlap", tagged_share(df))

    # Rule group 3: Damerau-Levenshtein.
    print("Tag 1 proportion before Damerau-Levenstein", tagged_share(df))
    # Missing queries become "" so the .str.contains filters return booleans.
    df = df.fillna("")
    long_enough = df.n_char > 5
    no_excluded_word = (~df["query"].str.contains("квартир")
                        & ~df["prev_query"].str.contains("квартир"))

    # Both queries have 2 or 3 tokens.
    indices = df[(df.n_tokens > 1)
                 & (df.prev_n_tokens > 1)
                 & (df.n_tokens < 4)
                 & (df.prev_n_tokens < 4)
                 & within(df.trans_d_lev, 0.2)
                 & no_excluded_word
                 & long_enough].index.tolist()
    df = make_tags(df, indices)

    # Both queries have more than 3 tokens.
    indices = df[(df.n_tokens > 3)
                 & (df.prev_n_tokens > 3)
                 & within(df.trans_d_lev, 0.3)
                 & no_excluded_word
                 & long_enough].index.tolist()
    df = make_tags(df, indices)

    # Both queries are a single token and neither is of type "D".
    indices = df[(df.n_tokens == 1)
                 & (df.prev_n_tokens == 1)
                 & within(df.trans_d_lev, 0.4)
                 & (df.prev_type != "D")
                 & (df.type != "D")
                 & long_enough].index.tolist()
    df = make_tags(df, indices)

    print("Tag 1 proportion after Damerau-Levenstein", tagged_share(df))
    return df


def main():
    """Tag every ``*trans.tsv`` file in the current directory.

    Each matching file is read, passed through ``tag`` and written next to
    the input as ``<name>-final.tsv``.

    Returns:
        The tagged frame of the last processed file, or ``None`` when no
        file matched (this used to raise UnboundLocalError).
    """
    new_df = None
    paths = [name for name in os.listdir() if name.endswith("trans.tsv")]
    for path in paths:
        print(path)
        df = read_df(path)
        print("Df processed")
        new_df = tag(df)
        # splitext keeps everything but the final extension; cutting at the
        # first dot truncated names that contain other dots.
        new_path = os.path.splitext(path)[0] + "-final.tsv"
        new_df.to_csv(new_path, sep="\t", index=False)
    return new_df

In [None]:
# Runs whichever main() was defined last; with cells executed top to bottom
# that is the tagging main(), which shadows the transliteration main() defined
# earlier under the same name. Its return value is shown as the cell output.
main()