# Import libraries

In [None]:
import re
import pykakasi
import numpy as np
from unidecode import unidecode

# Impute NaN data with KNNImputer using a custom function

In [None]:
from sklearn.impute import KNNImputer
from sklearn.metrics import pairwise_distances

# Feature columns: every column of train_df except 'Id' and 'Class'.
features = [
    column for column in train_df.columns
    if column not in ('Id', 'Class')
]

def cosine_dist(X, Y, metric='cosine', missing_values=np.nan, **kwargs):
    """Distance callable that treats NaN as 0 and sums cosine distances.

    NaN entries of both inputs are replaced by 0 on private copies. Each
    input is then reshaped to a single column (one row per element) and the
    sum of the full pairwise cosine-distance matrix between those rows is
    returned.

    Args:
        X: First numeric array; may contain NaN. Not modified.
        Y: Second numeric array; may contain NaN. Not modified.
        metric: Unused; kept for signature compatibility.
        missing_values: Unused; kept for signature compatibility
            (presumably forwarded by KNNImputer -- confirm against caller).
        **kwargs: Ignored.

    Returns:
        The sum of all pairwise cosine distances as a scalar.
    """
    # np.where allocates new arrays, so the caller's data keeps its NaNs;
    # assigning through a boolean mask would overwrite them in place.
    x_filled = np.where(np.isnan(X), 0, X)
    y_filled = np.where(np.isnan(Y), 0, Y)
    return pairwise_distances(X=x_filled.reshape(-1, 1),
                              Y=y_filled.reshape(-1, 1),
                              metric='cosine').sum()

# KNN imputer with 5 neighbours that uses the custom cosine_dist callable
# as its distance metric.
imputer = KNNImputer(n_neighbors=5, metric=cosine_dist)
# NOTE(review): the result of fit_transform is not assigned to anything here,
# so train_df itself is presumably left unchanged -- confirm this is intended.
imputer.fit_transform(train_df[features])

## Is text English?

In [None]:
def isEnglish(s):
    """Classify a value by how far its lower-cased text is from plain ASCII.

    Returns:
        1 -- ASCII once a fixed set of special symbols is removed (English).
        2 -- ASCII only after common Western European accented letters are
             removed as well (Spanish/French?).
        3 -- still contains non-ASCII characters (really not English).
    """

    def _without(text, unwanted):
        # Drop every character listed in `unwanted`.
        return text.translate(str.maketrans("", "", unwanted))

    def _is_plain_ascii(text):
        try:
            text.encode(encoding="utf-8").decode("ascii")
        except UnicodeDecodeError:
            return False
        return True

    special_symbols = "ª°⭐•®’—–™&\xa0\xad\xe2\xf0"  # special characters
    western_letters = "éáñóüäýöçãõúíàêôūâşè"

    text = _without(str(s).lower(), special_symbols)
    if _is_plain_ascii(text):
        return 1  # english

    # not english; check it is still not english without western letters
    text = _without(text, western_letters)
    if _is_plain_ascii(text):
        return 2  # spanish/french?
    return 3  # really not english

# Convert Japanese characters to the Latin alphabet

In [None]:
def convert_japanese_alphabet(df):
    """Romanise the text columns of ``df`` with pykakasi, row by row.

    The columns "name", "address", "city" and "state" are passed through a
    pykakasi converter configured to turn Hiragana, Katakana and Kanji into
    the alphabet. A value that cannot be converted (or a column that cannot
    be read) is left untouched.

    Returns:
        The result of applying the conversion to every row of ``df``.
    """
    romanizer_config = pykakasi.kakasi()
    # 'H' = Hiragana, 'K' = Katakana, 'J' = Kanji; 'a' = convert to alphabet.
    for script in ('H', 'K', 'J'):
        romanizer_config.setMode(script, 'a')
    romanizer = romanizer_config.getConverter()

    text_columns = ["name", "address", "city", "state"]

    def romanize_row(row):
        for column in text_columns:
            try:
                row[column] = romanizer.do(row[column])
            except Exception:
                # Best effort: keep the original value when conversion fails.
                continue
        return row

    return df.apply(romanize_row, axis=1)

# Process some categorical-like features (several categories in one string)

In [None]:
def process_categories(cat, split=" "):
    """Split ``cat`` into cleaned, letters-only tokens.

    ``str(cat)`` is split on ``split``; pieces shorter than two characters
    are dropped. In each remaining piece every non-letter becomes a space,
    runs of whitespace collapse to one space and the ends are trimmed. A
    piece with no letters therefore yields an empty string in the result.
    """
    tokens = []
    for piece in str(cat).split(split):
        if cat != "" and len(piece) >= 2:
            # Keep only letters
            letters_only = re.sub(r"[^a-zA-Z]", " ", piece)
            # Delete multi space
            tokens.append(re.sub("\\s+", " ", letters_only).strip())
    return tokens

# Function to fill missing categories
def find_cat(name, Key_words_for_cat):
    """Return the first category whose keyword list matches a word of ``name``.

    ``name`` is lower-cased, transliterated with ``unidecode`` and tokenised
    by ``process_categories``. ``Key_words_for_cat`` maps a category to an
    iterable of keywords; categories are tried in mapping order. Returns ""
    when nothing matches.
    """
    name_tokens = process_categories(unidecode(str(name).lower()))
    for category, keywords in Key_words_for_cat.items():
        for keyword in keywords:
            if keyword in name_tokens:
                return category
    return ""

def process_text_cat(text):
    """Normalise a category label to lower-case letters and single spaces.

    The text is lower-cased and transliterated with ``unidecode``; within
    each whitespace-separated token non-letters become spaces, then the
    tokens are re-joined and all whitespace runs are collapsed.
    """
    ascii_text = unidecode(text.lower())
    joined = " ".join(
        re.sub(r"[^a-zA-Z]", " ", token).strip() for token in ascii_text.split()
    )
    collapsed = re.sub("\\s+", " ", joined)
    return collapsed.strip()


def simplify_cat(categories, Cat_regroup):
    """Map a comma-separated category string to a group number.

    Returns:
        -1 when the lower-cased string is "" or "nan";
        the 1-based index of the first list in ``Cat_regroup`` containing
        one of the cleaned categories (categories tried left to right);
        0 when no category belongs to any group.
    """
    categories = str(categories).lower()
    if categories == "" or categories == "nan":
        return -1

    for raw_category in categories.split(","):
        cleaned = process_text_cat(raw_category)
        for group_number, group in enumerate(Cat_regroup, start=1):
            for member in group:
                if cleaned == member:
                    return group_number
    return 0

# Remove special symbols

In [None]:
def st(x, remove_space=False):
    """Transliterate to Latin, lower-case, and strip punctuation symbols.

    Args:
        x: Value to clean; converted with ``str`` first.
        remove_space: When True, spaces are removed as well.

    Returns:
        The cleaned string.
    """
    # turn to latin alphabet, then lower case
    x = unidecode(str(x)).lower()
    # Symbols to remove. The backslash is written as "\\" because "\@" is
    # not a valid escape sequence and triggers a warning on recent Pythons;
    # the set of removed characters is unchanged.
    symbols = '"' + ",:;'/-+&()!#$%*.|\\@`~^<>?[]{}_=\n"
    if remove_space:
        symbols = " " + symbols
    # One translate pass deletes all listed characters.
    return x.translate(str.maketrans("", "", symbols))


def rem_expr(x):
    """Return ``str(x)`` with the marks ™, ®, ⓘ and © removed."""
    cleaned = str(x)
    for mark in ("™", "®", "ⓘ", "©"):  # tm, r, i, c
        cleaned = cleaned.replace(mark, "")
    return cleaned


def rem_abr(x):
    """Remove a short all-caps bracketed abbreviation from ``str(x)``.

    Only the first "(" and the first ")" are considered. The bracketed part
    is removed when it is non-empty, the bracket span is shorter than 10
    characters, the remainder is long enough, and the content (ignoring
    spaces and punctuation) equals its own upper-case form, i.e. it holds
    only capitals and/or digits. Otherwise the string is returned unchanged.
    """
    text = str(x)
    open_pos = text.find("(")
    close_pos = text.find(")")
    if open_pos == -1 or close_pos == -1:  # no brackets
        return text

    span = close_pos - open_pos
    remainder_long_enough = len(text) - span > 9
    if not (span > 1 and span < 10 and remainder_long_enough):
        return text

    # clean the bracket content before testing its case
    punctuation = " ,:;'/-+&()!#$%*.|`~^<>?[]{}_=\n"
    inner = text[open_pos + 1: close_pos].translate(
        str.maketrans("", "", punctuation)
    )
    if inner != inner.upper():  # contains lower-case letters: keep as is
        return text
    return text[:open_pos] + text[close_pos + 1:]

# Clean phones

In [None]:
def process_phone(text):
    """Normalise a phone value to its last 10 digits, zero-padded on the left.

    Args:
        text: Raw phone value; converted with ``str`` first.

    Returns:
        "" when the value stringifies to "nan"; otherwise a 10-character
        string holding the last (up to) 10 digits of the input, left-padded
        with zeros. An input with no digits yields "0000000000".
    """
    text = str(text)
    if text == "nan":
        return ""
    digits = "".join(char for char in text if char.isdigit())
    # zfill(10) always produces 10 characters, so the result is never empty;
    # the former "return the raw text" fallback could not be reached.
    return digits[-10:].zfill(10)
    
train["phone"] = train["phone"].apply(process_phone)
# all matches of last 9 digits look legit - drop leading digit
train["phone"] = train["phone"].str[1:]
# set invalid numbers to empty
idx = (train["phone"] == "000000000") | (train["phone"] == "999999999")
# Assign through train.loc so the write lands in `train` itself; writing via
# train["phone"].loc[...] is chained assignment and may hit a temporary copy.
train.loc[idx, "phone"] = ""

# Clean URL

In [None]:
train["url"] = train["url"].str[:129]  # cap length at 129
# All writes go through train.loc[...] so they land in `train` itself;
# train["url"].loc[...] = ... is chained assignment and may hit a copy.
train.loc[train["url"] == "nan", "url"] = ""
# Strip scheme/www prefixes one after another, longest first. The order
# matters: a value may match a later prefix after an earlier one is removed.
for prefix in ("httpswww", "httpwww", "https", "http"):
    idx = train["url"].str[: len(prefix)] == prefix
    train.loc[idx, "url"] = train["url"].str[len(prefix):].loc[idx]
# stripping a prefix can leave the literal "nan" behind; blank it again
train.loc[train["url"] == "nan", "url"] = ""

# Remove common words

In [None]:
def clean_nums(x):  # remove st/nd/th number extensions
    """Strip ordinal suffixes (st/nd/rd/th) that follow a digit.

    Each listed pattern is replaced, wherever it occurs as a substring, by
    its leading digit. Both the attached ("1st") and the space-separated
    ("1 st") spellings are covered.

    Args:
        x: String to clean.

    Returns:
        The string with the ordinal suffixes removed.
    """
    words = (
        "1st", "2nd", "3rd",
        "4th", "5th", "6th", "7th", "8th", "9th", "0th",
        "1th", "2th", "3th",
        "4 th", "5 th", "6 th", "7 th", "8 th", "9 th", "0 th",
        "1 th", "2 th", "3 th",
        "1 st", "2 nd",
        "3 rd",  # was missing: only the misspelt "3 nd" was listed
        "3 nd",  # kept so previously handled inputs behave the same
    )
    for word in words:
        x = x.replace(word, word[0])
    return x

def rem_words(x):  # remove common words without much meaning
    """Remove common filler words that appear as whole words in ``x``.

    For each word in turn: occurrences surrounded by single spaces are
    replaced by one space, a leading "word " is cut off, and a trailing
    " word" is cut off. Matching is case-sensitive (the words are
    lower-case). A string consisting of just one of the words is kept.

    Args:
        x: String to clean.

    Returns:
        The string without the listed words.
    """
    words = (
        "the", "de", "of", "da", "la", "a", "an", "and", "at", "b", "el",
        "las", "los", "no", "di", "by", "le", "del", "in", "co", "inc",
        "llc", "llp", "ltd", "on", "der",
        "das",  # previously " das": the stray space kept it from matching
        "die",
    )
    for word in words:
        x = x.replace(" " + word + " ", " ")  # middle
        if x.startswith(word + " "):  # start
            x = x[len(word) + 1:]
        if x.endswith(" " + word):  # end
            x = x[: -len(word) - 1]
    return x

# select capitals only, or first letter of each word (which could have been capital)
def get_caps_leading(name):
    """Build a lower-case acronym from the capitals / leading letters of ``name``.

    Steps: transliterate with ``unidecode``; drop a leading standalone
    "the" (any case); remove filler words with ``rem_words`` (assumed to be
    lower-case) and ordinal suffixes with ``clean_nums``; split on spaces
    and ignore pieces shorter than two characters; from each piece keep the
    first character if it is a lower-case letter plus every capital letter;
    concatenate and lower-case.

    Args:
        name: Name string.

    Returns:
        The acronym in lower case (may be empty).
    """
    name = unidecode(name)
    # Drop a leading 'the' only when it is a whole word. Comparing just the
    # first three characters also truncated names such as "Theatre".
    if name[:4].lower() in ("the", "the "):
        name = name[3:]
    # remove common words without much meaning; assume they are always lowercase
    name = rem_words(name)
    name = clean_nums(name)  # remove st/nd/th number extensions
    pieces = [piece for piece in name.split(" ") if len(piece) >= 2]
    # keep only capitals or first letters, then merge
    initials = "".join(
        "".join(re.findall(r"^[a-z]|[A-Z]", piece)) for piece in pieces
    )
    return initials.lower()