In [61]:
! pip install pandas
! pip install nltk



In [62]:
# module import
import pandas as pd
import numpy as np
import multiprocessing as mp
import re
import csv

from nltk.metrics.distance import jaro_winkler_similarity, jaccard_distance
from nltk.util import ngrams

In [63]:
# load section
def loadFiles(file_name: str):
    """Yield the normalised first column of every data row of a CSV file.

    The first line is treated as a header and skipped. Every remaining row
    contributes its first cell, stripped of surrounding whitespace and
    lower-cased. Rows without any cell (blank lines) are skipped instead of
    raising ``IndexError``.

    Args:
        file_name: Path of the UTF-8 encoded CSV file to read.

    Yields:
        str: The cleaned first-column value of each non-empty row.
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires for files passed to csv.reader.
    with open(file_name, mode='r', encoding="utf8", newline='') as file:
        dr = csv.reader(file)
        next(dr, None)  # skip the header line (no-op for an empty file)
        for row in dr:
            if not row:  # csv.reader returns [] for a blank line
                continue
            yield row[0].strip().lower()


# Input CSVs: the raw words to check and the valid reference words.
raw_file = '../Resources/ws_invalid.csv'
valid_file = '../Resources/ws_valid.csv'

# Materialise both generators so the word lists can be reused below.
raw_words = [*loadFiles(file_name=raw_file)]
valid_words = [*loadFiles(file_name=valid_file)]

# Echo both lists, one per line.
print(raw_words, valid_words, sep='\n')

['rumba', 'coco cole', 'macrosoft', 'googlo', 'republic of running', 'sani', 'pholops', 'samseng', 'volvo']
['aquaphor', 'coca cola', 'microsoft', 'google', 'republic of gamers', 'sony', 'philips', 'samsung', 'valve']


In [64]:
# clean up section
def cleaner(value):
    """Reduce a URL-like string to its site name; pass other strings through.

    A string is treated as a URL when it contains one of the markers in
    ``url_tags`` and the extraction pattern finds a name in it. In that
    case the captured name is returned (``'http://www.google.com'`` gives
    ``'google'``); any other string is returned unchanged.

    Args:
        value: Raw cell content, normally a string.

    Returns:
        str: The extracted name, the unchanged input string, or
        ``'invalid'`` when ``value`` is not a string.
    """
    if not isinstance(value, str):
        # Explicit replacement for the former bare ``except``: non-text
        # values cannot be searched and are reported as 'invalid'.
        return 'invalid'

    url_tags = ('http', 'www.', 'https', '://')
    if not any(tag in value for tag in url_tags):
        return value

    # Captures the text between a '/', '|' or '.' and a following dot.
    is_url = re.findall(r'.*[\/|.](.*)\..*', value)

    # The pattern is greedy at both ends, so findall yields at most one match
    # per line; the previous ``len(is_url) > 2`` check therefore never passed
    # and URLs were returned untouched. Any match is now accepted.
    if is_url:
        return is_url[0]
    return value


# One row per word pair: 'clean' is the raw word passed through cleaner(),
# 'found' starts out empty (None) and is filled in by the matching step.
df = (
    pd.DataFrame({'valid': valid_words, 'raw': raw_words})
    .assign(
        clean=lambda frame: frame['raw'].apply(cleaner),
        found=None,
    )
)

In [65]:
# work
def _best_match(value, valid_words: list, max_distance: float,
                min_similarity: float) -> tuple:
    """Return ``(word, precision)`` for the closest valid word.

    ``('invalid', 0.0)`` is returned when ``value`` is empty or not a string,
    when no valid word qualifies as a candidate, or when no candidate is
    close enough.
    """
    if not isinstance(value, str) or not value:
        return 'invalid', 0.0

    value_bigrams = set(ngrams(value, 2))  # same for every candidate
    scored = []
    for word in valid_words:
        # Candidates are words whose first character occurs in the value
        # (which includes "same first character"). Empty words have no first
        # character and are skipped instead of raising IndexError.
        if not word or word[0] not in value:
            continue
        word_bigrams = set(ngrams(word, 2))
        if value_bigrams or word_bigrams:
            distance = jaccard_distance(value_bigrams, word_bigrams)
        else:
            # Both strings are shorter than two characters, so neither has a
            # bigram and jaccard_distance would divide by zero.
            distance = 0.0 if value == word else 1.0
        scored.append((distance, word))

    if not scored:
        return 'invalid', 0.0

    # First pass: smallest bigram Jaccard distance (first one wins on ties).
    distance, word = min(scored, key=lambda pair: pair[0])
    if distance <= max_distance:
        # The outer round removes float noise such as 0.19999999999999996.
        return word, round(1 - round(distance, 1), 1)

    # Second pass: Jaro-Winkler similarity over the same candidates.
    similarity, word = max(
        (jaro_winkler_similarity(value, candidate, p=0.1, max_l=100), candidate)
        for _, candidate in scored
    )
    if similarity >= min_similarity:
        return word, round(similarity, 1)
    return 'invalid', 0.0


def find_similarities(df: pd.DataFrame, valid_words: list,
                      max_distance: float = 0.80,
                      min_similarity: float = 0.70):
    """Match every value of ``df['clean']`` against ``valid_words``.

    Each value is first compared by Jaccard distance over character bigrams;
    if the best candidate is farther than ``max_distance``, Jaro-Winkler
    similarity is tried on the same candidates. Rows are processed by
    position, so the frame may be empty or carry any index (gaps, labels,
    duplicates).

    Args:
        df: Frame with a ``'clean'`` column. It is modified in place.
        valid_words: Reference words to match against.
        max_distance: Largest Jaccard distance accepted in the first pass.
        min_similarity: Smallest Jaro-Winkler similarity accepted in the
            second pass.

    Returns:
        pd.DataFrame: The same ``df`` object, with ``'found'`` set to the
        matched word (or ``'invalid'``) and ``'precision'`` set to a score
        rounded to one decimal (``0.0`` when nothing matched).
    """
    matches = [_best_match(value, valid_words, max_distance, min_similarity)
               for value in df['clean']]

    # Assign whole columns once instead of writing cell by cell through .loc.
    df['found'] = [word for word, _ in matches]
    df['precision'] = np.asarray([score for _, score in matches], dtype=float)
    return df


# Fills 'found' and 'precision' on df in place; the frame is also returned
# and, being the last expression of the cell, is what gets displayed.
find_similarities(df=df, valid_words=valid_words)



Unnamed: 0,valid,raw,clean,found,precision
0,aquaphor,rumba,rumba,invalid,0.0
1,coca cola,coco cole,coco cole,coca cola,0.4
2,microsoft,macrosoft,macrosoft,microsoft,0.6
3,google,googlo,googlo,google,0.7
4,republic of gamers,republic of running,republic of running,republic of gamers,0.5
5,sony,sani,sani,sony,0.7
6,philips,pholops,pholops,philips,0.2
7,samsung,samseng,samseng,samsung,0.5
8,valve,volvo,volvo,valve,0.8


In [66]:
# result section: flag exact matches, title-case the names, preview the frame
# The flag is computed first so the comparison sees the values as matched.
df['is_match'] = np.where(df['valid'].eq(df['found']), 1, 0)
df['valid'] = df['valid'].str.title()
df['found'] = df['found'].str.title()

print(df.shape, df.head(19), sep='\n')

(9, 6)
                valid                  raw                clean  \
0            Aquaphor                rumba                rumba   
1           Coca Cola            coco cole            coco cole   
2           Microsoft            macrosoft            macrosoft   
3              Google               googlo               googlo   
4  Republic Of Gamers  republic of running  republic of running   
5                Sony                 sani                 sani   
6             Philips              pholops              pholops   
7             Samsung              samseng              samseng   
8               Valve                volvo                volvo   

                found  precision  is_match  
0             Invalid        0.0         0  
1           Coca Cola        0.4         1  
2           Microsoft        0.6         1  
3              Google        0.7         1  
4  Republic Of Gamers        0.5         1  
5                Sony        0.7         1  
6      