python-Levenshtein  
https://rawgit.com/ztane/python-Levenshtein/master/docs/Levenshtein.html  
  
  
編集距離 (Levenshtein Distance)  
https://naoya-2.hatenadiary.org/entry/20090329/1238307757  
  
  
文字列間の類似性を測るための『標準化』編集距離の計算方法について  
https://qiita.com/Ishio/items/d52b9221c92bd4ebb344  
  
  
english-words  
https://github.com/dwyl/english-words  
  
  
Pythonでリストをflattenする方法まとめ  
http://d.hatena.ne.jp/xef/20121027/p2  
  
  
文字列類似度評価 レーベンシュタイン距離 / ジャロ・ウィンクラー距離  
http://grahamian.hatenablog.com/entry/word_similarity  
  
  
２つの文字列の類似度を数値化　レーベンシュタイン距離とジャロ・ウィンクラー距離の解説  
http://nkdkccmbr.hateblo.jp/entry/2016/08/18/102727  
  

Jaro Winkler  
https://www.kaggle.com/alvations/jaro-winkler

In [92]:
import pandas as pd
import numpy as np

# テストデータ作成

正解ブランド名  
小文字に統一している

In [93]:
# Reference brand spellings, folded to lower case so that later comparisons
# ignore capitalisation.
brand_list = list(map(str.lower, ("Vuitton", "Gucci")))

brand_list

['vuitton', 'gucci']

テストタイトル  
"Vuitton"と"Gucci"の表記ミスを追加している

In [94]:
# Sample listing titles: correct spellings plus deliberate misspellings of
# "Vuitton" and "Gucci".
item_title = [
    "Pochette voyage Louis Vuitton",
    "Pochette voyage Louis Vuiton",
    "Pochette voyage Louis Vuittonn",
    "Genuine Gucci 1980’s Vintage Navy Blue Boston Bag",
    "Genuine Guchi 1980’s Vintage Navy Blue Boston Bag",
    "Genuine Gutti 1980’s Vintage Navy Blue Boston Bag",
    "Vuittonn and Gutti",
]

In [95]:
# One row per title; "id" is a running number starting at 0.
title_ids = pd.Series(range(len(item_title)))
df = pd.DataFrame({"id": title_ids, "item_title": item_title})

df

Unnamed: 0,id,item_title
0,0,Pochette voyage Louis Vuitton
1,1,Pochette voyage Louis Vuiton
2,2,Pochette voyage Louis Vuittonn
3,3,Genuine Gucci 1980’s Vintage Navy Blue Boston Bag
4,4,Genuine Guchi 1980’s Vintage Navy Blue Boston Bag
5,5,Genuine Gutti 1980’s Vintage Navy Blue Boston Bag
6,6,Vuittonn and Gutti


小文字に統一する

In [96]:
# Fold every title to lower case so it can be compared with the lower-cased
# brand names.
lowered_titles = df["item_title"].str.lower()
df["item_title"] = lowered_titles

df

Unnamed: 0,id,item_title
0,0,pochette voyage louis vuitton
1,1,pochette voyage louis vuiton
2,2,pochette voyage louis vuittonn
3,3,genuine gucci 1980’s vintage navy blue boston bag
4,4,genuine guchi 1980’s vintage navy blue boston bag
5,5,genuine gutti 1980’s vintage navy blue boston bag
6,6,vuittonn and gutti


# テストデータの単語リスト作成

In [97]:
# Show each title as a list of its space-separated words.
list(df["item_title"].str.split(" "))

[['pochette', 'voyage', 'louis', 'vuitton'],
 ['pochette', 'voyage', 'louis', 'vuiton'],
 ['pochette', 'voyage', 'louis', 'vuittonn'],
 ['genuine', 'gucci', '1980’s', 'vintage', 'navy', 'blue', 'boston', 'bag'],
 ['genuine', 'guchi', '1980’s', 'vintage', 'navy', 'blue', 'boston', 'bag'],
 ['genuine', 'gutti', '1980’s', 'vintage', 'navy', 'blue', 'boston', 'bag'],
 ['vuittonn', 'and', 'gutti']]

重複を除外する

In [98]:
from itertools import chain

# Flatten the per-title word lists and keep each distinct word once.
words_per_title = df["item_title"].str.split(" ")
title_list = list(set(chain(*words_per_title)))

title_list

['voyage',
 'gutti',
 'vuitton',
 'guchi',
 'bag',
 'navy',
 'vuittonn',
 'and',
 'vintage',
 'blue',
 'vuiton',
 '1980’s',
 'boston',
 'genuine',
 'gucci',
 'louis',
 'pochette']

# 編集距離の計算（Levenshtein Distance）

In [99]:
import Levenshtein

# Compare every title word with every brand name. The raw edit distance is
# divided by the length of the longer word, giving a score from 0 (identical)
# to 1. Exact matches (0) and scores of 0.5 or more are skipped.
matches = []
for word_test in title_list:
    for word_dict in brand_list:
        longest = max(len(word_test), len(word_dict))
        dist = Levenshtein.distance(word_test, word_dict) / longest
        if dist == 0 or dist >= 0.5:
            continue
        matches.append((word_test, word_dict, round(dist, 3)))

# Unzip the collected triples into the three parallel lists used below.
word_miss = [miss for miss, _, _ in matches]
word_corr = [corr for _, corr, _ in matches]
word_dist = [score for _, _, score in matches]

print(word_miss, word_corr,word_dist)

['gutti', 'guchi', 'vuittonn', 'vuiton'] ['gucci', 'gucci', 'vuitton', 'vuitton'] [0.4, 0.2, 0.125, 0.143]


# 元のデータフレームから対象レコードを抽出

In [100]:
# For every detected misspelling, pull the rows of `df` whose title contains
# that word and annotate them with the misspelling, the brand it resembles and
# the score.
#
# The per-word frames are gathered in a list and concatenated once:
# DataFrame.append no longer exists in pandas 2.x, and a set passed as
# `columns=` has no defined column order.
frames = []
title_words = df["item_title"].str.split(" ")

for m, c, d in zip(word_miss, word_corr, word_dist):
    # Match on whole words: a plain str.contains(m) would treat `m` as a
    # regular expression and would also hit longer words that merely contain it.
    has_word = title_words.map(lambda words, word=m: word in words)
    # .copy() so the new columns are written to an independent frame rather
    # than to a slice of `df`.
    temp = df.loc[has_word, :].copy()
    temp["miss_word"] = m
    temp["corr_word"] = c
    temp["distance"] = d
    frames.append(temp)

if frames:
    # Sort once at the end; mergesort is stable, so rows sharing an id keep
    # the order in which their misspellings were found.
    result = pd.concat(frames, sort=False).sort_values(by="id", kind="mergesort")
else:
    result = pd.DataFrame(
        columns=["id", "item_title", "miss_word", "corr_word", "distance"]
    )

result

Unnamed: 0,item_title,id,miss_word,corr_word,distance
1,pochette voyage louis vuiton,1,vuiton,vuitton,0.143
2,pochette voyage louis vuittonn,2,vuittonn,vuitton,0.125
4,genuine guchi 1980’s vintage navy blue boston bag,4,guchi,gucci,0.2
5,genuine gutti 1980’s vintage navy blue boston bag,5,gutti,gucci,0.4
6,vuittonn and gutti,6,gutti,gucci,0.4
6,vuittonn and gutti,6,vuittonn,vuitton,0.125


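# 類似度の計算（Jaro-Winkler Distance）

Jaro-Winkler は編集距離とは向きが逆で、1 に近いほど 2 つの文字列が似ていることを表す。そのため、以下では「0.75 より大きく、1（完全一致）ではない」単語を表記ミスの候補として抽出する。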

In [101]:
from pyjarowinkler import distance

# Score every title word against every brand name with Jaro-Winkler. Scores
# of exactly 1 and scores of 0.75 or less are skipped.
matches = []
for word_test in title_list:
    for word_dict in brand_list:
        dist = distance.get_jaro_distance(
            word_test, word_dict, winkler=True, scaling=0.1
        )
        if dist == 1 or dist <= 0.75:
            continue
        matches.append((word_test, word_dict, round(dist, 3)))

# Unzip the collected triples into the three parallel lists used below.
word_miss = [miss for miss, _, _ in matches]
word_corr = [corr for _, corr, _ in matches]
word_dist = [score for _, _, score in matches]

print(word_miss, word_corr, word_dist)

['gutti', 'guchi', 'vuittonn', 'vuiton'] ['gucci', 'gucci', 'vuitton', 'vuitton'] [0.79, 0.91, 0.98, 0.97]


In [102]:
# For every detected misspelling, pull the rows of `df` whose title contains
# that word and annotate them with the misspelling, the brand it resembles and
# the Jaro-Winkler score (stored in the "distance" column).
#
# The per-word frames are gathered in a list and concatenated once:
# DataFrame.append no longer exists in pandas 2.x, and a set passed as
# `columns=` has no defined column order.
frames = []
title_words = df["item_title"].str.split(" ")

for m, c, d in zip(word_miss, word_corr, word_dist):
    # Match on whole words: a plain str.contains(m) would treat `m` as a
    # regular expression and would also hit longer words that merely contain it.
    has_word = title_words.map(lambda words, word=m: word in words)
    # .copy() so the new columns are written to an independent frame rather
    # than to a slice of `df`.
    temp = df.loc[has_word, :].copy()
    temp["miss_word"] = m
    temp["corr_word"] = c
    temp["distance"] = d
    frames.append(temp)

if frames:
    # Sort once at the end; mergesort is stable, so rows sharing an id keep
    # the order in which their misspellings were found.
    result = pd.concat(frames, sort=False).sort_values(by="id", kind="mergesort")
else:
    result = pd.DataFrame(
        columns=["id", "item_title", "miss_word", "corr_word", "distance"]
    )

result

Unnamed: 0,item_title,id,miss_word,corr_word,distance
1,pochette voyage louis vuiton,1,vuiton,vuitton,0.97
2,pochette voyage louis vuittonn,2,vuittonn,vuitton,0.98
4,genuine guchi 1980’s vintage navy blue boston bag,4,guchi,gucci,0.91
5,genuine gutti 1980’s vintage navy blue boston bag,5,gutti,gucci,0.79
6,vuittonn and gutti,6,gutti,gucci,0.79
6,vuittonn and gutti,6,vuittonn,vuitton,0.98


# Levenshtein Distance 関数化

In [103]:
# Brand names in their original capitalisation.
brand_list = [
    "Vuitton",
    "Gucci",
]

brand_list

['Vuitton', 'Gucci']

In [104]:
# Sample listing titles: correct spellings plus deliberate misspellings of
# "Vuitton" and "Gucci".
item_title = [
    "Pochette voyage Louis Vuitton",
    "Pochette voyage Louis Vuiton",
    "Pochette voyage Louis Vuittonn",
    "Genuine Gucci 1980’s Vintage Navy Blue Boston Bag",
    "Genuine Guchi 1980’s Vintage Navy Blue Boston Bag",
    "Genuine Gutti 1980’s Vintage Navy Blue Boston Bag",
    "Vuittonn and Gutti",
]

In [105]:
# One row per title; "id" is a running number starting at 0.
title_ids = pd.Series(range(len(item_title)))
df = pd.DataFrame({"id": title_ids, "item_title": item_title})

df

Unnamed: 0,id,item_title
0,0,Pochette voyage Louis Vuitton
1,1,Pochette voyage Louis Vuiton
2,2,Pochette voyage Louis Vuittonn
3,3,Genuine Gucci 1980’s Vintage Navy Blue Boston Bag
4,4,Genuine Guchi 1980’s Vintage Navy Blue Boston Bag
5,5,Genuine Gutti 1980’s Vintage Navy Blue Boston Bag
6,6,Vuittonn and Gutti


In [106]:
import pandas as pd
import numpy as np
from itertools import chain
import Levenshtein

In [107]:
def levenshtein_similarity(df, brand_list, dist=0.5):
    """Find title words that look like misspellings of a brand name.

    Every space-separated word of every title is compared with every brand.
    The score is the Levenshtein distance divided by the length of the longer
    of the two words, so 0 means identical and 1 means nothing in common.
    Words that differ from the brand and score below ``dist`` are reported.
    Comparison is case-insensitive.

    Args:
        df: DataFrame with an ``"item_title"`` column of strings and an
            ``"id"`` column (used for sorting). It is not modified.
        brand_list: Iterable of correctly spelled brand names.
        dist: Exclusive upper bound on the normalised distance. Defaults to
            0.5.

    Returns:
        A DataFrame with one row per (title, misspelled word) pair. It holds
        the columns of ``df`` (``item_title`` lower-cased) plus
        ``miss_word``, ``corr_word`` and ``distance``, sorted by ``id``.
        When nothing matches, an empty DataFrame with those columns.
    """
    # Work on a copy: lower-casing must not leak into the caller's frame.
    work = df.copy()
    work["item_title"] = work["item_title"].str.lower()
    # Rows without a title have no words to compare.
    work = work.dropna(subset=["item_title"])
    brands = [brand.lower() for brand in brand_list if brand]

    tokens = work["item_title"].str.split(" ")
    # Distinct, non-empty words (consecutive spaces yield "" tokens). Sorted
    # so the result order does not depend on set iteration order.
    vocabulary = sorted({word for word in chain.from_iterable(tokens) if word})

    matches = []
    for word_test in vocabulary:
        for word_dict in brands:
            if word_test == word_dict:
                # Correct spelling, nothing to report.
                continue
            longest = max(len(word_test), len(word_dict))
            score = Levenshtein.distance(word_test, word_dict) / longest
            if score < dist:
                matches.append((word_test, word_dict, round(score, 3)))

    frames = []
    for miss, corr, score in matches:
        # Whole-word match: str.contains() would interpret the word as a
        # regular expression and also hit longer words that contain it.
        has_word = tokens.map(lambda words, word=miss: word in words)
        rows = work.loc[has_word.astype(bool).to_numpy()].copy()
        rows["miss_word"] = miss
        rows["corr_word"] = corr
        rows["distance"] = score
        frames.append(rows)

    if not frames:
        return pd.DataFrame(
            columns=list(df.columns) + ["miss_word", "corr_word", "distance"]
        )

    # One concat and one stable sort instead of append-and-sort per word
    # (DataFrame.append no longer exists in pandas 2.x).
    return pd.concat(frames, sort=False).sort_values(by="id", kind="mergesort")

In [108]:
# `brand_list` is the name defined in this notebook; `brand_dict` does not
# exist and raised a NameError.
levenshtein_similarity(df, brand_list, 0.5)

Unnamed: 0,item_title,id,miss_word,corr_word,distance
1,pochette voyage louis vuiton,1,vuiton,vuitton,0.143
2,pochette voyage louis vuittonn,2,vuittonn,vuitton,0.125
4,genuine guchi 1980’s vintage navy blue boston bag,4,guchi,gucci,0.2
5,genuine gutti 1980’s vintage navy blue boston bag,5,gutti,gucci,0.4
6,vuittonn and gutti,6,gutti,gucci,0.4
6,vuittonn and gutti,6,vuittonn,vuitton,0.125


# Jaro-Winkler Distance 関数化

In [109]:
# Brand names in their original capitalisation.
brand_list = [
    "Vuitton",
    "Gucci",
]

brand_list

['Vuitton', 'Gucci']

In [110]:
# Sample listing titles: correct spellings plus deliberate misspellings of
# "Vuitton" and "Gucci".
item_title = [
    "Pochette voyage Louis Vuitton",
    "Pochette voyage Louis Vuiton",
    "Pochette voyage Louis Vuittonn",
    "Genuine Gucci 1980’s Vintage Navy Blue Boston Bag",
    "Genuine Guchi 1980’s Vintage Navy Blue Boston Bag",
    "Genuine Gutti 1980’s Vintage Navy Blue Boston Bag",
    "Vuittonn and Gutti",
]

In [111]:
# One row per title; "id" is a running number starting at 0.
title_ids = pd.Series(range(len(item_title)))
df = pd.DataFrame({"id": title_ids, "item_title": item_title})

df

Unnamed: 0,id,item_title
0,0,Pochette voyage Louis Vuitton
1,1,Pochette voyage Louis Vuiton
2,2,Pochette voyage Louis Vuittonn
3,3,Genuine Gucci 1980’s Vintage Navy Blue Boston Bag
4,4,Genuine Guchi 1980’s Vintage Navy Blue Boston Bag
5,5,Genuine Gutti 1980’s Vintage Navy Blue Boston Bag
6,6,Vuittonn and Gutti


In [112]:
import pandas as pd
import numpy as np
from itertools import chain
from pyjarowinkler import distance

In [113]:
def jarowinkler_similarity(df, brand_list, dist=0.75):
    """Find title words that look like misspellings of a brand name.

    Every space-separated word of every title is scored against every brand
    with ``distance.get_jaro_distance(..., winkler=True, scaling=0.1)``.
    Words that differ from the brand and score above ``dist`` are reported.
    Comparison is case-insensitive.

    Args:
        df: DataFrame with an ``"item_title"`` column of strings and an
            ``"id"`` column (used for sorting). It is not modified.
        brand_list: Iterable of correctly spelled brand names.
        dist: Exclusive lower bound on the Jaro-Winkler score. Defaults to
            0.75.

    Returns:
        A DataFrame with one row per (title, misspelled word) pair. It holds
        the columns of ``df`` (``item_title`` lower-cased) plus
        ``miss_word``, ``corr_word`` and ``distance`` (the score), sorted by
        ``id``. When nothing matches, an empty DataFrame with those columns.
    """
    # Work on a copy: lower-casing must not leak into the caller's frame.
    work = df.copy()
    work["item_title"] = work["item_title"].str.lower()
    # Rows without a title have no words to compare.
    work = work.dropna(subset=["item_title"])
    # Empty strings are dropped on both sides.
    # NOTE(review): pyjarowinkler appears to reject empty input - confirm.
    brands = [brand.lower() for brand in brand_list if brand]

    tokens = work["item_title"].str.split(" ")
    # Distinct, non-empty words (consecutive spaces yield "" tokens). Sorted
    # so the result order does not depend on set iteration order.
    vocabulary = sorted({word for word in chain.from_iterable(tokens) if word})

    matches = []
    for word_test in vocabulary:
        for word_dict in brands:
            # Skip correct spellings by comparing the strings, not the score.
            # NOTE(review): the library seems to round scores to two decimals,
            # so a near-identical misspelling could score exactly 1 - confirm.
            if word_test == word_dict:
                continue
            score = distance.get_jaro_distance(
                word_test, word_dict, winkler=True, scaling=0.1
            )
            if score > dist:
                matches.append((word_test, word_dict, round(score, 3)))

    frames = []
    for miss, corr, score in matches:
        # Whole-word match: str.contains() would interpret the word as a
        # regular expression and also hit longer words that contain it.
        has_word = tokens.map(lambda words, word=miss: word in words)
        rows = work.loc[has_word.astype(bool).to_numpy()].copy()
        rows["miss_word"] = miss
        rows["corr_word"] = corr
        rows["distance"] = score
        frames.append(rows)

    if not frames:
        return pd.DataFrame(
            columns=list(df.columns) + ["miss_word", "corr_word", "distance"]
        )

    # One concat and one stable sort instead of append-and-sort per word
    # (DataFrame.append no longer exists in pandas 2.x).
    return pd.concat(frames, sort=False).sort_values(by="id", kind="mergesort")

In [114]:
# Run the Jaro-Winkler check on the sample titles with a 0.75 threshold.
jarowinkler_similarity(df, brand_list, dist=0.75)

Unnamed: 0,item_title,id,miss_word,corr_word,distance
1,pochette voyage louis vuiton,1,vuiton,vuitton,0.97
2,pochette voyage louis vuittonn,2,vuittonn,vuitton,0.98
4,genuine guchi 1980’s vintage navy blue boston bag,4,guchi,gucci,0.91
5,genuine gutti 1980’s vintage navy blue boston bag,5,gutti,gucci,0.79
6,vuittonn and gutti,6,gutti,gucci,0.79
6,vuittonn and gutti,6,vuittonn,vuitton,0.98
