In [1]:
import pandas as pd
import numpy as np
import csv
import re
import unicodedata
import math
from tqdm import tqdm

from fuzzywuzzy import fuzz, process
from ast import literal_eval


In [2]:
data = pd.read_csv('brand_train.csv')

data = data[['name','brand']]
print('資料總筆數:', len(data))
print('品牌總數:', len(data['brand'].unique()))

資料總筆數: 70046
品牌總數: 4068


In [6]:
# 全形轉半形
def strQ2B(ustring):
    rstring = ""
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:                            # 全形空格直接轉換
            inside_code = 32
        elif 65281 <= inside_code <= 65374:                # 全形字元（除空格）根據關係轉化
            inside_code -= 65248
        rstring += chr(inside_code)
    return rstring

In [370]:
# 拉丁語系轉換，如:biore
def reverse(x: str):
    return ''.join(c for c in unicodedata.normalize('NFD', x)
                  if unicodedata.category(c) != 'Mn')

In [371]:
# 品牌大寫轉小寫
data['brand'] = data['brand'].str.lower()

for i in tqdm(range(len(data))):
    try:
        data['brand'][i] = reverse(strQ2B(data['brand'][i]))
    except:
        pass

100%|██████████████████████████████████| 70046/70046 [00:06<00:00, 11544.90it/s]


In [372]:
oribrand = pd.DataFrame(data['brand'].unique(), columns = ['original_brand'])
oribrand

Unnamed: 0,original_brand
0,元山
1,姍拉娜
2,台鹽
3,neo-tec妮傲絲翠
4,舒潔
...,...
3969,smith's rosebud salve
3970,van pur
3971,sweep robot
3972,malibu beauty


# 整理品牌名單

In [373]:
presave = oribrand.copy(deep=True)

In [384]:
brand_list = presave.copy(deep=True)

## 1. 單一品牌名單（全小寫，順序英+中）

In [380]:
brand_list

Unnamed: 0,original_brand
0,元山
1,姍拉娜
2,台鹽
3,neo-tec妮傲絲翠
4,舒潔
...,...
3969,smith's rosebud salve
3970,van pur
3971,sweep robot
3972,malibu beauty


In [385]:
brand_list['brand_ch'] = ''
brand_list['brand_en'] = ''
brand_list['cleaned_brand'] = ''

for i in tqdm(range(len(brand_list))):
    dtxt = brand_list['original_brand'][i]
    
    if not dtxt.isnumeric():
        # 提出中文
        try:
            # 先提出中英複合詞，e.g. 阿q桶麵, 台灣g霸
            brand_list['brand_ch'][i] = re.search(u"[\u4e00-\u9fa5]+[\u0061-\u007a]+[\u4e00-\u9fa5]+", dtxt).group()
        except AttributeError:
            # 刪除英文字符
            brand_list['brand_ch'][i] = re.sub("( |~|-|·|‧|'|\.|\d+|[0-9]+\.[0-9])?[\u0061-\u007a](~| |-|·|‧|'|\.|!|’|&|/|:)?( |\d+|\+|[0-9]+\.[0-9]|&)?", "", dtxt)

        try:
            # 確認架構是否為中+數字+英 或 英+數字+中
            tmp = re.search(u"[\u4e00-\u9fa5]+( )?\d+( )?[\u0061-\u007a]+|[\u0061-\u007a]+( )?\d+( )?[\u4e00-\u9fa5]+", dtxt).group()
            brand_list['brand_en'][i] = re.sub("( |\d+|[0-9]+\.[0-9])?[\u4e00-\u9fa5]+[\u0061-\u007a]+[\u4e00-\u9fa5]+( )?|( )?[\u4e00-\u9fa5]+(~| |-|·|‧)?", "", dtxt)
        except AttributeError:
            # 提出英文
            brand_list['brand_en'][i] = re.sub("( |\d+|[0-9]+\.[0-9])?[\u4e00-\u9fa5]+[\u0061-\u007a]+[\u4e00-\u9fa5]+( )?|( |\d+|[0-9]+\.[0-9])?[\u4e00-\u9fa5]+(~| |-|·|‧|±|[0-9]+\.[0-9])?(\d+|\+)?", "", dtxt)
        
        brand_list['cleaned_brand'][i] = brand_list['brand_en'][i]+brand_list['brand_ch'][i]
        
    else:
        brand_list['cleaned_brand'][i] = dtxt
        pass

brand_list
    

100%|█████████████████████████████████████| 3974/3974 [00:01<00:00, 3517.46it/s]


Unnamed: 0,original_brand,brand_ch,brand_en,cleaned_brand
0,元山,元山,,元山
1,姍拉娜,姍拉娜,,姍拉娜
2,台鹽,台鹽,,台鹽
3,neo-tec妮傲絲翠,妮傲絲翠,neo-tec,neo-tec妮傲絲翠
4,舒潔,舒潔,,舒潔
...,...,...,...,...
3969,smith's rosebud salve,,smith's rosebud salve,smith's rosebud salve
3970,van pur,,van pur,van pur
3971,sweep robot,,sweep robot,sweep robot
3972,malibu beauty,,malibu beauty,malibu beauty


### ----------
### 暫時存檔（依英文順序排列）

In [386]:
print('單一品牌總數:', len(brand_list['original_brand'].unique()))
print('英文切割總數:', len(brand_list['brand_ch'].unique()))
print('中文切割總數:', len(brand_list['brand_en'].unique()))

單一品牌總數: 3974
英文切割總數: 2385
中文切割總數: 2377


In [387]:
sr = brand_list.sort_values('brand_en', ascending=True)
sr

Unnamed: 0,original_brand,brand_ch,brand_en,cleaned_brand
0,元山,元山,,元山
1946,金安心,金安心,,金安心
1943,虎屋,虎屋,,虎屋
1936,阿娟,阿娟,,阿娟
1935,戀戀情人,戀戀情人,,戀戀情人
...,...,...,...,...
3821,zoobies,,zoobies,zoobies
103,卓威 zowie,卓威,zowie,zowie卓威
2699,日象 zushiang,日象,zushiang,zushiang日象
1383,德國雙人 zwilling,德國雙人,zwilling,zwilling德國雙人


In [388]:
sr.to_csv('./temp/check_brand.csv') 

## -----

## 2. 相關性處理

1. 同品牌但涵蓋範圍不同
2. 純英文與英文加中文 e.g. adidas adidas愛迪達
3. 中文名稱與英文加中文 e.g. 愛迪達 adidas愛迪達
4. 英文與譯名 e.g. adidas 愛迪達 (只能從adidas愛迪達 推斷)

In [398]:
print('單一品牌總數:', len(brand_list['original_brand'].unique()))
print("清理後品牌總數:", len(brand_list['cleaned_brand'].unique()))

單一品牌總數: 3974
清理後品牌總數: 3865


In [495]:
clunibrand = brand_list.drop_duplicates(subset=['cleaned_brand'],keep='first').reset_index()
clunibrand = clunibrand[['cleaned_brand']]
clunibrand

Unnamed: 0,cleaned_brand
0,元山
1,姍拉娜
2,台鹽
3,neo-tec妮傲絲翠
4,舒潔
...,...
3860,smith's rosebud salve
3861,van pur
3862,sweep robot
3863,malibu beauty


In [507]:
cbrand = clunibrand.copy(deep=True)
cbrandls = clunibrand.copy(deep=True)['cleaned_brand'].tolist()

In [519]:
cbrand['sim'] = cbrand['cleaned_brand'].apply(lambda x: process.extractBests(x, cbrandls, scorer=fuzz.token_sort_ratio, limit=20, score_cutoff=50))

In [535]:
cbrand

Unnamed: 0,cleaned_brand,sim,certain
0,元山,"[(元山牌, 80), (泰山, 50), (雪山, 50), (香山, 50), (山森,...","[(元山牌, 80)]"
1,姍拉娜,[],[]
2,台鹽,"[(台鹽生技, 67), (台酒, 50), (台啤, 50), (台農, 50), (臺鹽...",[]
3,neo-tec妮傲絲翠,"[(neostrata芯絲翠, 52), (neocell妮兒, 50)]",[]
4,舒潔,"[(舒跑, 50), (朵舒, 50), (舒澡, 50), (潔倍, 50), (潔芬, ...",[]
...,...,...,...
3860,smith's rosebud salve,[],[]
3861,van pur,"[(pureal, 62), (urban veda, 59), (tarn ju, 57)...",[]
3862,sweep robot,"[(irobot, 59), (rosette, 56), (brook’s, 56), (...",[]
3863,malibu beauty,"[(mandom beauty, 69), (pure beauty, 67), (beau...",[]


In [533]:
cbrand['certain'] = ''
for i in tqdm(range(len(cbrand))):
    tmplst = []
    ctlt = []
    for name , score in cbrand['sim'][i]:
        if score == 100:
            continue
        elif score >= 80:
            ctlt.append((name, score))
        tmplst.append((name, score))
    cbrand['sim'][i] = tmplst
    cbrand['certain'][i] = ctlt

100%|█████████████████████████████████████| 3865/3865 [00:00<00:00, 5743.24it/s]


### 轉出做人工判定

In [536]:
cbrand = cbrand[['cleaned_brand', 'certain', 'sim']]
cbrand.to_csv('./temp/checklist.csv')

In [526]:
len(cbrand.loc[cbrand['sim'].str.len() != 0])

3217

In [558]:
ts = pd.read_csv('brand_zhen.csv')
ts

Unnamed: 0.1,Unnamed: 0,name,brand
0,0,元山熱水瓶ys540ap,元山
1,1,姍拉娜治痘洗面乳,姍拉娜
2,2,台鹽海洋鹼性離子水,台鹽
3,3,妮傲絲翠果酸深層保養乳液,neo-tec妮傲絲翠
4,4,舒潔棉柔舒適迪士尼抽取式衛生紙,舒潔
...,...,...,...
70041,70041,惠而浦16公斤瓦斯型滾筒蒸氣可堆疊乾衣機8twgd6622hw,whirlpool惠而浦
70042,70042,康乃馨成人紙尿褲l,康乃馨
70043,70043,日本芮芙茹零矽靈洗髮露頭皮保養,reveur芮芙茹
70044,70044,crest長效鎖白牙膏輕柔鑽白,crestnan


In [559]:
ts = ts[['name','brand']]
print('資料總筆數:', len(ts))
print('品牌總數:', len(ts['brand'].unique()))

資料總筆數: 70046
品牌總數: 3858


In [560]:
ts['brand_ch'] = ''
ts['brand_en'] = ''

for i in tqdm(range(len(brand_list))):

Unnamed: 0,name,brand
0,元山熱水瓶ys540ap,元山
1,姍拉娜治痘洗面乳,姍拉娜
2,台鹽海洋鹼性離子水,台鹽
3,妮傲絲翠果酸深層保養乳液,neo-tec妮傲絲翠
4,舒潔棉柔舒適迪士尼抽取式衛生紙,舒潔
...,...,...
70041,惠而浦16公斤瓦斯型滾筒蒸氣可堆疊乾衣機8twgd6622hw,whirlpool惠而浦
70042,康乃馨成人紙尿褲l,康乃馨
70043,日本芮芙茹零矽靈洗髮露頭皮保養,reveur芮芙茹
70044,crest長效鎖白牙膏輕柔鑽白,crestnan


## ---

## 中對中檢查

In [561]:
testb = brand_list.drop_duplicates(subset=['cleaned_brand'],keep='first').reset_index()
testb = testb[['brand_ch','brand_en','cleaned_brand']]
testb

Unnamed: 0,brand_ch,brand_en,cleaned_brand
0,元山,,元山
1,姍拉娜,,姍拉娜
2,台鹽,,台鹽
3,妮傲絲翠,neo-tec,neo-tec妮傲絲翠
4,舒潔,,舒潔
...,...,...,...
3860,,smith's rosebud salve,smith's rosebud salve
3861,,van pur,van pur
3862,,sweep robot,sweep robot
3863,,malibu beauty,malibu beauty


In [564]:
chtest = testb[(testb['brand_ch'] != '') & (testb['brand_en'] == '')]

In [566]:
chtest['sim'] = chtest['brand_ch'].apply(lambda x: process.extractBests(x, chtest['brand_ch'].tolist(), scorer=fuzz.token_sort_ratio, limit=20, score_cutoff=50))
chtest

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chtest['sim'] = chtest['brand_ch'].apply(lambda x: process.extractBests(x, chtest['brand_ch'].tolist(), scorer=fuzz.token_sort_ratio, limit=20, score_cutoff=50))


Unnamed: 0,brand_ch,brand_en,cleaned_brand,sim
0,元山,,元山,"[(元山, 100), (元山牌, 80), (泰山, 50), (雪山, 50), (香山..."
1,姍拉娜,,姍拉娜,"[(姍拉娜, 100)]"
2,台鹽,,台鹽,"[(台鹽, 100), (台鹽生技, 67), (台酒, 50), (台啤, 50), (台..."
4,舒潔,,舒潔,"[(舒潔, 100), (舒跑, 50), (朵舒, 50), (舒澡, 50), (潔倍,..."
9,龜甲萬,,龜甲萬,"[(龜甲萬, 100)]"
...,...,...,...,...
3842,華菱,,華菱,"[(華菱, 100), (華佗, 50), (華陀, 50), (富華, 50), (華冠,..."
3845,台塩,,台塩,"[(台塩, 100), (台鹽, 50), (台酒, 50), (台啤, 50), (台農,..."
3847,上元,,上元,"[(上元, 100), (元山, 50), (上豪, 50), (東元, 50), (上田,..."
3848,富基-10,,富基-10,"[(富基-10, 100)]"


In [570]:
chtest.reset_index(inplace=True)
chtest = chtest[['brand_ch','brand_en','sim','cleaned_brand']]

In [571]:
chtest

Unnamed: 0,brand_ch,brand_en,cleaned_brand,sim
0,元山,,元山,"[(元山牌, 80), (泰山, 50), (雪山, 50), (香山, 50), (山森,..."
1,姍拉娜,,姍拉娜,[]
2,台鹽,,台鹽,"[(台鹽生技, 67), (台酒, 50), (台啤, 50), (台農, 50), (臺鹽..."
3,舒潔,,舒潔,"[(舒潔, 100), (舒跑, 50), (朵舒, 50), (舒澡, 50), (潔倍,..."
4,龜甲萬,,龜甲萬,"[(龜甲萬, 100)]"
...,...,...,...,...
1370,華菱,,華菱,"[(華菱, 100), (華佗, 50), (華陀, 50), (富華, 50), (華冠,..."
1371,台塩,,台塩,"[(台塩, 100), (台鹽, 50), (台酒, 50), (台啤, 50), (台農,..."
1372,上元,,上元,"[(上元, 100), (元山, 50), (上豪, 50), (東元, 50), (上田,..."
1373,富基-10,,富基-10,"[(富基-10, 100)]"


In [572]:
chtest['certain'] = ''
chtest['U'] = 0
for i in tqdm(range(len(chtest))):
    tmplst = []
    ctlt = []
    for name , score in chtest['sim'][i]:
        if score == 100:
            continue
        elif score >= 80:
            ctlt.append((name, score))
        tmplst.append((name, score))
    if len(ctlt) != 0:
        chtest['U'][i] = 1
    chtest['sim'][i] = tmplst
    chtest['certain'][i] = ctlt

chtest

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chtest['certain'] = ''
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chtest['sim'][i] = tmplst
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chtest['certain'][i] = ctlt
100%|██████████████████████████████████████| 1375/1375 [00:07<00:00, 182.15it/s]


Unnamed: 0,brand_ch,brand_en,cleaned_brand,sim,certain
0,元山,,元山,"[(元山牌, 80), (泰山, 50), (雪山, 50), (香山, 50), (山森,...","[(元山牌, 80)]"
1,姍拉娜,,姍拉娜,[],[]
2,台鹽,,台鹽,"[(台鹽生技, 67), (台酒, 50), (台啤, 50), (台農, 50), (臺鹽...",[]
3,舒潔,,舒潔,"[(舒跑, 50), (朵舒, 50), (舒澡, 50), (潔倍, 50), (潔芬, ...",[]
4,龜甲萬,,龜甲萬,[],[]
...,...,...,...,...,...
1370,華菱,,華菱,"[(華佗, 50), (華陀, 50), (富華, 50), (華冠, 50), (西華, ...",[]
1371,台塩,,台塩,"[(台鹽, 50), (台酒, 50), (台啤, 50), (台農, 50), (台畜, ...",[]
1372,上元,,上元,"[(元山, 50), (上豪, 50), (東元, 50), (上田, 50), (山元, ...",[]
1373,富基-10,,富基-10,[],[]


In [575]:
# 轉出做人工
chtest.to_csv('./temp/chtest.csv')

In [620]:
finzh = pd.read_csv('./temp/finchtest.csv', converters={"certain": literal_eval})

In [621]:
finzh = finzh[finzh['U'] != 2]
finzh.reset_index(inplace=True)
finzh = finzh[['brand_ch','certain','U']]
print('中文名稱總數:', len(finzh))

中文名稱總數: 1306


In [622]:
# 整理中文資料

for i in tqdm(range(len(finzh))):
    namelst = []
    for name, score in finzh['certain'][i]:
        namelst.append(name)
    finzh['certain'][i] = namelst
finzh

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finzh['certain'][i] = namelst
100%|██████████████████████████████████████| 1306/1306 [00:03<00:00, 356.45it/s]


Unnamed: 0,brand_ch,certain,U
0,元山,[元山牌],1
1,姍拉娜,[],0
2,台鹽,"[臺鹽, 台塩]",1
3,舒潔,[],0
4,龜甲萬,[],0
...,...,...,...
1301,華菱,[],0
1302,上元,[],0
1303,富基-10,[],0
1304,一午一食,[],0


In [623]:
finzh.to_csv('./temp/已檢查僅中文.csv')

## -----

## 英對英檢查

In [624]:
entest = testb[(testb['brand_ch'] == '') & (testb['brand_en'] != '')]
print('僅英文名稱總數:', len(entest))

僅英文名稱總數: 1267


In [630]:
entest['sim'] = entest['brand_en'].apply(lambda x: process.extractBests(x, entest['brand_en'].tolist(), scorer=fuzz.token_sort_ratio, limit=20, score_cutoff=50))
entest

Unnamed: 0,brand_en,sim,cleaned_brand,certain,check
0,refine,"[(refine, 100), (lirene, 67), (fine color, 62)...",refine,[],1
1,i'm meme,"[(i'm meme, 100), (heme, 50), (idee, 50)]",i'm meme,[],1
2,chicco,"[(chicco, 100), (ok choice, 67), (maychic, 62)...",chicco,[],1
3,kirin,"[(kirin, 100), (kiki, 67), (iris, 67), (i-kire...",kirin,[],1
4,cezanne,"[(cezanne, 100), (enne, 73), (ehrmann, 57), (b...",cezanne,[],1
...,...,...,...,...,...
1262,smith's rosebud salve,"[(smith's rosebud salve, 100)]",smith's rosebud salve,[],0
1263,van pur,"[(van pur, 100), (pureal, 62), (urban veda, 59...",van pur,[],1
1264,sweep robot,"[(sweep robot, 100), (irobot, 59), (rosette, 5...",sweep robot,[],1
1265,malibu beauty,"[(malibu beauty, 100), (mandom beauty, 69), (p...",malibu beauty,[],1


In [631]:
entest.reset_index(inplace=True)
entest = entest[['brand_en','sim','cleaned_brand']]

In [632]:
entest['certain'] = ''
entest['check'] = 0
for i in tqdm(range(len(entest))):
    tmplst = []
    ctlt = []
    for name , score in entest['sim'][i]:
        if score == 100:
            continue
        elif score >= 80:
            ctlt.append((name, score))
        tmplst.append((name, score))
    if len(ctlt) != 0:
        entest['check'][i] = 1
    entest['sim'][i] = tmplst
    entest['certain'][i] = ctlt

entest

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entest['certain'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entest['check'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entest['sim'][i] = tmplst
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

Unnamed: 0,brand_en,sim,cleaned_brand,certain,check
0,refine,"[(lirene, 67), (fine color, 62), (fresh line, ...",refine,[],0
1,i'm meme,"[(heme, 50), (idee, 50)]",i'm meme,[],0
2,chicco,"[(ok choice, 67), (maychic, 62), (chacott, 62)...",chicco,[],0
3,kirin,"[(kiki, 67), (iris, 67), (i-kirei, 67), (orion...",kirin,[],0
4,cezanne,"[(enne, 73), (ehrmann, 57), (beanies, 57), (co...",cezanne,[],0
...,...,...,...,...,...
1262,smith's rosebud salve,[],smith's rosebud salve,[],0
1263,van pur,"[(pureal, 62), (urban veda, 59), (tarn ju, 57)...",van pur,[],0
1264,sweep robot,"[(irobot, 59), (rosette, 56), (brook’s, 56), (...",sweep robot,[],0
1265,malibu beauty,"[(mandom beauty, 69), (pure beauty, 67), (beau...",malibu beauty,[],0


In [633]:
entest.to_csv('./temp/entest.csv')

In [636]:
finen = pd.read_csv('./temp/finentest.csv', converters={"certain": literal_eval})
finen = finen[finen['check'] != 2]
finen.reset_index(inplace=True)
finen = finen[['brand_en','certain','check']]
print('英文名稱總數:', len(finzh))

英文名稱總數: 1306


In [637]:
# 整理英文資料

for i in tqdm(range(len(finen))):
    namelst = []
    for name, score in finen['certain'][i]:
        namelst.append(name)
    finen['certain'][i] = namelst
finen

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finen['certain'][i] = namelst
100%|██████████████████████████████████████| 1259/1259 [00:03<00:00, 360.75it/s]


Unnamed: 0,brand_en,certain,check
0,refine,[],0
1,i'm meme,[],0
2,chicco,[],0
3,kirin,[],0
4,cezanne,[],0
...,...,...,...
1254,van pur,[],0
1255,sweep robot,[],0
1256,malibu beauty,[],0
1257,s.pellegrino,[],0


In [694]:
finen.to_csv('./temp/已檢查僅英文.csv', encoding="utf-8")

## --

## 中英比對僅中文或僅英文

In [650]:
bttest = testb[(testb['brand_ch'] != '') & (testb['brand_en'] != '')]
print('中英名稱總數:', len(bttest))

中英名稱總數: 1217


In [653]:
bttest = bttest.reset_index()
bttest = bttest[['brand_ch','brand_en','cleaned_brand']]

### 中文比對

In [655]:
bttest['sim_ch'] = bttest['brand_ch'].apply(lambda x: process.extractBests(x, finzh['brand_ch'].tolist(), scorer=fuzz.token_sort_ratio, limit=20, score_cutoff=90))



In [670]:
bttest

Unnamed: 0,brand_ch,brand_en,cleaned_brand,sim_ch
0,妮傲絲翠,neo-tec,neo-tec妮傲絲翠,[]
1,飛利浦,philips,philips飛利浦,"[(飛利浦, 100)]"
2,專科,senka,senka專科,"[(專科, 100)]"
3,芳珂,fancl,fancl芳珂,[]
4,紅牛,red bull,red bull紅牛,"[(紅牛, 100)]"
...,...,...,...,...
1212,樂格,log,log樂格,[]
1213,陪心寵糧,nu4pet,nu4pet陪心寵糧,[]
1214,渴達,tundra,tundra渴達,[]
1215,御皇居,royal,royal御皇居,"[(御皇居, 100)]"


In [649]:
# bttest.to_csv('./temp/simch.csv')

### 中+英 檢查中文是否有重複並標記

中文重複

In [669]:
rpls = bttest[bttest.duplicated('brand_ch')]['brand_ch'].to_list()

In [671]:
bttest['repeat'] = 0
for i in range(len(bttest)):
    if bttest['brand_ch'][i] in rpls:
        bttest['repeat'][i] = 1        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bttest['repeat'][i] = 1


In [673]:
bttest.to_csv('./temp/chsame.csv')

In [688]:
bttest = pd.read_csv('./temp/chrped.csv', converters={"sim_ch": literal_eval})
bttest = bttest[['brand_ch','brand_en','cleaned_brand','sim_ch','repeat']]

In [689]:
# 整理中+英 比對 中文資料

for i in tqdm(range(len(bttest))):
    namelst = []
    for name, score in bttest['sim_ch'][i]:
        namelst.append(name)
    bttest['sim_ch'][i] = namelst
bttest

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bttest['sim_ch'][i] = namelst
100%|██████████████████████████████████████| 1217/1217 [00:03<00:00, 356.52it/s]


Unnamed: 0,brand_ch,brand_en,cleaned_brand,sim_ch,repeat
0,妮傲絲翠,neo-tec,neo-tec妮傲絲翠,[],0
1,飛利浦,philips,philips飛利浦,[飛利浦],0
2,專科,senka,senka專科,[專科],0
3,芳珂,fancl,fancl芳珂,[],0
4,紅牛,red bull,red bull紅牛,[],0
...,...,...,...,...,...
1212,樂格,log,log樂格,[],0
1213,陪心寵糧,nu4pet,nu4pet陪心寵糧,[],0
1214,渴達,tundra,tundra渴達,[],0
1215,御皇居,royal,royal御皇居,[御皇居],0


In [691]:
print('比對中文後總數：',len(bttest[bttest['repeat'] == 0]))

比對中文後總數： 1186


### 英文比對

In [696]:
bttest['sim_en'] = bttest['brand_en'].apply(lambda x: process.extractBests(x, finen['brand_en'].tolist(), scorer=fuzz.token_sort_ratio, limit=20, score_cutoff=80))

In [697]:
bttest

Unnamed: 0,brand_ch,brand_en,cleaned_brand,sim_ch,repeat,sim_en
0,妮傲絲翠,neo-tec,neo-tec妮傲絲翠,[],0,[]
1,飛利浦,philips,philips飛利浦,[飛利浦],0,[]
2,專科,senka,senka專科,[專科],0,[]
3,芳珂,fancl,fancl芳珂,[],0,"[(fancl, 100)]"
4,紅牛,red bull,red bull紅牛,[],0,[]
...,...,...,...,...,...,...
1212,樂格,log,log樂格,[],0,"[(lg, 80)]"
1213,陪心寵糧,nu4pet,nu4pet陪心寵糧,[],0,[]
1214,渴達,tundra,tundra渴達,[],0,[]
1215,御皇居,royal,royal御皇居,[御皇居],0,[]


In [698]:
rplsen = bttest[bttest.duplicated('brand_en')]['brand_en'].to_list()

In [699]:
bttest['repeaten'] = 0
for i in range(len(bttest)):
    if bttest['brand_en'][i] in rplsen:
        bttest['repeaten'][i] = 1  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bttest['repeaten'][i] = 1


In [700]:
bttest.to_csv('./temp/ensame.csv')

In [1018]:
enchdf = pd.read_csv('./temp/enchreped.csv', converters={"sim_ch": literal_eval, "sim_en": literal_eval})
enchdf = enchdf[['brand_ch','brand_en','cleaned_brand','sim_ch','repeat','sim_en','repeaten']]
chdf = pd.read_csv('./temp/已檢查僅中文.csv', converters={"certain": literal_eval})
chdf = chdf[['brand_ch','certain']]
endf = pd.read_csv('./temp/已檢查僅英文.csv', converters={"certain": literal_eval})
endf = endf[['brand_en','certain']]

In [1019]:
# 整理中+英 比對 英文資料

for i in tqdm(range(len(enchdf))):
    namelst = []
    for name, score in enchdf['sim_en'][i]:
        namelst.append(name)
    enchdf['sim_en'][i] = namelst

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enchdf['sim_en'][i] = namelst
100%|██████████████████████████████████████| 1217/1217 [00:03<00:00, 345.10it/s]


In [1020]:
print('比對中英文後總數：',len(enchdf[(enchdf['repeat'] == 0) & (enchdf['repeaten'] == 0)]))

比對中英文後總數： 1167


In [1021]:
enchdf = enchdf[(enchdf['repeat'] == 0) & (enchdf['repeaten'] == 0)]
enchdf.reset_index(inplace=True)

In [1022]:
enchdf = enchdf[['brand_ch','brand_en','cleaned_brand','sim_ch','sim_en']]

In [1023]:
enchdf

Unnamed: 0,brand_ch,brand_en,cleaned_brand,sim_ch,sim_en
0,妮傲絲翠,neo-tec,neo-tec妮傲絲翠,[],[]
1,飛利浦,philips,philips飛利浦,[飛利浦],[]
2,專科,senka,senka專科,[專科],[]
3,芳珂,fancl,fancl芳珂,[],[fancl]
4,紅牛,red bull,red bull紅牛,[],[]
...,...,...,...,...,...
1162,樂格,log,log樂格,[],[]
1163,陪心寵糧,nu4pet,nu4pet陪心寵糧,[],[]
1164,渴達,tundra,tundra渴達,[],[]
1165,御皇居,royal,royal御皇居,[御皇居],[]


In [1024]:
enchdf['similar'] = ''

for i in tqdm(range(len(enchdf))):
    # 若皆沒有相似則跳過
    if len(enchdf['sim_en'][i]) == 0 and len(enchdf['sim_ch'][i]) == 0:
        enchdf['similar'][i] = []
        continue
    
    enchdf['similar'][i] = []
    # 是否英文有相似
    if len(enchdf['sim_en'][i]) != 0:
        for name in enchdf['sim_en'][i]:
            idx = endf[endf['brand_en'] == name].index  # 取得在英文資料集的位置
            if len(idx) == 0: # 若沒有找到（在中+英有相似但不在英文資料集內）
                enchdf['similar'][i].append(name)
            else: # 如果有找到
                enchdf['similar'][i].append(name)
                for enname in endf['certain'][idx[0]]:
                    enchdf['similar'][i].append(enname) # 逐一加入
                endf.drop(idx[0], inplace = True) # 將資料刪除（因為已轉移至中+英）
                
    # 是否中文有相似
    if len(enchdf['sim_ch'][i]) != 0:
        for name in enchdf['sim_ch'][i]:
            idx = chdf[chdf['brand_ch'] == name].index # 取得在中文資料集的位置
            if len(idx) == 0: # 若沒有找到（在中+英有相似但不在英文資料集內）
                enchdf['similar'][i].append(name)
            else: # 如果有找到
                enchdf['similar'][i].append(name)
                for chname in chdf['certain'][idx[0]]:
                    enchdf['similar'][i].append(chname) # 逐一加入
                chdf.drop(idx[0], inplace = True) # 將資料刪除（因為已轉移至中+英）

100%|█████████████████████████████████████| 1167/1167 [00:00<00:00, 4961.48it/s]


## 合併

In [1025]:
# 中英混合
enchdf = enchdf[['cleaned_brand', 'similar']]
enchdf

Unnamed: 0,cleaned_brand,similar
0,neo-tec妮傲絲翠,[]
1,philips飛利浦,[飛利浦]
2,senka專科,[專科]
3,fancl芳珂,[fancl]
4,red bull紅牛,[]
...,...,...
1162,log樂格,[]
1163,nu4pet陪心寵糧,[]
1164,tundra渴達,[]
1165,royal御皇居,[御皇居]


In [1026]:
# 純中文
chdf

Unnamed: 0,brand_ch,certain
0,元山,[元山牌]
1,姍拉娜,[]
2,台鹽,"[臺鹽, 台塩]"
3,舒潔,[]
4,龜甲萬,[]
...,...,...
1301,華菱,[]
1302,上元,[]
1303,富基-10,[]
1304,一午一食,[]


In [1027]:
# 純英文
endf

Unnamed: 0,brand_en,certain
0,refine,[]
1,i'm meme,[]
2,chicco,[]
3,kirin,[]
4,cezanne,[]
...,...,...
1254,van pur,[]
1255,sweep robot,[]
1256,malibu beauty,[]
1257,s.pellegrino,[]


In [1028]:
# 純數字
numdf = oribrand[oribrand['original_brand'].str.isnumeric() == 1]
numdf.reset_index(inplace=True)
numdf['similar'] = ''
for i in range(len(numdf)):
    numdf['similar'][i] = []
numdf = numdf[['original_brand', 'similar']]
numdf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numdf['similar'] = ''
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numdf['similar'][i] = []


Unnamed: 0,original_brand,similar
0,五五五,[]
1,1028,[]
2,360,[]
3,080,[]
4,零零,[]
5,2080,[]


In [1029]:
enchdf = enchdf.rename({'cleaned_brand': 'brand'}, axis=1)
chdf = chdf.rename({'brand_ch': 'brand', 'certain': 'similar'}, axis=1)
endf = endf.rename({'brand_en': 'brand', 'certain': 'similar'}, axis=1)
numdf = numdf.rename({'original_brand': 'brand'}, axis=1)

In [1030]:
finaldf = pd.concat([enchdf,chdf,endf,numdf])
finaldf.reset_index(inplace=True)
finaldf = finaldf[['brand', 'similar']]

In [1031]:
finaldf

Unnamed: 0,brand,similar
0,neo-tec妮傲絲翠,[]
1,philips飛利浦,[飛利浦]
2,senka專科,[專科]
3,fancl芳珂,[fancl]
4,red bull紅牛,[]
...,...,...
3479,1028,[]
3480,360,[]
3481,080,[]
3482,零零,[]


In [1005]:
bddf = pd.read_csv('./temp/forfinaldf.csv')

In [1007]:
bddf = bddf[['name', 'brand', 'clean_name', 'clean_brand', 'brand_ch', 'brand_en', 'cleaned_brand']]
bddf

Unnamed: 0,name,brand,clean_name,clean_brand,brand_ch,brand_en,cleaned_brand
0,元山熱水瓶ys-540ap,元山,元山熱水瓶ys540ap,元山,元山,,元山
1,356612@姍拉娜治痘洗面乳150g,姍拉娜,姍拉娜治痘洗面乳,姍拉娜,姍拉娜,,姍拉娜
2,台鹽海洋鹼性離子水 600ml,台鹽,台鹽海洋鹼性離子水,台鹽,台鹽,,台鹽
3,妮傲絲翠果酸深層保養乳液,NEO-TEC妮傲絲翠,妮傲絲翠果酸深層保養乳液,neo-tec妮傲絲翠,妮傲絲翠,neo-tec,neo-tec妮傲絲翠
4,舒潔棉柔舒適迪士尼抽取式衛生紙 100抽16入,舒潔,舒潔棉柔舒適迪士尼抽取式衛生紙,舒潔,舒潔,,舒潔
...,...,...,...,...,...,...,...
70041,惠而浦16公斤瓦斯型滾筒蒸氣(可堆疊)乾衣機8twgd6622hw,惠而浦 Whirlpool,惠而浦瓦斯型滾筒蒸氣可堆疊乾衣機8twgd6622hw,惠而浦 whirlpool,惠而浦,whirlpool,whirlpool惠而浦
70042,康乃馨成人紙尿褲l,康乃馨,康乃馨成人紙尿褲l,康乃馨,康乃馨,,康乃馨
70043,日本芮芙茹零矽靈洗髮露460ml頭皮保養,芮芙茹 Reveur,日本芮芙茹零矽靈洗髮露頭皮保養,芮芙茹 reveur,芮芙茹,reveur,reveur芮芙茹
70044,crest長效鎖白牙膏-輕柔鑽白99g,crest,crest長效鎖白牙膏輕柔鑽白,crest,,crest,crest


In [1008]:
for i in range(len(bddf)):
    try:
        if math.isnan(bddf['brand_ch'][i]):
            bddf['brand_ch'][i] = ''
    except:
        pass
    try:
        if math.isnan(bddf['brand_en'][i]):
            bddf['brand_en'][i] = ''
    except:
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bddf['brand_en'][i] = ''
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bddf['brand_ch'][i] = ''


In [1009]:
bddf = bddf[['name', 'brand', 'clean_name', 'brand_ch', 'brand_en', 'cleaned_brand']]
bddf

Unnamed: 0,name,brand,clean_name,brand_ch,brand_en,cleaned_brand
0,元山熱水瓶ys-540ap,元山,元山熱水瓶ys540ap,元山,,元山
1,356612@姍拉娜治痘洗面乳150g,姍拉娜,姍拉娜治痘洗面乳,姍拉娜,,姍拉娜
2,台鹽海洋鹼性離子水 600ml,台鹽,台鹽海洋鹼性離子水,台鹽,,台鹽
3,妮傲絲翠果酸深層保養乳液,NEO-TEC妮傲絲翠,妮傲絲翠果酸深層保養乳液,妮傲絲翠,neo-tec,neo-tec妮傲絲翠
4,舒潔棉柔舒適迪士尼抽取式衛生紙 100抽16入,舒潔,舒潔棉柔舒適迪士尼抽取式衛生紙,舒潔,,舒潔
...,...,...,...,...,...,...
70041,惠而浦16公斤瓦斯型滾筒蒸氣(可堆疊)乾衣機8twgd6622hw,惠而浦 Whirlpool,惠而浦瓦斯型滾筒蒸氣可堆疊乾衣機8twgd6622hw,惠而浦,whirlpool,whirlpool惠而浦
70042,康乃馨成人紙尿褲l,康乃馨,康乃馨成人紙尿褲l,康乃馨,,康乃馨
70043,日本芮芙茹零矽靈洗髮露460ml頭皮保養,芮芙茹 Reveur,日本芮芙茹零矽靈洗髮露頭皮保養,芮芙茹,reveur,reveur芮芙茹
70044,crest長效鎖白牙膏-輕柔鑽白99g,crest,crest長效鎖白牙膏輕柔鑽白,,crest,crest


In [1035]:
fbranlst = finaldf.copy(deep=True)

In [1036]:
fbranlst['same'] = ''

In [1037]:
tmplen = len(fbranlst)
for i in range(tmplen):
    for name in fbranlst['similar'][i]:
        fbranlst = fbranlst.append({
            "brand": name,
            "similar": [],
            "same": fbranlst['brand'][i]
        }, ignore_index=True)

  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranl

  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranl

  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranl

  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranl

  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranl

  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranl

  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranlst = fbranlst.append({
  fbranl

In [1038]:
for i in range(len(bddf)):
    searchres = fbranlst[fbranlst['brand'] == bddf['cleaned_brand'][i]]
    if len(searchres) == 0:
        print("index:", i)
        print("No Matched Search:", bddf['cleaned_brand'][i])
    elif len(searchres) == 1:
        samewd = fbranlst['same'][searchres.index[0]] 
        if samewd == '': # 是主值
            if bddf['brand'][i] not in fbranlst['similar'][searchres.index[0]]: # 如果未清洗的品牌名稱不在相似表內
                fbranlst['similar'][searchres.index[0]].append(bddf['brand'][i]) # 加入未清洗的名稱
        else: # 不是主值
            # 如果未清洗的品牌名稱不在主值相似表內
            if bddf['brand'][i] not in fbranlst['similar'][fbranlst[fbranlst['brand'] == samewd].index[0]]: 
                # 加入未清洗的名稱
                fbranlst['similar'][fbranlst[fbranlst['brand'] == samewd].index[0]].append(bddf['brand'][i]) 
    else:
        print("index:", i)
        print("Multiple Results:", bddf['cleaned_brand'][i])
        

print('----END----')

index: 1223
Multiple Results: 生活
index: 2249
Multiple Results: 愛生活
index: 2372
Multiple Results: 生活
index: 2998
Multiple Results: 屏大
index: 3594
Multiple Results: 愛力
index: 4821
Multiple Results: 皇冠
index: 5556
Multiple Results: 生活
index: 5724
Multiple Results: 極淨適
index: 6367
Multiple Results: 生活
index: 6594
Multiple Results: 生活
index: 7621
Multiple Results: 生活
index: 7758
Multiple Results: 生活
index: 10026
Multiple Results: 屏科大
index: 10263
Multiple Results: 生活
index: 10977
Multiple Results: 生活
index: 11340
Multiple Results: 生活
index: 11358
Multiple Results: d-up
index: 11497
Multiple Results: 生活
index: 13286
Multiple Results: 生活
index: 14302
Multiple Results: d-up
index: 14757
Multiple Results: 生活
index: 14881
Multiple Results: 生活
index: 15377
Multiple Results: 屏科大
index: 15516
Multiple Results: 屏大
index: 15766
Multiple Results: 生活
index: 16120
Multiple Results: 生活
index: 16170
Multiple Results: 屏大
index: 16742
Multiple Results: 生活
index: 17322
Multiple Results: 屏科大
index: 17486
Mult

In [1041]:
finalreslst = fbranlst[fbranlst['same'] == '']

In [1043]:
finalreslst = finalreslst[['brand', 'similar']]
finalreslst

Unnamed: 0,brand,similar
0,neo-tec妮傲絲翠,"[NEO-TEC妮傲絲翠, NEO-TEC 妮傲絲翠]"
1,philips飛利浦,"[飛利浦, 飛利浦 Philips, 飛利浦 PHILIPS]"
2,senka專科,"[專科, SENKA 專科]"
3,fancl芳珂,"[fancl, FANCL 芳珂, FANCL]"
4,red bull紅牛,[紅牛 Red Bull]
...,...,...
3479,1028,[1028]
3480,360,[360]
3481,080,[080]
3482,零零,[零零]


In [1044]:
finalreslst.to_csv('單一品牌內容.csv')