## Extract brand name

In [173]:
def _english_fragments(title):
    '''
    Split a title into its lower-cased runs of English text.

    Every character that is not an ASCII letter is blanked out. Words that
    were separated by exactly one such character (e.g. "Jimmy Choo") stay
    together as a single fragment; two or more in a row start a new fragment.
    A non-string title (e.g. NaN from an empty cell) yields an empty list.
    '''
    import re

    if not isinstance(title, str):
        return []
    # Use [^A-Za-z] rather than the code-point range A..z: that range also
    # covers the punctuation [ \ ] ^ _ ` sitting between 'Z' and 'a'.
    blanked = re.sub(r'[^A-Za-z]', ' ', title).strip().lower()
    # Split on whole runs of 2+ blanks so a fragment never keeps a stray
    # leading blank when the run has odd length.
    return [piece for piece in re.split(r' {2,}', blanked) if piece]


def add_brand(path, sort_by='like_num', n_frequent=1000):
    '''
    Extract brand candidates from the title into a new 'brands' column,
    add a 'sum_num' column (comment_num + like_num + share_num) and sort.

    Parameters
    ----------
    path : path of the Excel workbook to read; its first column is used as
        the index. The sheet must provide the columns 'title',
        'comment_num', 'like_num' and 'share_num'.
    sort_by : column used for the descending sort. Defaults to 'like_num';
        pass 'sum_num' to order by the total of comments, likes and shares.
    n_frequent : how many of the most common Brown-corpus words are treated
        as "not a brand" and removed from the candidates.

    Returns
    -------
    A new DataFrame with duplicate rows dropped, missing values replaced by
    0, 'like_num' converted to float, and the two extra columns added.

    Requires nltk with the Brown corpus available locally.
    '''
    import pandas as pd
    from nltk import FreqDist
    from nltk.corpus import brown

    # read dataframe and preprocess
    df = pd.read_excel(path, index_col=0).drop_duplicates()
    fragments = [_english_fragments(title) for title in df['title']]

    # remove the frequent words (typically not brand names) from the candidates;
    # a set keeps each membership test O(1)
    word_counts = FreqDist(word.lower() for word in brown.words())
    frequent = {word for word, _ in word_counts.most_common(n_frequent)}
    df['brands'] = [[frag for frag in row if frag not in frequent]
                    for row in fragments]
    df = df.fillna(0)

    # The like counter is capped and shown as the string '9999+'; map it to
    # 9999 so the column can be converted to float. Use .loc instead of
    # chained indexing, which may write to a temporary copy.
    capped = df['like_num'] == '9999+'
    if capped.any():
        df.loc[capped, 'like_num'] = '9999'
    df['like_num'] = df['like_num'].astype(float)

    # add a summary column that sums the three numbers, then sort
    df['sum_num'] = df['comment_num'] + df['like_num'] + df['share_num']
    return df.sort_values(sort_by, ascending=False)

In [170]:
# Workbook handed to add_brand; the path is relative to the notebook's
# working directory.
path = r'result/dealmoon_result_2020-07-18.xlsx'

In [171]:
# Read the workbook and derive the 'brands' and 'sum_num' columns.
dft = add_brand(path)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [172]:
# Bare last expression: renders the resulting frame as the cell output.
dft

Unnamed: 0,title,item1,item2,item3,comment_num,like_num,share_num,time,link,brands,sum_num
226,"Walmart 好物汇总 | 卡通小帐篷$9.9, 紫外线消毒灯$25",卡通小帐篷,PampersSwaddlers 纸尿裤 以1号为例 396片,紫外线消毒灯,5243.0,9999.0,0.0,8小时前,https://www.dealmoon.com/cn/daily-update-2019-...,[walmart],15242.0
637,Walmart LEGO乐高积木折扣汇总，低至5.6折每日更新,Lego玩具总动员4系列 Buzz和Woody的嘉年华 10770,Lego经典创意盒中号 10696,LegoMinions和他们的基地 75551,833.0,4280.0,0.0,17小时前,https://www.dealmoon.com/cn/up-to-40-off-lego-...,[walmart lego],5113.0
26,Nordstrom清仓特卖 Jimmy Choo亮片高跟鞋$270,Jimmy ChooRomy尖头高跟鞋,ALI & JAY吊带百褶裙,Badgley MischkaUlanni钻面平底鞋,3877.0,3540.0,2400.0,8小时前,https://www.dealmoon.com/cn/up-to-60-off-extra...,"[nordstrom, jimmy choo]",9817.0
22,上新：SSENSE 2020年中大促，YSL、巴黎世家、Loewe上新,Bottega Veneta卡包,Off-White箭头卫衣,Nike Air Zoom Spiridon 运动鞋,2117.0,2392.0,1917.0,6小时前,https://www.dealmoon.com/cn/up-to-70-off-ssens...,"[ssense, ysl, loewe]",6426.0
42,Coach官网 年中大促 精选鞋包、服饰热卖,CoachTabby 单肩包,Parker 18铆钉斜挎包,CoachCharlie水桶包,2232.0,2225.0,3045.0,8小时前,https://www.dealmoon.com/cn/50-off-coach-summe...,[coach],7502.0
...,...,...,...,...,...,...,...,...,...,...,...
1091,Tory Burch官网 多款美鞋加入折扣区,Tory BurchMiller凉鞋,Tory BurchMiller凉鞋,Tory BurchMinnie 波点芭蕾鞋,2.0,0.0,0.0,2天前,https://www.dealmoon.com/cn/up-to-50-off-tory-...,[tory burch],2.0
928,Women Within 清仓区舒适服饰热卖,连衣裙,印花半裙,牛仔裤,0.0,0.0,0.0,1天前,https://www.dealmoon.com/cn/up-to-80-off-women...,[women within],0.0
236,OshKosh BGosh 婴幼儿入夏服饰上新 尺寸3m-24m,OshKosh B'gosh婴儿、幼童包臀衫,OshKosh B'gosh婴儿、幼童荷叶边印花上衣,OshKosh B'gosh婴儿、幼童背带裤,0.0,0.0,0.0,9小时前,https://www.dealmoon.com/cn/up-to-50-off-oshko...,"[oshkosh bgosh, m, m]",0.0
935,独家：Calvin Klein 男士舒适圆领T恤,Calvin KleinT恤,Calvin KleinT恤,Calvin KleinT恤,0.0,0.0,0.0,1天前,https://www.dealmoon.com/cn/19-calvin-klein-me...,"[calvin klein, t]",0.0
