In [25]:
from collections import Counter
import pandas as pd
import numpy as np
from itertools import chain

intermediate_directory = 'data'
reviews = os.path.join(intermediate_directory,
                                   'full_data_appliances.txt')
reviews_new = os.path.join(intermediate_directory,
                                   'full_data_appliances_new.txt')
compare = os.path.join(intermediate_directory,
                                   'compare_appliances.txt')
grouped = os.path.join(intermediate_directory,
                                   'full_data_appliancesG.txt')

# reading the old full data and adding skus to a dictionary
skus = []
with open(reviews,'r') as f:
    for line in f:
        sku = line.split('; ')[0]
        skus.append(sku)
    dict1 = Counter(skus)
    
# reading the new full data and adding skus to a dictionary
skus2 = []
with open(reviews_new,'r') as f2:
    for line in f2:
        sku = line.split('; ')[0]
        skus2.append(sku)
    dict2 = Counter(skus2)
print(len(set(skus2)))

# Comparison of two files based on the number of reviews
keys = set()
with open(compare,'w') as fout:
    for key, value in dict1.items():
        for key2, value2 in dict2.items():
            if key2 not in dict1.keys():
                if key2 not in keys:
                    fout.write(str(key2) + '\t' + '0' + '\t' + str(value2) + '\n')
                    keys.add(key2)

keys2 = set()
with open(compare,'a') as fout:
    for key, value in dict1.items():
        for key2, value2 in dict2.items():
            if key == key2:
                if key not in keys2:
                    fout.write(str(key) + '\t' + str(value) + '\t' + str(value2) + '\n')
                    keys2.add(key)

skus_set = set()
number = set()
with open(compare,'r') as fin:
    rev = pd.read_table(reviews_new,
        sep='; ', header=None,
        names=['sku', 'name', 'id', 'h_votes', 'noth_votes', 'merchant_id', 'date', 'rating', 'comment'])
    for line in fin:
        old = line.split()[1]
        new = line.split()[2]
        sku = line.split()[0]
        number.add(sku)
        
# Filtering skus based on the comparison of the number of reviews in 2 files. Goal: if the number wasn't updated, eliminate
# sku as it can indicate that the review data for this sku is incompleted
        if (int(new) > 5 and int(new) > int(old)):
            skus_set.add(sku)
    rev1 = rev[rev['sku'].isin(skus_set)]
    f = lambda x: len(x.split())
    rev1['comment'] = rev1['comment'].astype('str')
    rev1['review_len'] = rev1.comment.apply(f)

#     print(rev1)
    print('size of stock_skus: ', len(skus_set))

# Filtering top 5 most helpful positive reviews for each sku
table1 = rev1.loc[rev1['rating'] > 3]
table1['substr'] = table1['h_votes'].sub(table1['noth_votes'], axis=0)

gr = table1.sort_values(['sku', 'substr', 'review_len'], ascending=False).groupby('sku').head(n=5)
gr = gr[gr['comment'].str.contains('walmart', case=False)]
gr['positive'] = '1'
gr = gr.drop_duplicates(subset='sku', keep="last")

# Filtering top 5 most helpful negative reviews for each sku
table2 = rev1.loc[rev1['rating'] < 4]
table2['substr'] = table2['h_votes'].sub(table2['noth_votes'], axis=0)
gr2 = table2.sort_values(['sku', 'substr', 'review_len'], ascending=False).groupby('sku').head(n=5)
gr2 = gr2[gr2['comment'].str.contains('walmart', case=False)]
gr2['negative'] = '1'
gr2 = gr2.drop_duplicates(subset='sku', keep="last")

# Keeping reviews from main page (first five)
gr3 = rev1.sort_values(['sku', 'date'], ascending=False).groupby('sku').head(n=5)
gr3 = gr3[gr3['comment'].str.contains('walmart', case=False)]
gr3['main_page'] = '1'
gr3 = gr3.drop_duplicates(subset='sku', keep="last")


# Finding duplicated reviews, based on the mapping skus with the same number of reviews. Dropping reviews which are in
# the same group of skus, aka same earphones with the same color
from collections import OrderedDict

numbers = []
duplicated = []
parents_set = set()
seen = set()
skus_seen = set()
parent_ids = {}
dict_skus = {}
skus = []

with open(compare, 'r') as fin:
    with open(grouped) as fin2:
        parents = fin2.readlines()
        for el in parents:
            parents_set.add(el)
        parent_ids = dict((el.split()[0], el.split()[3] +'\n' + el.split()[2]) for el in parents_set)
        
        for line in fin:
            number = line.split()[2]
            sku = line.split()[0]
            
            try:
                parent_id = parent_ids[sku].split()[1]
                group_id = parent_ids[sku].split()[0]
                compare = tuple((parent_id, number))
            except:
                pass
            
            if int(number) > 120:
                if compare not in seen:
                    seen.add(compare)
                else:
                    dict_skus[sku +'\nGroup ID: ' + group_id] = compare
                    
        rev_multidict = {}
        for key, value in dict_skus.items():
            rev_multidict.setdefault(value, set()).add(key)
            
        filtered_dict = {k:v for k,v in rev_multidict.items() if len(v)>1}

        for x in filtered_dict:
            print (x)
            similar = set()
            temp = []
            for y in filtered_dict[x]:
                if (y.split()[3] not in similar and y != 'NULL'):
                    similar.add(y.split()[3])
                    print (y)
                    print()
                    temp.append(y.split()[0])
            if len(temp) > 1:
                duplicated.append([el for el in temp])
            print ('---------------------------')
flatten = list(chain.from_iterable(duplicated))
print (flatten)
print (len(flatten))
            
df_sku = rev['sku']
gr4 = rev[df_sku.isin(flatten)]
gr4['duplicated'] = '1'
gr4 = gr4.drop_duplicates(subset='sku', keep="last")

# Merging all four tables and writing results to a file
merged = pd.merge(gr, gr2, on='sku', how='outer')
merged2 = pd.merge(merged, gr3, on='sku', how='outer')
merged3 = pd.merge(merged2, gr4, on='sku', how='outer')

np.savetxt('blacklisted_skus_appliances.txt',
    merged3[['sku', 'positive', 'negative', 'main_page', 'duplicated']], fmt='%s', delimiter="\t")

5135


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


size of stock_skus:  1901


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


('3000028', '2753')
0f302e17915e4695bc8a70b26e1e5239
Group ID: 0d7afbd6a2334f3899a59906a7785ace

8be8f5520c254a66b07ab30dedd8f1ef
Group ID: NULL

59f4e3f659e64075a6e79c6d3662dcfc
Group ID: 24e7f63598164337a303d40bcc0a75bc

---------------------------
('3000030', '792')
7d9d35314a5941cd9e2e05d8965315db
Group ID: bcc8af88f82a4907a3e47ed605fc81f9

2d6dbf829a8d46b5930fd1415a592b63
Group ID: 388aeecc1d5a4cd4b494627c7d428bcb

8df2087bd7ab447da08b395885114bbc
Group ID: NULL

---------------------------
('3000005', '3426')
4d0e6544ecf74fd784218e42b56754ef
Group ID: NULL

---------------------------
('3000028', '461')
86dca234725b4e5d90e2b8c90fbc7ca0
Group ID: NULL

a8291d3a3d6e4408a5be13ac019ce275
Group ID: 23a932da089f43f2bcd175914ce96f53

---------------------------
('3000028', '189')
622773e48f384750a7e80e0bb50113b5
Group ID: NULL

---------------------------
('3000028', '348')
c49d520cb61444da9613319aa7e771c3
Group ID: b66fb9265318483c9ff92f6040097769

04ad34cb683f4dd2a618bf80003c598f
Grou

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
