In [16]:
#module imports
import pandas as pd, numpy as np, textblob
from textblob import TextBlob
from langdetect import detect

In [6]:
# Load the three per-site review exports.
# Amazon and Best Buy are Excel workbooks; their ReviewID column is discarded.
az_reviews = (
    pd.read_excel('../data/Amazon_reviews_with_scores.xlsx')
    .drop(columns=['ReviewID'])
)

# Target is a CSV; its purchaser flag is cast to bool and renamed to 'verified_purchase'.
# NOTE(review): astype(bool) turns NaN and any non-empty string into True - confirm the
# raw column really holds clean booleans / 0-1 values.
tg_reviews = (
    pd.read_csv('../data/target_reviews_clean.csv')
    .assign(verified_purchaser=lambda frame: frame['verified_purchaser'].astype(bool))
    .rename(columns={'verified_purchaser': 'verified_purchase'})
)

bb_reviews = (
    pd.read_excel('../data/BB_Review_data.xlsx')
    .drop(columns=['ReviewID'])
)


In [7]:
# Stack the per-site frames on top of each other. reset_index() then renumbers the rows
# and keeps the previous row labels in an 'index' column.
site_frames = [az_reviews, tg_reviews, bb_reviews]
all_reviews = pd.concat(site_frames).reset_index()

In [8]:
# Remove the leftover per-file row labels stored in the 'index' column.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
all_reviews = all_reviews.drop(columns=['index'], errors='ignore')

# Add the site each review came from, derived from the marker embedded in prodSiteID.
# - regex=False: the markers are plain substrings, not patterns.
# - na=False: a missing prodSiteID matches nothing instead of yielding NaN conditions.
# - default='Unknown': np.select's implicit default is the integer 0, which does not share
#   a dtype with the string choices (newer NumPy releases reject that mix, older ones
#   silently produce the string '0'). Unmatched rows are now labelled explicitly.
prod_site_id = all_reviews['prodSiteID']
all_reviews['site'] = np.select(
    [
        prod_site_id.str.contains('BB', regex=False, na=False),
        prod_site_id.str.contains('target', regex=False, na=False),
        prod_site_id.str.contains('Amazon', regex=False, na=False),
    ],
    [
        'BestBuy',
        'Target',
        'Amazon',
    ],
    default='Unknown',
)


In [12]:
#helper functions
def get_lang(data):
    """Guess the language of a piece of text.

    Every non-alphabetic character is replaced by a space and every letter is
    lower-cased before the text is handed to ``detect``.

    Parameters
    ----------
    data : str
        The text to classify (iterated character by character).

    Returns
    -------
    str
        Whatever ``detect`` returns for the cleaned text, or ``'Unk'`` when
        cleaning or detection raises an ordinary exception.
    """
    try:
        cleaned = ''.join(
            ch.lower() if ch.isalpha() else " " for ch in data
        )
        return detect(cleaned)
    except Exception:
        # Deliberately best-effort: any detection failure maps to 'Unk'.
        # Catching Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate, so a long-running apply over all reviews can still be interrupted.
        return 'Unk'

def get_sentiments(data):
    """Return ``[subjectivity, polarity]`` of ``data`` as scored by ``textblob.TextBlob``."""
    blob = textblob.TextBlob(data)
    subjectivity = blob.subjectivity
    polarity = blob.polarity
    return [subjectivity, polarity]

In [14]:
# Inspect the combined review table (a bare last expression is rendered as rich output).
# NOTE(review): this renders the whole frame; all_reviews.head() would be enough for a sanity check.
all_reviews

Unnamed: 0,productID,prodSiteID,review_header,reviewer_name,review_content,review_star_rating,review_helpful_votes,verified_purchase,review_subjectivity,review_polarity,review_length,url,review_lang,site
0,1,Amazon1,It is renewed but looked like new,Andre,"It was packaged very well, Amazon packaging, n...",5,1,True,0.604943,0.273504,282.0,,en,Amazon
1,1,Amazon1,Worth it,Alice,"The phone came in perfect conditions, without ...",5,2,True,0.513636,0.484091,138.0,,en,Amazon
2,1,Amazon1,Amazon Warehouse for the Win!,Mrs. K,I got a great deal on this at Amazon Warehouse...,5,2,True,0.570455,0.333636,373.0,,en,Amazon
3,1,Amazon1,Awesome,David Chaves Rojas,"Got my phone, 10/10. No scratches, still have ...",5,0,True,0.000000,0.000000,83.0,,en,Amazon
4,1,Amazon1,Like new iPhone 14,Rick,The iPhone 14 came with a battery that had 100...,5,0,True,0.727273,0.568182,170.0,,en,Amazon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46074,3,BB3,Returned this product,TraceyPortchester,did not have the capacity to safeguard all of ...,1,0,True,,,,,en,BestBuy
46075,3,BB3,forget BB,Deede,Received in email but not downloadable. Geek ...,1,0,True,,,,,en,BestBuy
46076,3,BB3,Decent product,RamonI,It’s a good antivirus program it does keep my ...,4,0,True,,,,,en,BestBuy
46077,3,BB3,Works great,GracieW,"Works great, never had a problem with it, will...",5,0,True,,,,,en,BestBuy


In [15]:
# Predict the language of every review body; values are stringified first so that
# missing or non-text content still reaches get_lang as a string.
all_reviews['review_lang'] = all_reviews['review_content'].map(
    lambda content: get_lang(str(content))
)

In [17]:
# Score sentiment on "<header> <content>" for every review.
# Each text is built and parsed once; TextBlob's `sentiment` property returns the
# polarity and subjectivity together, so both columns come from a single pass
# instead of two full row-wise applies that each re-parsed the same text.
review_sentiments = [
    TextBlob(str(header) + ' ' + str(content)).sentiment
    for header, content in zip(all_reviews['review_header'], all_reviews['review_content'])
]

all_reviews['review_subjectivity'] = [
    sentiment.subjectivity for sentiment in review_sentiments
]

all_reviews['review_polarity'] = [
    sentiment.polarity for sentiment in review_sentiments
]

In [27]:
# Review length in words. Splitting on any run of whitespace (no explicit separator)
# avoids counting empty tokens produced by repeated/leading/trailing spaces and also
# separates words joined only by newlines or tabs. Missing reviews stay NaN.
all_reviews['review_length'] = all_reviews['review_content'].str.split().str.len()

In [38]:
#review metrics by site, all reviews (including bad ones)
def add_metr_cols(df, whisker=1.5):
    """Add inter-quartile-range outlier limits to a table of quantile statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with '25%' and '75%' columns. It is modified in place.
    whisker : float, default 1.5
        Multiple of the IQR added above the 75% value and subtracted below the
        25% value to obtain the limits.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with three columns added:
        'IQR' (75% - 25%), 'UpperLimit' (75% + whisker * IQR) and
        'LowerLimit' (25% - whisker * IQR).
    """
    iqr = df['75%'] - df['25%']
    df['IQR'] = iqr
    df['UpperLimit'] = df['75%'] + (whisker * iqr)
    df['LowerLimit'] = df['25%'] - (whisker * iqr)
    return df

def _describe_with_limits(frame):
    """Return ``frame.describe()`` transposed (one row per described column) with the limit columns from add_metr_cols."""
    return add_metr_cols(frame.describe().T.reset_index())


def _count_outliers(values, metrics):
    """Count, for each row of ``values``, how many columns lie outside the limits in ``metrics``.

    ``metrics`` is a table built by ``_describe_with_limits``: its 'index' column holds
    the described column names. A value is an outlier when it is strictly above
    'UpperLimit' or strictly below 'LowerLimit'; comparisons against NaN are False,
    so missing values (or missing limits) are never counted.
    Returns a NumPy array with one integer count per row, in row order.
    """
    limits = metrics.set_index('index').loc[list(values.columns)]
    too_high = values.gt(limits['UpperLimit'], axis='columns')
    too_low = values.lt(limits['LowerLimit'], axis='columns')
    return (too_high | too_low).sum(axis=1).to_numpy()


# Row selectors reused by every metric table below.
english_only = all_reviews['review_lang'] == 'en'
site_masks = {
    site_name: all_reviews['site'] == site_name
    for site_name in ('Amazon', 'BestBuy', 'Target')
}

# review metrics by site, all reviews
all_review_metrics_target = _describe_with_limits(all_reviews[site_masks['Target']])
all_review_metrics_amazon = _describe_with_limits(all_reviews[site_masks['Amazon']])
all_review_metrics_bestbuy = _describe_with_limits(all_reviews[site_masks['BestBuy']])

# review metrics by site, English-detected reviews only
all_review_metrics_target_en = _describe_with_limits(all_reviews[site_masks['Target'] & english_only])
all_review_metrics_amazon_en = _describe_with_limits(all_reviews[site_masks['Amazon'] & english_only])
all_review_metrics_bestbuy_en = _describe_with_limits(all_reviews[site_masks['BestBuy'] & english_only])

# review metrics, all sites
all_review_metrics = _describe_with_limits(all_reviews)
all_review_metrics_en = _describe_with_limits(all_reviews[english_only])

# star rating, subjectivity, polarity, ...
target_cols = ['review_star_rating', 'review_subjectivity', 'review_polarity', 'review_length']

selection_all = {
    'Amazon': all_review_metrics_amazon,
    'BestBuy': all_review_metrics_bestbuy,
    'Target': all_review_metrics_target
}

selection_en = {
    'Amazon': all_review_metrics_amazon_en,
    'BestBuy': all_review_metrics_bestbuy_en,
    'Target': all_review_metrics_target_en
}

# Start every counter at zero (also fixes the column order of the four counters).
for counter_col in ('site_outlier', 'site_outlier_en', 'global_outlier', 'global_outlier_en'):
    all_reviews[counter_col] = 0

# Outliers against the limits of all reviews / all English-detected reviews.
all_reviews['global_outlier'] = _count_outliers(all_reviews[target_cols], all_review_metrics)
all_reviews['global_outlier_en'] = _count_outliers(all_reviews[target_cols], all_review_metrics_en)

# Outliers against the limits of the review's own site. Every row of a site is scored
# against both that site's overall limits and its English-only limits.
# Rows whose site is not one of the three known sites keep a count of 0.
for site_name, site_mask in site_masks.items():
    site_values = all_reviews.loc[site_mask, target_cols]
    all_reviews.loc[site_mask, 'site_outlier'] = _count_outliers(site_values, selection_all[site_name])
    all_reviews.loc[site_mask, 'site_outlier_en'] = _count_outliers(site_values, selection_en[site_name])


# Each outlier column holds a count greater than 0 when the record is an outlier
# in one or more of the target columns under the given context.
# E.g. for a row whose site is "Target", site_outlier_en = n means that n of the target
# columns hold values outside the limits computed from Target's English-only reviews.
# We do not currently track which columns are the outliers - only how many there are.
# The count could later be used to decide whether to include or exclude a record.
#
# More than likely we will need to restrict to review_lang == 'en', and may also need
# to filter on the site-level outlier counts.
        


In [39]:
# Tag every metric table with the context it describes. The list order is the row
# order of the combined table.
labelled_metrics = [
    ('global', all_review_metrics),
    ('site_amazon', all_review_metrics_amazon),
    ('site_bestbuy', all_review_metrics_bestbuy),
    ('site_target', all_review_metrics_target),
    ('global_en', all_review_metrics_en),
    ('site_amazon_english', all_review_metrics_amazon_en),
    ('site_bestbuy_english', all_review_metrics_bestbuy_en),
    ('site_target_english', all_review_metrics_target_en),
]
for metric_label, metric_table in labelled_metrics:
    metric_table['type'] = metric_label

review_metrics = pd.concat([metric_table for _, metric_table in labelled_metrics])

# Persist the combined metrics and the annotated review table.
review_metrics.to_csv('../data/combined_review_metrics.csv')

all_reviews.to_csv('../data/combined_review_table.csv')


In [40]:
# Show, one table per outlier counter, the reviews flagged at least once by that counter.
display(*(
    all_reviews[all_reviews[counter_col] > 0]
    for counter_col in ('global_outlier', 'global_outlier_en', 'site_outlier', 'site_outlier_en')
))

Unnamed: 0,productID,prodSiteID,review_header,reviewer_name,review_content,review_star_rating,review_helpful_votes,verified_purchase,review_subjectivity,review_polarity,review_length,url,review_lang,site,site_outlier,site_outlier_en,global_outlier,global_outlier_en
11,1,Amazon1,Excelente,"Moldea muy bien, me gustó mucho! Es cómodo de ...","En perfectas condiciones, 100% el estado de la...",5,1,True,0.000000,0.000000,11.0,,es,Amazon,1,1,1,1
13,1,Amazon1,Excelente,Diego Sanchez,Todo estuvo muy bien,5,0,True,0.000000,0.000000,4.0,,es,Amazon,1,1,1,1
17,1,Amazon1,App loading,Amazon Customer,"I am still in the process of loading app,s in ...",5,0,True,0.000000,0.000000,22.0,,en,Amazon,1,1,1,1
19,1,Amazon1,Practically brand new,Gustavo Perez,From the description of the listing I was expe...,5,0,True,0.386518,0.036024,158.0,,en,Amazon,0,0,1,1
21,1,Amazon1,Definitely worth the investment,Richard Dumostil,I was reluctant to buy a refurbished phone but...,5,9,True,0.593506,0.333766,90.0,,en,Amazon,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46071,3,BB3,Accident,Aenavarro,Never downloaded- purchase was accident\nI alr...,1,0,True,0.000000,0.000000,8.0,,en,BestBuy,2,2,2,2
46072,3,BB3,couldn’t even get the item.,tony,i tried to download and it just threw me an er...,1,1,True,0.300000,1.000000,16.0,,en,BestBuy,1,1,1,1
46073,3,BB3,"Unable to download!!! Message comes up ""Bad R...",TEEE,"Unable to download!!! Message comes up ""Bad R...",1,1,True,0.583333,-0.882031,9.0,,en,BestBuy,2,2,2,2
46074,3,BB3,Returned this product,TraceyPortchester,did not have the capacity to safeguard all of ...,1,0,True,0.000000,0.000000,11.0,,en,BestBuy,2,2,2,2


Unnamed: 0,productID,prodSiteID,review_header,reviewer_name,review_content,review_star_rating,review_helpful_votes,verified_purchase,review_subjectivity,review_polarity,review_length,url,review_lang,site,site_outlier,site_outlier_en,global_outlier,global_outlier_en
11,1,Amazon1,Excelente,"Moldea muy bien, me gustó mucho! Es cómodo de ...","En perfectas condiciones, 100% el estado de la...",5,1,True,0.000000,0.000000,11.0,,es,Amazon,1,1,1,1
13,1,Amazon1,Excelente,Diego Sanchez,Todo estuvo muy bien,5,0,True,0.000000,0.000000,4.0,,es,Amazon,1,1,1,1
17,1,Amazon1,App loading,Amazon Customer,"I am still in the process of loading app,s in ...",5,0,True,0.000000,0.000000,22.0,,en,Amazon,1,1,1,1
19,1,Amazon1,Practically brand new,Gustavo Perez,From the description of the listing I was expe...,5,0,True,0.386518,0.036024,158.0,,en,Amazon,0,0,1,1
21,1,Amazon1,Definitely worth the investment,Richard Dumostil,I was reluctant to buy a refurbished phone but...,5,9,True,0.593506,0.333766,90.0,,en,Amazon,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46071,3,BB3,Accident,Aenavarro,Never downloaded- purchase was accident\nI alr...,1,0,True,0.000000,0.000000,8.0,,en,BestBuy,2,2,2,2
46072,3,BB3,couldn’t even get the item.,tony,i tried to download and it just threw me an er...,1,1,True,0.300000,1.000000,16.0,,en,BestBuy,1,1,1,1
46073,3,BB3,"Unable to download!!! Message comes up ""Bad R...",TEEE,"Unable to download!!! Message comes up ""Bad R...",1,1,True,0.583333,-0.882031,9.0,,en,BestBuy,2,2,2,2
46074,3,BB3,Returned this product,TraceyPortchester,did not have the capacity to safeguard all of ...,1,0,True,0.000000,0.000000,11.0,,en,BestBuy,2,2,2,2


Unnamed: 0,productID,prodSiteID,review_header,reviewer_name,review_content,review_star_rating,review_helpful_votes,verified_purchase,review_subjectivity,review_polarity,review_length,url,review_lang,site,site_outlier,site_outlier_en,global_outlier,global_outlier_en
3,1,Amazon1,Awesome,David Chaves Rojas,"Got my phone, 10/10. No scratches, still have ...",5,0,True,1.000000,1.000000,14.0,,en,Amazon,2,2,0,0
11,1,Amazon1,Excelente,"Moldea muy bien, me gustó mucho! Es cómodo de ...","En perfectas condiciones, 100% el estado de la...",5,1,True,0.000000,0.000000,11.0,,es,Amazon,1,1,1,1
13,1,Amazon1,Excelente,Diego Sanchez,Todo estuvo muy bien,5,0,True,0.000000,0.000000,4.0,,es,Amazon,1,1,1,1
17,1,Amazon1,App loading,Amazon Customer,"I am still in the process of loading app,s in ...",5,0,True,0.000000,0.000000,22.0,,en,Amazon,1,1,1,1
22,1,Amazon1,Funciona y se mira bien,Daniel831,Llevo un día usándolo y aparecer funciona bien...,5,0,True,0.000000,0.000000,24.0,,es,Amazon,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46072,3,BB3,couldn’t even get the item.,tony,i tried to download and it just threw me an er...,1,1,True,0.300000,1.000000,16.0,,en,BestBuy,1,1,1,1
46073,3,BB3,"Unable to download!!! Message comes up ""Bad R...",TEEE,"Unable to download!!! Message comes up ""Bad R...",1,1,True,0.583333,-0.882031,9.0,,en,BestBuy,2,2,2,2
46074,3,BB3,Returned this product,TraceyPortchester,did not have the capacity to safeguard all of ...,1,0,True,0.000000,0.000000,11.0,,en,BestBuy,2,2,2,2
46075,3,BB3,forget BB,Deede,Received in email but not downloadable. Geek ...,1,0,True,0.000000,0.000000,15.0,,en,BestBuy,2,2,2,2


Unnamed: 0,productID,prodSiteID,review_header,reviewer_name,review_content,review_star_rating,review_helpful_votes,verified_purchase,review_subjectivity,review_polarity,review_length,url,review_lang,site,site_outlier,site_outlier_en,global_outlier,global_outlier_en
3,1,Amazon1,Awesome,David Chaves Rojas,"Got my phone, 10/10. No scratches, still have ...",5,0,True,1.000000,1.000000,14.0,,en,Amazon,2,2,0,0
11,1,Amazon1,Excelente,"Moldea muy bien, me gustó mucho! Es cómodo de ...","En perfectas condiciones, 100% el estado de la...",5,1,True,0.000000,0.000000,11.0,,es,Amazon,1,1,1,1
13,1,Amazon1,Excelente,Diego Sanchez,Todo estuvo muy bien,5,0,True,0.000000,0.000000,4.0,,es,Amazon,1,1,1,1
17,1,Amazon1,App loading,Amazon Customer,"I am still in the process of loading app,s in ...",5,0,True,0.000000,0.000000,22.0,,en,Amazon,1,1,1,1
22,1,Amazon1,Funciona y se mira bien,Daniel831,Llevo un día usándolo y aparecer funciona bien...,5,0,True,0.000000,0.000000,24.0,,es,Amazon,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46072,3,BB3,couldn’t even get the item.,tony,i tried to download and it just threw me an er...,1,1,True,0.300000,1.000000,16.0,,en,BestBuy,1,1,1,1
46073,3,BB3,"Unable to download!!! Message comes up ""Bad R...",TEEE,"Unable to download!!! Message comes up ""Bad R...",1,1,True,0.583333,-0.882031,9.0,,en,BestBuy,2,2,2,2
46074,3,BB3,Returned this product,TraceyPortchester,did not have the capacity to safeguard all of ...,1,0,True,0.000000,0.000000,11.0,,en,BestBuy,2,2,2,2
46075,3,BB3,forget BB,Deede,Received in email but not downloadable. Geek ...,1,0,True,0.000000,0.000000,15.0,,en,BestBuy,2,2,2,2


In [51]:
# Print the text of every review that is a global outlier and was not detected as English.
not_english = all_reviews['review_lang'] != 'en'
flagged_globally = all_reviews['global_outlier'] > 0
non_english_outlier_text = all_reviews[not_english & flagged_globally]['review_content'].astype(str)
print('\n'.join(non_english_outlier_text))

En perfectas condiciones, 100% el estado de la batería, muy recomendado
Todo estuvo muy bien
Llevo un día usándolo y aparecer funciona bien y en lo físico también se mira bien y las batería esta al 100% de vida
Excelente producto y llego bien embalado
Me encanto llego en muy buen estado…
La vida de batería esta al %100 quede fascinada
De regalo de cumpleaños a mi esposa.
Teléfono impecable, como nuevo, ni un rasguño, batería 100%.
Sin duda pronto compraré uno para mi.
Como nuevo ! Excelente relación precio calidad
Me agrado la presteza del envio
Tiene el 100% de la bateria y esta en excelente estado, ni un ralla, practicamente nuevo
Excelente equipo la verdad me sorprendido más de lo q esperaba batería 🔋 98% y sin marcas o rayones
Asta el momento esta bien la compra tengo un mes con el
La cámara no es igual a la de otros iPhone 14 . Es más opaco (colores distintos)y se ve más pequeña la imagen a comparación de la misma configuración en otros iPhones 14
Item to exspensive
Específicament