# Imports and Configurations

In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from num2words import num2words

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from autocorrect import Speller
from sklearn.feature_extraction.text import CountVectorizer
from translate import translator

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline 
import seaborn as sns

nltk.download('rslp')

# dataset visibility
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

[nltk_data] Downloading package rslp to /Users/angelachow/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


# Functions

In [7]:
def remove_stop_words(text, stop_words=None):
    """Remove stop words from a list of tokens, in place.

    Parameters
    ----------
    text : list of str
        Tokens to filter. The list is modified in place and also returned.
    stop_words : iterable of str, optional
        Words to drop. When omitted, NLTK's Portuguese stop-word list
        (``stopwords.words('portuguese')``) is used.

    Returns
    -------
    list of str
        The same list object that was passed in, with every stop word removed.
    """
    if stop_words is None:
        stop_words = stopwords.words('portuguese')
    # Build the lookup once; the corpus used to be re-read for every single token.
    blocked = frozenset(stop_words)
    # Slice-assign a filtered copy instead of calling .remove() while iterating:
    # removing during iteration skipped the token after each removal, so
    # consecutive stop words were left in the list.
    text[:] = [word for word in text if word not in blocked]
    return text

stemmer = RSLPStemmer()
spell = Speller('pt')

def spell_and_stem_words(text):
    """Spell-correct and then stem every token of ``text`` in place.

    Each element is passed through the module-level ``spell`` callable and the
    result through ``stemmer.stem``; the processed value overwrites the
    original element. The same (mutated) sequence is returned.
    """
    for position, token in enumerate(text):
        corrected = spell(token)
        text[position] = stemmer.stem(corrected)
    return text

# Joining dataset

In [8]:
#final dataframe
df = pd.DataFrame()

In [9]:
# Read every source CSV from the relative data/ directory into its own DataFrame.
# The variable names below are used by the merge and EDA cells further down.

# orders dataset
orders = pd.read_csv('data/olist_orders_dataset.csv')

# order items dataset
items = pd.read_csv('data/olist_order_items_dataset.csv')

# sellers dataset
sellers = pd.read_csv('data/olist_sellers_dataset.csv')

# customers dataset
customers = pd.read_csv('data/olist_customers_dataset.csv')

# order reviews dataset
reviews = pd.read_csv('data/olist_order_reviews_dataset.csv')

# products dataset
products = pd.read_csv('data/olist_products_dataset.csv')

# order payments dataset
payments = pd.read_csv('data/olist_order_payments_dataset.csv')

# product category name translation table
categories = pd.read_csv('data/product_category_name_translation.csv')

In [None]:
# Build the combined frame: start from the orders table and left-join every
# other table onto it, in the same sequence as the individual joins
# (customers, payments, reviews, items, sellers, products, categories).
# NOTE(review): payments, reviews and items are each joined on order_id; if any
# of them holds several rows per order the joins multiply rows — confirm before
# treating a row of `df` as a single order.
df = (
    orders
    .merge(customers, on='customer_id', how='left')
    .merge(payments, on='order_id', how='left')
    .merge(reviews, on='order_id', how='left')
    .merge(items, on='order_id', how='left')
    .merge(sellers, on='seller_id', how='left')
    .merge(products, on='product_id', how='left')
    .merge(categories, on='product_category_name', how='left')
)

#df.head()

In [9]:
df.to_csv('data/merged_dataset.csv')

# EDA

## Customers

In [None]:
customers.info()
customers.isnull().sum()

In [None]:
cust = customers["customer_unique_id"].nunique()
print(cust, "unique customers")

In [None]:
# Customer distribution by city: count of distinct customer_id per city,
# largest first, then report and plot the ten biggest cities.
cities = customers["customer_city"].nunique()
c1 = (
    customers
    .groupby('customer_city')['customer_id']
    .nunique()
    .sort_values(ascending=False)
)
c2 = c1.head(10)
print("There are",cities,"unique cities in the dataset. The Top 10 cities based on customers_id are:")
print(c2)
# Share of all rows in `customers` that fall in the ten biggest cities
print("\nTop 10 cities covers", round(c2.sum()/customers.shape[0]*100,1),"percent of all the orders.")
plt.figure(figsize=(16,8))
c2.plot(kind="bar",rot=0)

In [None]:
# Customer distribution by state: count of distinct customer_id per state,
# largest first, then report and plot the five biggest states.
state = customers["customer_state"].nunique()
c1 = customers.groupby('customer_state')['customer_id'].nunique().sort_values(ascending=False)
# Report the state count computed above (the message used to print the city
# count left over from the previous cell).
print("There are",state,"unique states in the dataset. The Top 5 states are:")
c2 = c1.head(5)
print(c2)
# Label now matches the head(5) selection (it used to say "Top 10").
print("\nTop 5 states covers", round(c2.sum()/customers.shape[0]*100,1),"percent of all the orders.")
plt.figure(figsize=(16,8))
c2.plot(kind="bar",rot=0)

## Orders

In [None]:
orders.head()

In [None]:
# Column dtypes and non-null counts for the orders table
orders.info()
# Missing values per column of the orders table (this cell sits in the Orders
# section but used to inspect `customers`).
orders.isnull().sum()

In [None]:
orders.head()

In [None]:
# Copy of `orders` in which the purchase, carrier-delivery, customer-delivery
# and estimated-delivery columns are parsed from text into datetimes.
# The raw `orders` frame is left untouched.
orders_mod = orders.copy()
for date_col in (
    "order_purchase_timestamp",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
):
    orders_mod[date_col] = pd.to_datetime(orders[date_col], format='%Y-%m-%d %H:%M:%S')
# Show the resulting dtypes to confirm the conversion
orders_mod.dtypes

In [None]:
orders_mod.head()

In [None]:
##Viz on when purchases are made during period in dataset.
counts = orders_mod.set_index("order_purchase_timestamp").groupby(pd.Grouper(freq='D')).count()
fig = plt.figure(figsize=(16,8))
ax = fig.gca()
counts.plot(y = "order_id", use_index=True, ax=ax)

In [None]:
## Order status distribution: number of distinct customer_id per order_status,
## largest first, printed and plotted as a bar chart.
orderstatus = orders["order_status"].nunique()
o1 = orders.groupby('order_status')['customer_id'].nunique().sort_values(ascending=False)
print("There are",orderstatus,"unique order_status in the dataset.")
# Keep at most the eight most frequent statuses for the printout and the plot
o2 = o1.head(8)
print(o2)
# NOTE(review): head(1) is simply the most frequent status; the message below
# assumes that status is the delivered one — TODO confirm against the data.
o3 = o1.head(1)
print("\nDelivered status covers", round(o3.sum()/orders.shape[0]*100,1),"percent of all the orders.")
plt.figure(figsize=(16,8))
o2.plot(kind="bar",rot=0)

## Order Reviews

In [29]:
reviews.info()
reviews.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   review_id                100000 non-null  object
 1   order_id                 100000 non-null  object
 2   review_score             100000 non-null  int64 
 3   review_comment_title     11715 non-null   object
 4   review_comment_message   41753 non-null   object
 5   review_creation_date     100000 non-null  object
 6   review_answer_timestamp  100000 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB


Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parab√©ns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [27]:
reviews.isnull().values.any()

True

In [28]:
# Determine how many missing data instances
reviews.isnull().sum()

review_id                      0
order_id                       0
review_score                   0
review_comment_title       88285
review_comment_message     58247
review_creation_date           0
review_answer_timestamp        0
dtype: int64

In [None]:
# Address missing data - we can see that there is a review score but not necessarily a comment or a title. Should we make it "none?"

reviews.isnull().sum()

## Items

In [None]:
items.info()
items.isnull().sum()

In [None]:
# Sellers ranked by the number of distinct orders they appear in (order items table).
sellerinfo = items["seller_id"].nunique()
o1 = items.groupby('seller_id')['order_id'].nunique().sort_values(ascending=False)
# Print the seller count computed above; this line used to pass the whole
# `items` DataFrame to print(), dumping the table into the output.
print("There are",sellerinfo,"unique sellers in the order items dataset.")
o2 = o1.head(10)
print(o2)

# The grouping key is seller_id, so the ten rows are sellers (not items).
print("\nTop 10 sellers covers", round(o2.sum()/orders.shape[0]*100,1),"percent of all the orders.")
plt.figure(figsize=(16,8))
o2.plot(kind="bar",rot=0)

## Sellers

In [None]:
sellers.info()
sellers.isnull().sum()

In [None]:
# Sellers per city: count of distinct seller_id per seller_city, largest first.
sellerstatus = sellers["seller_id"].nunique()
o1 = sellers.groupby('seller_city')['seller_id'].nunique().sort_values(ascending=False)
print("There are",sellerstatus,"unique sellers in the dataset.")
o2 = o1.head(8)
print(o2)

# The message reports a share of sellers, so divide by the seller count
# (it used to divide by the number of orders, understating the share).
print("\nTop Seller by city covers", round(o2.sum()/sellerstatus*100,1),"percent of all the sellers.")
plt.figure(figsize=(16,8))
o2.plot(kind="bar",rot=0)

In [None]:
# Sellers per state: count of distinct seller_id per seller_state, largest first.
sellerstatus = sellers["seller_id"].nunique()
o1 = sellers.groupby('seller_state')['seller_id'].nunique().sort_values(ascending=False)
print("There are",sellerstatus,"unique sellers in the dataset.")
o2 = o1.head(8)
print(o2)

# The message reports a share of sellers, so divide by the seller count
# (it used to divide by the number of orders, understating the share).
print("\nTop Sellers by state represents", round(o2.sum()/sellerstatus*100,1),"percent of all the sellers.")
plt.figure(figsize=(16,8))
o2.plot(kind="bar",rot=0)

## Products

In [None]:
products.info()
# Missing values per column before filling (printed, since only the last
# expression of a cell is displayed automatically).
print(products.isnull().sum())

# Fill the missing product attributes.
# - The category name is text, so it keeps the "None" placeholder.
# - Every other column is numeric and is filled with the number 0 so that it
#   stays numeric. Filling with the strings "None" / "0" turned these columns
#   into object dtype, which breaks later arithmetic and aggregation.
# A single DataFrame-level fillna replaces the per-column `inplace=True` calls:
# in-place fills on a column selection are chained assignment, which recent
# pandas versions warn about and which has no effect under Copy-on-Write.
products = products.fillna({
    "product_category_name": "None",
    "product_name_lenght": 0,
    "product_description_lenght": 0,
    "product_photos_qty": 0,
    "product_weight_g": 0,
    "product_length_cm": 0,
    "product_height_cm": 0,
    "product_width_cm": 0,
})

In [None]:
products.isnull().sum()

In [None]:
# Rows of the merged frame per English category name, most frequent first.
# `order_count` is the number of non-null order_id values in each group.
most_product = (
    df
    .groupby('product_category_name_english')
    .agg(order_count=('order_id', 'count'))
    .sort_values(by='order_count', ascending=False)
    .reset_index()
)
most_product.head()

In [None]:
### Visualising top 10 most bought product categories:
sns.barplot(x='product_category_name_english',y='order_count',data=most_product[:10],color="green")
plt.xlabel("Product Category")
plt.ylabel("Number of orders")
plt.title("Most bought product categories")
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Distinct products per English category name, largest categories first.
prodcat= df["product_id"].nunique()
o1 = df.groupby('product_category_name_english')['product_id'].nunique().sort_values(ascending=False)
print("There are",prodcat,"unique products in the dataset.")
o2 = o1.head(10)
print(o2)

# The message reports a share of products, so divide by the number of unique
# products (it used to divide by the number of orders).
print("\nTop 10 Products by category represent", round(o2.sum()/prodcat*100,1),"percent of all the products.")
plt.figure(figsize=(16,8))
o2.plot(kind="bar",rot=0)

## Payment

In [None]:
payments.info()
payments.isnull().sum()

## Categories

In [None]:
categories.info()
categories.isnull().sum()

### NLP

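The next cell downloads the NLTK resources used by the text-processing steps below: the `punkt` tokenizer models (`nltk.download('punkt')`) and the `stopwords` corpus (`nltk.download("stopwords")`).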

In [11]:
nltk.download('punkt')
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/angelachow/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/angelachow/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# From here on we work with the reviews dataset only.
# NOTE(review): dropna() without `subset` also discards reviews that have a
# message but no title — confirm that is intended. It is left unchanged here
# because later cells rely on `reviews` being rebound by this line.
reviews = reviews.dropna()
#reviews.head(100)

# Keep only the comment text of the 1-star reviews
reviews_only = reviews[reviews['review_score'] == 1]
reviews_only = reviews_only['review_comment_message']
# Spell numbers out as Portuguese words; num2words defaults to English, which
# injected English tokens into Portuguese text.
reviews_only = reviews_only.apply(lambda t: re.sub(r"(\d+)", lambda x: num2words(int(x.group(0)), lang='pt_BR'), t))
# Replace everything that is not an ASCII letter, an accented Latin-1 letter or
# whitespace with a space. The old pattern ended in a JavaScript-style "/gi"
# suffix, which Python treats as literal text, so it never matched; its "A-z"
# range also let the punctuation between 'Z' and 'a' through. The accented
# ranges are written as escapes so the pattern does not depend on file encoding.
reviews_only = reviews_only.apply(lambda t: re.sub(r'[^A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\s]', ' ', t))
# Lower-case the text so stop-word matching is case-insensitive
reviews_only = reviews_only.str.lower()

# Tokenize with the Portuguese model, then drop stop words
tokenized_reviews = reviews_only.apply(lambda t: word_tokenize(t, language='portuguese'))
tokenized_reviews = tokenized_reviews.apply(remove_stop_words)

# Spell check and stemming
tokenized_reviews = tokenized_reviews.apply(spell_and_stem_words)

# Re-join the processed tokens into one space-separated string per review
reviews_text = tokenized_reviews.apply(lambda t: " ".join(t))


In [None]:
# Feature matrix: count n-grams of 2 to 4 consecutive tokens in the processed
# review text, keeping at most 1000 features (max_features=1000).
matrix = CountVectorizer(max_features=1000, ngram_range=(2, 4))
# fit_transform learns the vocabulary and returns the counts; toarray() converts
# the result to a dense array (one row per review, one column per n-gram).
X = matrix.fit_transform(reviews_text).toarray()
# Display the learned mapping from n-gram to column index
matrix.vocabulary_
#

In [59]:
# Trial run: translate a slice of the review messages from Portuguese to English.
from googletrans import Translator
import pandas as pd

# Positional rows 201..399 of `reviews` (199 rows)
a = reviews.iloc[201:400]
# NOTE(review): this rebinds the name `translator`, which the imports cell also
# binds via `from translate import translator`; after this line the name refers
# to the googletrans Translator instance.
translator = Translator()
translator.raise_Exception = True
# Translate each message and take the `text` attribute of each result. The
# assignment aligns on the index, so rows outside the slice get NaN in 'English'.
reviews['English']=a['review_comment_message'].apply(translator.translate, src='pt', dest='en').apply(getattr, args=('text',))




In [22]:
# 1-star reviews, taken as an independent copy so that adding or filling the
# 'English' column neither triggers SettingWithCopyWarning nor depends on
# whether the filter returned a view of `reviews`.
reviews_low = reviews[reviews['review_score'] == 1].copy()
# Empty object-dtype placeholder (all NaN). Translated strings are written into
# it row by row later, which a float NaN column would have to be upcast for.
reviews_low['English'] = pd.Series(index=reviews_low.index, dtype='object')

In [26]:
from googletrans import Translator
import time

translator = Translator()
translator.raise_Exception = True

# Retry policy for failed requests. The failures seen so far were
# 'Unexpected status code "429"' responses, i.e. rate limiting.
MAX_TRANSLATE_ATTEMPTS = 5
RETRY_PAUSE_SECONDS = 60


def translate_pt_to_en(text, row_label, max_attempts=MAX_TRANSLATE_ATTEMPTS, pause=RETRY_PAUSE_SECONDS):
    """Translate ``text`` from Portuguese to English, pausing and retrying on failure.

    Parameters
    ----------
    text : str
        The Portuguese text to translate.
    row_label : hashable
        Index label of the row being translated; only used in log messages.
    max_attempts : int
        Total number of attempts before giving up.
    pause : float
        Seconds to sleep between attempts.

    Returns
    -------
    str
        The ``text`` attribute of the translation result.

    Raises
    ------
    Exception
        Re-raises the last error once ``max_attempts`` attempts have failed.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            translated = translator.translate(text, src='pt', dest='en').text
        # Broad on purpose: the failure type raised by the client is not pinned down here
        except Exception as e:
            print('Error on {}: {}'.format(row_label, text))
            print(e)
            if attempt == max_attempts:
                # Out of attempts: surface the error instead of looping forever
                raise
            print('pausing for {}s...'.format(pause))
            time.sleep(pause)
            print('trying again...')
        else:
            if attempt > 1:
                print('Success! Moving on.')
            return translated


# Row by row, so a rate-limit failure only costs a pause on one row and rows
# already translated in a previous run of this cell are not requested again.
for i, row in reviews_low.iterrows():
    message = row['review_comment_message']
    # Skip rows that were successful from previous attempts.
    # Blank text also seems to throw an error in the translator.
    if pd.isnull(row['English']) and not pd.isnull(message) and message.strip() != '':
        reviews_low.at[i, 'English'] = translate_pt_to_en(message, i)


Error on 31615: veio faltando uma carteira e querendo saber como vai fica 
Unexpected status code "429" from ('translate.google.com',)
pausing for 1min...
trying again...
Success! Moving on.
Error on 31735: Foto diferente do produto, quero a troca ou a devolu√ß√£o do dinheiro
Unexpected status code "429" from ('translate.google.com',)
pausing for 1min...
trying again...
Success! Moving on.
Error on 31760: Produto de baixa qualidade, na foto engana bem, bota esquerdo n√£o tem fun√ß√£o, o digital somente funciona se vc aperta um dos dois bot√µes da direita e logo apaga, instru√ß√£o em ingl√™s e chin√™s. 
Unexpected status code "429" from ('translate.google.com',)
pausing for 1min...
trying again...
Success! Moving on.
Error on 31877: Comprei dois filtros e a embalagem chegou com apenas um filtro... E o outro?
Unexpected status code "429" from ('translate.google.com',)
pausing for 1min...
trying again...
Success! Moving on.
Error on 32024: Comprei duas cadeiras, uma o assento √± encaixa c

In [27]:
#print(reviews)
reviews_low.to_csv("reviews_low.csv")

In [15]:
# Same trial translation as the earlier sample cell, repeated without its imports:
# `Translator` must already be in scope from a previously executed cell.
# Positional rows 201..399 of `reviews` (199 rows)
a=reviews.iloc[201:400]
# NOTE(review): rebinds `translator`, which the imports cell also binds via
# `from translate import translator`.
translator = Translator()
translator.raise_Exception = True
# Translate each message and take the `text` attribute of each result. The
# assignment aligns on the index, so rows outside the slice get NaN in 'English'.
reviews['English']=a['review_comment_message'].apply(translator.translate, src='pt', dest='en').apply(getattr, args=('text',))


In [16]:
print(reviews)

                              review_id                          order_id  \
9      8670d52e15e00043ae7de4c01cc2fe06  b9bf720beb4ab3728760088589c62129   
15     3948b09f7c818e2d86c9a546758b2335  e51478e7e277a83743b6f9991dbfa3fb   
19     373cbeecea8286a2b66c97b1b157ec46  583174fbe37d3d5f0d6661be3aad1786   
22     d21bbc789670eab777d27372ab9094cc  4fc44d78867142c627497b60a7e0228a   
34     c92cdd7dd544a01aa35137f901669cdf  37e7875cdce5a9e5b3a692971f370151   
...                                 ...                               ...   
99962  47e0954e156dac6512c25c6d2ecc1c66  16cbf959cfdb88c47ee2a29303547ec2   
99967  0e7bc73fde6782891898ea71443f9904  bd78f91afbb1ecbc6124974c5e813043   
99971  58be140ccdc12e8908ff7fd2ba5c7cb0  0ebf8e35b9807ee2d717922d5663ccdb   
99972  51de4e06a6b701cb2be47ea0e689437b  b7467ae483dbe956fe9acdf0b1e6e3f4   
99975  2ee221b28e5b6fceffac59487ed39348  f2d12dd37eaef72ed7b1186b2edefbcd   

       review_score       review_comment_title  \
9                 4      

In [None]:
# Show every row when displaying frames from here on
pd.set_option('display.max_rows', None)

# One row per n-gram learned by the CountVectorizer
phrases = pd.DataFrame()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old name otherwise.
if hasattr(matrix, 'get_feature_names_out'):
    phrases['phrases'] = matrix.get_feature_names_out()
else:
    phrases['phrases'] = matrix.get_feature_names()

# Earlier cells rebind the name `translator` to a googletrans Translator
# instance, which cannot be called as translator(src, dest, text). Import the
# function from the `translate` package under its own name so this call works
# regardless of which cells ran before.
from translate import translator as translate_phrase

#phrases['translation']= phrases.apply(lambda t: translate_phrase('pt', 'en', str(t)))

# Spot-check the translation of a single stemmed phrase
translate_phrase('pt', 'en', 'vei quebr')