# Classification of Reviews


The following notebooks details how different types 

In [7]:
import pandas as pd
import os
import numpy as np

In [3]:
# change directory
os.chdir("/Users/andreamock/Documents/Globalink/")

In [4]:
reviewsDf = pd.read_csv('scrapedReviews0705.csv', index_col=0)

In [5]:
reviewsDf.head()

Unnamed: 0,review_for,username,user_url,num_stars,review_text,lang,review_date,type,scraped_from
0,Parc de la Capture-d'Ethan-Allen,Claudia,https://www.google.com/maps/contrib/1001449741...,4.0,One of the nicest entry points to this invitin...,en,2020-11-20 22:04:09.211296,Park,Google Maps
1,Parc de la Capture-d'Ethan-Allen,Nate Neel,https://www.google.com/maps/contrib/1121030547...,5.0,"Waterfront to fish or just relax, great place ...",en,2020-10-20 22:04:09.212245,Park,Google Maps
2,Parc de la Capture-d'Ethan-Allen,Yucel Salimoglu,https://www.google.com/maps/contrib/1034180738...,4.0,Everything except the parking is good here.,en,2020-07-20 22:04:09.213178,Park,Google Maps
3,Parc de la Capture-d'Ethan-Allen,COCO BEADZ,https://www.google.com/maps/contrib/1036060504...,4.0,"Defenely the best park in Montreal East, Tetre...",en,2020-06-20 22:04:09.214115,Park,Google Maps
4,Parc de la Capture-d'Ethan-Allen,Anna Maria Fiore,https://www.google.com/maps/contrib/1016779009...,5.0,It's so peaceful and happy place near the water,en,2020-06-20 22:04:09.215069,Park,Google Maps


In [9]:
reviewsDf['label'] = np.where(reviewsDf['num_stars'] < 4, 0, 1)

## Calculating summary statistics 

In [11]:
pos_reviews = reviewsDf[reviewsDf['label'] == 1]
neg_reviews = reviewsDf[reviewsDf['label'] == 0]

After breaking down the dataset into positive and negative reviews 

In [13]:
pos_reviews.shape[0]/(pos_reviews.shape[0] +neg_reviews.shape[0])

0.8718536983521211

## Cleaning data in each language

In [21]:
enDf = reviewsDf[reviewsDf['lang'] == 'en']
enDf.reset_index(drop=True)
enDf.shape

(25225, 10)

In [22]:
frDf = reviewsDf[reviewsDf['lang'] == 'fr']

frDf.shape

(18351, 10)

In [26]:
# percent of french reviews in dataset

100*frDf.shape[0]/(frDf.shape[0]+ enDf.shape[0])

42.11263080594823

In [25]:
# total percentage of french and english text in dataset
100*(frDf.shape[0]+ enDf.shape[0])/reviewsDf.shape[0]

96.12847720103241

In [30]:
frenchReviewsDf = frDf.copy()
frenchReviewText = frDf['review_text'].apply(lambda x: x.split('(Original)')[-1].strip())
frenchReviewsDf['review_text'] = frenchReviewText

In [36]:
frenchReviewsDf = frenchReviewsDf.drop(columns=['review_for', 'username', 'user_url', 
                                                'review_date', 'type', 'scraped_from'])
frenchReviewsDf = frenchReviewsDf.reset_index(drop=True)

In [37]:
frenchReviewsDf

Unnamed: 0,num_stars,review_text,lang,label
0,4.0,Tres beau parc pour faire un pinic et profiter...,fr,1
1,5.0,Les gens sont tous très sociables.\nExceptionnel.,fr,1
2,5.0,"Belle place pour marcher, jogger, promener le ...",fr,1
3,4.0,Nourriture excellente !,fr,1
4,4.0,J aime bien pour prendre de l air frais,fr,1
...,...,...,...,...
18346,5.0,"Îlot de verdure au centre-ville, idéal pour pi...",fr,1
18347,5.0,Nous ne venons pas souvent dans le quartier ma...,fr,1
18348,4.0,"À partir de la route Côte-des-Neiges, une asce...",fr,1
18349,4.0,Superbe vue sur le coté ouest de Montreal. Bel...,fr,1


In [38]:
englishReviewsDf = enDf.copy()

englishReviewsDf = englishReviewsDf.drop(columns=['review_for', 'username', 'user_url', 
                                                'review_date', 'type', 'scraped_from'])
englishReviewsDf = englishReviewsDf.reset_index(drop=True)

In [164]:
englishReviewsDf['review_text'][1]

'Waterfront to fish or just relax, great place to picnic, hangout and watch the sunset. Amazing place.'

In [171]:
#englishReviewsDf['review_text'].to_csv('ENReviews.csv', index = False, header=False)
#englishReviewsDf.to_csv('ENReviewsComplete.csv')
#frenchReviewsDf['review_text'].to_csv('FRReviews.csv', index = False, header=False)
#frenchReviewsDf.to_csv('FRReviewsComplete.csv')

In [40]:
import re
def clean_text(df, text_field, new_text_field_name):
    df[new_text_field_name] = df[text_field].str.lower()
    df[new_text_field_name] = df[new_text_field_name].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    # remove numbers
    df[new_text_field_name] = df[new_text_field_name].apply(lambda elem: re.sub(r"\d+", "", elem))
    
    return df

In [41]:
data_cleanEn = clean_text(englishReviewsDf, 'review_text', 'text_clean')
data_cleanEn.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean
0,4.0,One of the nicest entry points to this invitin...,en,1,one of the nicest entry points to this invitin...
1,5.0,"Waterfront to fish or just relax, great place ...",en,1,waterfront to fish or just relax great place t...
2,4.0,Everything except the parking is good here.,en,1,everything except the parking is good here
3,4.0,"Defenely the best park in Montreal East, Tetre...",en,1,defenely the best park in montreal east tetrea...
4,5.0,It's so peaceful and happy place near the water,en,1,its so peaceful and happy place near the water


In [45]:
data_cleanFr= clean_text(frenchReviewsDf, 'review_text', 'text_clean')
data_cleanFr.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean
0,4.0,Tres beau parc pour faire un pinic et profiter...,fr,1,tres beau parc pour faire un pinic et profiter...
1,5.0,Les gens sont tous très sociables.\nExceptionnel.,fr,1,les gens sont tous trs sociablesexceptionnel
2,5.0,"Belle place pour marcher, jogger, promener le ...",fr,1,belle place pour marcher jogger promener le ch...
3,4.0,Nourriture excellente !,fr,1,nourriture excellente
4,4.0,J aime bien pour prendre de l air frais,fr,1,j aime bien pour prendre de l air frais


## Removing stopwords

In [42]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreamock/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
stop = stopwords.words('english')
data_cleanEn['text_clean'] = data_cleanEn['text_clean'].apply(lambda x: ' '.join(
    [word for word in x.split() if word not in (stop)]))
data_cleanEn.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean
0,4.0,One of the nicest entry points to this invitin...,en,1,one nicest entry points inviting waterfront pr...
1,5.0,"Waterfront to fish or just relax, great place ...",en,1,waterfront fish relax great place picnic hango...
2,4.0,Everything except the parking is good here.,en,1,everything except parking good
3,4.0,"Defenely the best park in Montreal East, Tetre...",en,1,defenely best park montreal east tetreauville ...
4,5.0,It's so peaceful and happy place near the water,en,1,peaceful happy place near water


In [46]:
stop = stopwords.words('french')
data_cleanFr['text_clean'] = data_cleanFr['text_clean'].apply(lambda x: ' '.join(
    [word for word in x.split() if word not in (stop)]))
data_cleanFr.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean
0,4.0,Tres beau parc pour faire un pinic et profiter...,fr,1,tres beau parc faire pinic profiter beaute nature
1,5.0,Les gens sont tous très sociables.\nExceptionnel.,fr,1,gens tous trs sociablesexceptionnel
2,5.0,"Belle place pour marcher, jogger, promener le ...",fr,1,belle place marcher jogger promener chien fair...
3,4.0,Nourriture excellente !,fr,1,nourriture excellente
4,4.0,J aime bien pour prendre de l air frais,fr,1,aime bien prendre air frais


### Perform word stemming

In [51]:
from nltk.stem import SnowballStemmer 
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andreamock/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [48]:
data_cleanEn['text_tokens'] = data_cleanEn['text_clean'].apply(lambda x: word_tokenize(x))
data_cleanEn.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean,text_tokens
0,4.0,One of the nicest entry points to this invitin...,en,1,one nicest entry points inviting waterfront pr...,"[one, nicest, entry, points, inviting, waterfr..."
1,5.0,"Waterfront to fish or just relax, great place ...",en,1,waterfront fish relax great place picnic hango...,"[waterfront, fish, relax, great, place, picnic..."
2,4.0,Everything except the parking is good here.,en,1,everything except parking good,"[everything, except, parking, good]"
3,4.0,"Defenely the best park in Montreal East, Tetre...",en,1,defenely best park montreal east tetreauville ...,"[defenely, best, park, montreal, east, tetreau..."
4,5.0,It's so peaceful and happy place near the water,en,1,peaceful happy place near water,"[peaceful, happy, place, near, water]"


In [49]:
data_cleanFr['text_tokens'] = data_cleanFr['text_clean'].apply(lambda x: word_tokenize(x))
data_cleanFr.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean,text_tokens
0,4.0,Tres beau parc pour faire un pinic et profiter...,fr,1,tres beau parc faire pinic profiter beaute nature,"[tres, beau, parc, faire, pinic, profiter, bea..."
1,5.0,Les gens sont tous très sociables.\nExceptionnel.,fr,1,gens tous trs sociablesexceptionnel,"[gens, tous, trs, sociablesexceptionnel]"
2,5.0,"Belle place pour marcher, jogger, promener le ...",fr,1,belle place marcher jogger promener chien fair...,"[belle, place, marcher, jogger, promener, chie..."
3,4.0,Nourriture excellente !,fr,1,nourriture excellente,"[nourriture, excellente]"
4,4.0,J aime bien pour prendre de l air frais,fr,1,aime bien prendre air frais,"[aime, bien, prendre, air, frais]"


In [54]:
def word_stemmer(text,lang):
    stem_text = [SnowballStemmer(lang).stem(i) for i in text]
    return stem_text

In [55]:
data_cleanEn['text_tokens_stem'] = data_cleanEn['text_tokens'].apply(lambda x: word_stemmer(x, 'english'))
data_cleanEn.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean,text_tokens,text_tokens_stem
0,4.0,One of the nicest entry points to this invitin...,en,1,one nicest entry points inviting waterfront pr...,"[one, nicest, entry, points, inviting, waterfr...","[one, nicest, entri, point, invit, waterfront,..."
1,5.0,"Waterfront to fish or just relax, great place ...",en,1,waterfront fish relax great place picnic hango...,"[waterfront, fish, relax, great, place, picnic...","[waterfront, fish, relax, great, place, picnic..."
2,4.0,Everything except the parking is good here.,en,1,everything except parking good,"[everything, except, parking, good]","[everyth, except, park, good]"
3,4.0,"Defenely the best park in Montreal East, Tetre...",en,1,defenely best park montreal east tetreauville ...,"[defenely, best, park, montreal, east, tetreau...","[defen, best, park, montreal, east, tetreauvil..."
4,5.0,It's so peaceful and happy place near the water,en,1,peaceful happy place near water,"[peaceful, happy, place, near, water]","[peac, happi, place, near, water]"


In [56]:
data_cleanFr['text_tokens_stem'] = data_cleanFr['text_tokens'].apply(lambda x: word_stemmer(x, 'french'))
data_cleanFr.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean,text_tokens,text_tokens_stem
0,4.0,Tres beau parc pour faire un pinic et profiter...,fr,1,tres beau parc faire pinic profiter beaute nature,"[tres, beau, parc, faire, pinic, profiter, bea...","[tre, beau, parc, fair, pinic, profit, beaut, ..."
1,5.0,Les gens sont tous très sociables.\nExceptionnel.,fr,1,gens tous trs sociablesexceptionnel,"[gens, tous, trs, sociablesexceptionnel]","[gen, tous, tr, sociablesexceptionnel]"
2,5.0,"Belle place pour marcher, jogger, promener le ...",fr,1,belle place marcher jogger promener chien fair...,"[belle, place, marcher, jogger, promener, chie...","[bel, plac, march, jogg, promen, chien, fair, ..."
3,4.0,Nourriture excellente !,fr,1,nourriture excellente,"[nourriture, excellente]","[nourritur, excellent]"
4,4.0,J aime bien pour prendre de l air frais,fr,1,aime bien prendre air frais,"[aime, bien, prendre, air, frais]","[aim, bien, prendr, air, frais]"


### Word lemmatization

In [58]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andreamock/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [60]:
def word_lemmatizer(text):
    lem_text = [WordNetLemmatizer().lemmatize(i) for i in text]
    return lem_text

In [61]:
data_cleanEn['text_tokens_lemma'] = data_cleanEn['text_tokens'].apply(lambda x: word_lemmatizer(x))
data_cleanEn.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean,text_tokens,text_tokens_stem,text_tokens_lemma
0,4.0,One of the nicest entry points to this invitin...,en,1,one nicest entry points inviting waterfront pr...,"[one, nicest, entry, points, inviting, waterfr...","[one, nicest, entri, point, invit, waterfront,...","[one, nicest, entry, point, inviting, waterfro..."
1,5.0,"Waterfront to fish or just relax, great place ...",en,1,waterfront fish relax great place picnic hango...,"[waterfront, fish, relax, great, place, picnic...","[waterfront, fish, relax, great, place, picnic...","[waterfront, fish, relax, great, place, picnic..."
2,4.0,Everything except the parking is good here.,en,1,everything except parking good,"[everything, except, parking, good]","[everyth, except, park, good]","[everything, except, parking, good]"
3,4.0,"Defenely the best park in Montreal East, Tetre...",en,1,defenely best park montreal east tetreauville ...,"[defenely, best, park, montreal, east, tetreau...","[defen, best, park, montreal, east, tetreauvil...","[defenely, best, park, montreal, east, tetreau..."
4,5.0,It's so peaceful and happy place near the water,en,1,peaceful happy place near water,"[peaceful, happy, place, near, water]","[peac, happi, place, near, water]","[peaceful, happy, place, near, water]"


In [62]:
data_cleanFr['text_tokens_lemma'] = data_cleanFr['text_tokens'].apply(lambda x: word_lemmatizer(x))
data_cleanFr.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean,text_tokens,text_tokens_stem,text_tokens_lemma
0,4.0,Tres beau parc pour faire un pinic et profiter...,fr,1,tres beau parc faire pinic profiter beaute nature,"[tres, beau, parc, faire, pinic, profiter, bea...","[tre, beau, parc, fair, pinic, profit, beaut, ...","[tres, beau, parc, faire, pinic, profiter, bea..."
1,5.0,Les gens sont tous très sociables.\nExceptionnel.,fr,1,gens tous trs sociablesexceptionnel,"[gens, tous, trs, sociablesexceptionnel]","[gen, tous, tr, sociablesexceptionnel]","[gen, tous, trs, sociablesexceptionnel]"
2,5.0,"Belle place pour marcher, jogger, promener le ...",fr,1,belle place marcher jogger promener chien fair...,"[belle, place, marcher, jogger, promener, chie...","[bel, plac, march, jogg, promen, chien, fair, ...","[belle, place, marcher, jogger, promener, chie..."
3,4.0,Nourriture excellente !,fr,1,nourriture excellente,"[nourriture, excellente]","[nourritur, excellent]","[nourriture, excellente]"
4,4.0,J aime bien pour prendre de l air frais,fr,1,aime bien prendre air frais,"[aime, bien, prendre, air, frais]","[aim, bien, prendr, air, frais]","[aime, bien, prendre, air, frais]"


### Part of speech tagging

In [64]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/andreamock/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [65]:
def word_pos_tagger(text):
    pos_tagged_text = nltk.pos_tag(text)
    return pos_tagged_text

In [66]:
data_cleanEn['text_tokens_pos_tagged'] = data_cleanEn['text_tokens'].apply(lambda x: word_pos_tagger(x))
data_cleanEn.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean,text_tokens,text_tokens_stem,text_tokens_lemma,text_tokens_pos_tagged
0,4.0,One of the nicest entry points to this invitin...,en,1,one nicest entry points inviting waterfront pr...,"[one, nicest, entry, points, inviting, waterfr...","[one, nicest, entri, point, invit, waterfront,...","[one, nicest, entry, point, inviting, waterfro...","[(one, CD), (nicest, JJS), (entry, NN), (point..."
1,5.0,"Waterfront to fish or just relax, great place ...",en,1,waterfront fish relax great place picnic hango...,"[waterfront, fish, relax, great, place, picnic...","[waterfront, fish, relax, great, place, picnic...","[waterfront, fish, relax, great, place, picnic...","[(waterfront, JJ), (fish, JJ), (relax, NN), (g..."
2,4.0,Everything except the parking is good here.,en,1,everything except parking good,"[everything, except, parking, good]","[everyth, except, park, good]","[everything, except, parking, good]","[(everything, NN), (except, IN), (parking, VBG..."
3,4.0,"Defenely the best park in Montreal East, Tetre...",en,1,defenely best park montreal east tetreauville ...,"[defenely, best, park, montreal, east, tetreau...","[defen, best, park, montreal, east, tetreauvil...","[defenely, best, park, montreal, east, tetreau...","[(defenely, RB), (best, JJS), (park, NN), (mon..."
4,5.0,It's so peaceful and happy place near the water,en,1,peaceful happy place near water,"[peaceful, happy, place, near, water]","[peac, happi, place, near, water]","[peaceful, happy, place, near, water]","[(peaceful, JJ), (happy, JJ), (place, NN), (ne..."


In [67]:
data_cleanFr['text_tokens_pos_tagged'] = data_cleanFr['text_tokens'].apply(lambda x: word_pos_tagger(x))
data_cleanFr.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean,text_tokens,text_tokens_stem,text_tokens_lemma,text_tokens_pos_tagged
0,4.0,Tres beau parc pour faire un pinic et profiter...,fr,1,tres beau parc faire pinic profiter beaute nature,"[tres, beau, parc, faire, pinic, profiter, bea...","[tre, beau, parc, fair, pinic, profit, beaut, ...","[tres, beau, parc, faire, pinic, profiter, bea...","[(tres, NNS), (beau, VBP), (parc, JJ), (faire,..."
1,5.0,Les gens sont tous très sociables.\nExceptionnel.,fr,1,gens tous trs sociablesexceptionnel,"[gens, tous, trs, sociablesexceptionnel]","[gen, tous, tr, sociablesexceptionnel]","[gen, tous, trs, sociablesexceptionnel]","[(gens, NNS), (tous, JJ), (trs, NNS), (sociabl..."
2,5.0,"Belle place pour marcher, jogger, promener le ...",fr,1,belle place marcher jogger promener chien fair...,"[belle, place, marcher, jogger, promener, chie...","[bel, plac, march, jogg, promen, chien, fair, ...","[belle, place, marcher, jogger, promener, chie...","[(belle, JJ), (place, NN), (marcher, RB), (jog..."
3,4.0,Nourriture excellente !,fr,1,nourriture excellente,"[nourriture, excellente]","[nourritur, excellent]","[nourriture, excellente]","[(nourriture, NN), (excellente, NN)]"
4,4.0,J aime bien pour prendre de l air frais,fr,1,aime bien prendre air frais,"[aime, bien, prendre, air, frais]","[aim, bien, prendr, air, frais]","[aime, bien, prendre, air, frais]","[(aime, RB), (bien, JJ), (prendre, NN), (air, ..."


In [72]:
data_cleanEn['text_cleaned'] = data_cleanEn['text_tokens_lemma'].apply(lambda x: " ".join(x))
data_cleanEn.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean,text_tokens,text_tokens_stem,text_tokens_lemma,text_tokens_pos_tagged,text_cleaned
0,4.0,One of the nicest entry points to this invitin...,en,1,one nicest entry points inviting waterfront pr...,"[one, nicest, entry, points, inviting, waterfr...","[one, nicest, entri, point, invit, waterfront,...","[one, nicest, entry, point, inviting, waterfro...","[(one, CD), (nicest, JJS), (entry, NN), (point...",one nicest entry point inviting waterfront pro...
1,5.0,"Waterfront to fish or just relax, great place ...",en,1,waterfront fish relax great place picnic hango...,"[waterfront, fish, relax, great, place, picnic...","[waterfront, fish, relax, great, place, picnic...","[waterfront, fish, relax, great, place, picnic...","[(waterfront, JJ), (fish, JJ), (relax, NN), (g...",waterfront fish relax great place picnic hango...
2,4.0,Everything except the parking is good here.,en,1,everything except parking good,"[everything, except, parking, good]","[everyth, except, park, good]","[everything, except, parking, good]","[(everything, NN), (except, IN), (parking, VBG...",everything except parking good
3,4.0,"Defenely the best park in Montreal East, Tetre...",en,1,defenely best park montreal east tetreauville ...,"[defenely, best, park, montreal, east, tetreau...","[defen, best, park, montreal, east, tetreauvil...","[defenely, best, park, montreal, east, tetreau...","[(defenely, RB), (best, JJS), (park, NN), (mon...",defenely best park montreal east tetreauville ...
4,5.0,It's so peaceful and happy place near the water,en,1,peaceful happy place near water,"[peaceful, happy, place, near, water]","[peac, happi, place, near, water]","[peaceful, happy, place, near, water]","[(peaceful, JJ), (happy, JJ), (place, NN), (ne...",peaceful happy place near water


In [73]:
data_cleanFr['text_cleaned'] = data_cleanFr['text_tokens_lemma'].apply(lambda x: " ".join(x))
data_cleanFr.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean,text_tokens,text_tokens_stem,text_tokens_lemma,text_tokens_pos_tagged,text_cleaned
0,4.0,Tres beau parc pour faire un pinic et profiter...,fr,1,tres beau parc faire pinic profiter beaute nature,"[tres, beau, parc, faire, pinic, profiter, bea...","[tre, beau, parc, fair, pinic, profit, beaut, ...","[tres, beau, parc, faire, pinic, profiter, bea...","[(tres, NNS), (beau, VBP), (parc, JJ), (faire,...",tres beau parc faire pinic profiter beaute nature
1,5.0,Les gens sont tous très sociables.\nExceptionnel.,fr,1,gens tous trs sociablesexceptionnel,"[gens, tous, trs, sociablesexceptionnel]","[gen, tous, tr, sociablesexceptionnel]","[gen, tous, trs, sociablesexceptionnel]","[(gens, NNS), (tous, JJ), (trs, NNS), (sociabl...",gen tous trs sociablesexceptionnel
2,5.0,"Belle place pour marcher, jogger, promener le ...",fr,1,belle place marcher jogger promener chien fair...,"[belle, place, marcher, jogger, promener, chie...","[bel, plac, march, jogg, promen, chien, fair, ...","[belle, place, marcher, jogger, promener, chie...","[(belle, JJ), (place, NN), (marcher, RB), (jog...",belle place marcher jogger promener chien fair...
3,4.0,Nourriture excellente !,fr,1,nourriture excellente,"[nourriture, excellente]","[nourritur, excellent]","[nourriture, excellente]","[(nourriture, NN), (excellente, NN)]",nourriture excellente
4,4.0,J aime bien pour prendre de l air frais,fr,1,aime bien prendre air frais,"[aime, bien, prendre, air, frais]","[aim, bien, prendr, air, frais]","[aime, bien, prendre, air, frais]","[(aime, RB), (bien, JJ), (prendre, NN), (air, ...",aime bien prendre air frais


## Finding the most common words

In [75]:
from collections import Counter

import string

In [76]:
def getMostCommonWords(reviews, n_most_common, stopwords=None):

    # flatten review column into a list of words, and set each to lowercase
    flattened_reviews = [word for review in reviews for word in \
                         review.lower().split()]


    # remove punctuation from reviews
    flattened_reviews = [''.join(char for char in review if \
                                 char not in string.punctuation) for \
                         review in flattened_reviews]


    # remove stopwords, if applicable
    if stopwords:
        flattened_reviews = [word for word in flattened_reviews if \
                             word not in stopwords]


    # remove any empty strings that were created by this process
    flattened_reviews = [review for review in flattened_reviews if review]

    return Counter(flattened_reviews).most_common(n_most_common)

In [78]:
getMostCommonWords(data_cleanEn['text_cleaned'], 15)

[('park', 10506),
 ('place', 6909),
 ('nice', 6287),
 ('great', 4611),
 ('beautiful', 3717),
 ('good', 2514),
 ('kid', 2424),
 ('lot', 2376),
 ('montreal', 2215),
 ('walk', 2038),
 ('view', 1805),
 ('one', 1693),
 ('area', 1649),
 ('city', 1573),
 ('time', 1537)]

In [77]:
getMostCommonWords(data_cleanFr['text_cleaned'], 15)

[('parc', 7548),
 ('trs', 4270),
 ('beau', 4043),
 ('a', 3013),
 ('bien', 2202),
 ('endroit', 1988),
 ('enfants', 1786),
 ('cest', 1697),
 ('belle', 1662),
 ('jeux', 1504),
 ('grand', 1263),
 ('plus', 1239),
 ('terrain', 1152),
 ('petit', 1134),
 ('beaucoup', 1071)]

## Text vectorization

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split

In [90]:
vectorizer = TfidfVectorizer(min_df=15)
bow = vectorizer.fit_transform(list(data_cleanEn['text_cleaned']))
labels = data_cleanEn['label']

In [91]:
bow.shape

(25225, 1781)

In [92]:
len(vectorizer.get_feature_names())

1781

In [93]:
tfidfDict = dict(zip(vectorizer.get_feature_names(), bow.toarray()[0]))

In [94]:
tfidfDict['appreciated']

0.0

In [95]:
featureDf = pd.DataFrame.from_dict(tfidfDict, 
                                   orient='index', columns=['tfidf'])
featureDf.reset_index(inplace=True)
featureDf = featureDf.rename(columns = {'index':'feature'})

In [96]:
featureDf.sort_values('tfidf')[-10:]

Unnamed: 0,feature,tfidf
254,chalet,0.23186
1042,offer,0.23846
1581,toilet,0.243946
578,forget,0.24915
462,enjoyable,0.261808
1578,today,0.262938
1020,nicest,0.273447
1700,waterfront,0.273447
471,entry,0.283836
1212,promenade,0.306286


Now we can take a look at the words that have the highest tfidf score in the positive and negative sentiment datasets.

In [89]:
pos_reviewsEn = data_cleanEn[data_cleanEn['label'] == 1]
neg_reviewsEn = data_cleanEn[data_cleanEn['label'] == 0]

In [97]:
vectorizer_pos = TfidfVectorizer(min_df=15)
bow_pos = vectorizer_pos.fit_transform(list(pos_reviewsEn['text_cleaned']))
labels_pos = pos_reviewsEn['label']

In [98]:
vectorizer_neg = TfidfVectorizer(min_df=15)
bow_neg = vectorizer_neg.fit_transform(list(neg_reviewsEn['text_cleaned']))
labels_neg = neg_reviewsEn['label']

In [99]:
tfidfDictPos = dict(zip(vectorizer_pos.get_feature_names(), bow_pos.toarray()[0]))
tfidfDictNeg = dict(zip(vectorizer_neg.get_feature_names(), bow_neg.toarray()[0]))

In [100]:
posFeatureDf = pd.DataFrame.from_dict(tfidfDictPos, 
                                   orient='index', columns=['tfidf'])
posFeatureDf.reset_index(inplace=True)
posFeatureDf = posFeatureDf.rename(columns = {'index':'feature'})

In [101]:
negFeatureDf = pd.DataFrame.from_dict(tfidfDictNeg, 
                                   orient='index', columns=['tfidf'])
negFeatureDf.reset_index(inplace=True)
negFeatureDf = negFeatureDf.rename(columns = {'index':'feature'})

In [102]:
posFeatureDf.sort_values('tfidf')[-15:]

Unnamed: 0,feature,tfidf
1141,river,0.191344
1082,public,0.196317
373,dont,0.199794
29,along,0.207732
1049,point,0.220836
227,chalet,0.229339
933,offer,0.236938
516,forget,0.246874
1397,toilet,0.248596
410,enjoyable,0.258051


In [105]:
negFeatureDf.sort_values('tfidf')[-10:]

Unnamed: 0,feature,tfidf
137,full,0.0
136,front,0.0
135,friendly,0.0
134,friend,0.0
133,french,0.0
141,garden,0.0
416,youre,0.0
241,ok,0.45997
394,want,0.55537
34,beer,0.692814


## Splitting data for train and test

In [106]:
neg_reviewsEn.shape

(2783, 10)

In [107]:
pos_reviewsEn.shape

(22442, 10)

In [109]:
pos_sample = pos_reviewsEn.sample(2783)
pos_sample.head()

Unnamed: 0,num_stars,review_text,lang,label,text_clean,text_tokens,text_tokens_stem,text_tokens_lemma,text_tokens_pos_tagged,text_cleaned
2463,5.0,Great park. One of the best in the city.,en,1,great park one best city,"[great, park, one, best, city]","[great, park, one, best, citi]","[great, park, one, best, city]","[(great, JJ), (park, NN), (one, CD), (best, JJ...",great park one best city
3162,5.0,Great park to play softball,en,1,great park play softball,"[great, park, play, softball]","[great, park, play, softbal]","[great, park, play, softball]","[(great, JJ), (park, NN), (play, NN), (softbal...",great park play softball
5153,5.0,It's just a nice place where families and frie...,en,1,nice place families friends could come together,"[nice, place, families, friends, could, come, ...","[nice, place, famili, friend, could, come, tog...","[nice, place, family, friend, could, come, tog...","[(nice, JJ), (place, NN), (families, NNS), (fr...",nice place family friend could come together
7581,5.0,Cool,en,1,cool,[cool],[cool],[cool],"[(cool, NN)]",cool
18768,5.0,"This is an amazing park. Big one, with activit...",en,1,amazing park big one activities year long nice...,"[amazing, park, big, one, activities, year, lo...","[amaz, park, big, one, activ, year, long, nice...","[amazing, park, big, one, activity, year, long...","[(amazing, VBG), (park, NN), (big, JJ), (one, ...",amazing park big one activity year long nice s...


In [110]:
trainDataEn = pd.concat([pos_sample,neg_reviewsEn])
trainDataEn.shape

(5566, 10)

In [116]:
vectorizerEn = TfidfVectorizer(min_df=15)
bowEn = vectorizerEn.fit_transform(trainDataEn['text_cleaned'])

In [139]:
vectorizerEnTotal = TfidfVectorizer(min_df=15)
bowEnTotal = vectorizerEnTotal.fit_transform(data_cleanEn['text_cleaned'])

In [121]:
X_train, X_test, y_train, y_test = train_test_split(bowEn, trainDataEn['label'], test_size=0.3)

In [140]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(bowEnTotal, 
                                                        data_cleanEn['label'], test_size=0.3)

In [122]:
# check out the dataset 
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3896, 677)
(1670, 677)
(3896,)
(1670,)


In [142]:
# check out the dataset 
print(X_train1.shape)
print(X_test1.shape)
print(y_train.shape)
print(y_test1.shape)

(17657, 1781)
(7568, 1781)
(3896,)
(7568,)


### Using Random Forest classifier 

In [123]:
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

In [124]:
classifier = rfc()
classifier.fit(X_train,y_train)
classifier.score(X_test,y_test)

0.7395209580838323

In [125]:
print("train shape: " + str(X_train.shape))
print("score on test: " + str(classifier.score(X_test, y_test)))
print("score on train: "+ str(classifier.score(X_train, y_train)))

train shape: (3896, 677)
score on test: 0.7395209580838323
score on train: 0.9535420944558521


In [143]:
classifier1 = rfc()
classifier1.fit(X_train1,y_train1)

print("train shape: " + str(X_train1.shape))
print("score on test: " + str(classifier1.score(X_test1, y_test1)))
print("score on train: "+ str(classifier1.score(X_train1, y_train1)))

train shape: (17657, 1781)
score on test: 0.9052589852008457
score on train: 0.9835192841366031


## Decision Tree

In [126]:
%%time

from sklearn.tree import DecisionTreeClassifier

clfdt = DecisionTreeClassifier(min_samples_split=30,max_depth=10)
clfdt.fit(X_train, y_train)

print("train shape: " + str(X_train.shape))
print("score on test: "  + str(clfdt.score(X_test, y_test)))
print("score on train: " + str(clfdt.score(X_train, y_train)))

train shape: (3896, 677)
score on test: 0.6730538922155689
score on train: 0.7358829568788501
CPU times: user 47.4 ms, sys: 3.05 ms, total: 50.5 ms
Wall time: 53.8 ms


In [144]:
%%time

clfdt1 = DecisionTreeClassifier(min_samples_split=30,max_depth=10)
clfdt1.fit(X_train1, y_train1)

print("train shape: " + str(X_train1.shape))
print("score on test: "  + str(clfdt1.score(X_test1, y_test1)))
print("score on train: " + str(clfdt1.score(X_train1, y_train1)))

train shape: (17657, 1781)
score on test: 0.8970665961945031
score on train: 0.9030979215042193
CPU times: user 260 ms, sys: 10.5 ms, total: 271 ms
Wall time: 398 ms


In [127]:
%%time

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bg=BaggingClassifier(DecisionTreeClassifier(min_samples_split=10,max_depth=3),max_samples=0.5,max_features=1.0,n_estimators=10)
bg.fit(X_train, y_train)

print("train shape: " + str(X_train.shape))
print("score on test: " + str(bg.score(X_test, y_test)))
print("score on train: "+ str(bg.score(X_train, y_train)))

train shape: (3896, 677)
score on test: 0.6395209580838324
score on train: 0.6419404517453798
CPU times: user 125 ms, sys: 9.84 ms, total: 135 ms
Wall time: 167 ms


In [146]:
%%time

bg1=BaggingClassifier(DecisionTreeClassifier(min_samples_split=10,max_depth=3),max_samples=0.5,max_features=1.0,n_estimators=10)
bg1.fit(X_train1, y_train1)

print("train shape: " + str(X_train1.shape))
print("score on test: " + str(bg1.score(X_test1, y_test1)))
print("score on train: "+ str(bg1.score(X_train1, y_train1)))

train shape: (17657, 1781)
score on test: 0.8940274841437632
score on train: 0.8933567423684657
CPU times: user 465 ms, sys: 17.4 ms, total: 482 ms
Wall time: 621 ms


In [128]:
# boosting decision tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# setting 
# min_samples_split=10
# max_depth=4

adb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=100,learning_rate=0.5)
adb.fit(X_train, y_train)

print("train shape: " + str(X_train.shape))
print("score on test: " + str(adb.score(X_test, y_test)))
print("score on train: "+ str(adb.score(X_train, y_train)))

train shape: (3896, 677)
score on test: 0.7119760479041917
score on train: 0.8429158110882957


In [147]:
# boosting decision tree

adb1 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=100,learning_rate=0.5)
adb1.fit(X_train1, y_train1)

print("train shape: " + str(X_train1.shape))
print("score on test: " + str(adb1.score(X_test1, y_test1)))
print("score on train: "+ str(adb1.score(X_train1, y_train1)))

train shape: (17657, 1781)
score on test: 0.9011627906976745
score on train: 0.9138018916010647


In [129]:
X_train[5]

<1x677 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

## Naive Bayes

Sklearn Documentation:

- Naive Bayes: https://scikit-learn.org/stable/modules/naive_bayes.html
- MultinomialNB: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB

In [148]:
from sklearn.naive_bayes import MultinomialNB

In [131]:
%%time
mnb = MultinomialNB().fit(X_train, y_train)

CPU times: user 4.74 ms, sys: 5.39 ms, total: 10.1 ms
Wall time: 13.9 ms


In [132]:
print("train shape: " + str(X_train.shape))
print("score on test: " + str(mnb.score(X_test, y_test)))
print("score on train: "+ str(mnb.score(X_train, y_train)))

train shape: (3896, 677)
score on test: 0.7491017964071857
score on train: 0.7936344969199178


In [149]:
%%time
mnb1 = MultinomialNB().fit(X_train1, y_train1)

print("train shape: " + str(X_train1.shape))
print("score on test: " + str(mnb1.score(X_test1, y_test1)))
print("score on train: "+ str(mnb1.score(X_train1, y_train1)))

train shape: (17657, 1781)
score on test: 0.8982558139534884
score on train: 0.9032111910290537
CPU times: user 22.2 ms, sys: 24.5 ms, total: 46.8 ms
Wall time: 65 ms


## Logistic Regression 

Sklearn Documentation:

- LogisticRegression: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
- SGD Classifier: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier

In [133]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

In [134]:
%%time

lr=LogisticRegression(max_iter=5000)
lr.fit(X_train, y_train)

CPU times: user 177 ms, sys: 44.1 ms, total: 221 ms
Wall time: 374 ms


LogisticRegression(max_iter=5000)

In [135]:
print("train shape: " + str(X_train.shape))
print("score on test: " + str(lr.score(X_test, y_test)))
print("score on train: "+ str(lr.score(X_train, y_train)))

train shape: (3896, 677)
score on test: 0.7449101796407186
score on train: 0.8128850102669405


In [150]:
%%time

lr1=LogisticRegression(max_iter=5000)
lr1.fit(X_train1, y_train1)
print("train shape: " + str(X_train1.shape))
print("score on test: " + str(lr1.score(X_test1, y_test1)))
print("score on train: "+ str(lr1.score(X_train1, y_train1)))

train shape: (17657, 1781)
score on test: 0.9053911205073996
score on train: 0.9085348586962678
CPU times: user 268 ms, sys: 25.2 ms, total: 293 ms
Wall time: 446 ms


In [136]:
%%time

#logistic regression with stochastic gradient decent
sgd=SGDClassifier()
sgd.fit(X_train, y_train)

CPU times: user 21.3 ms, sys: 7.03 ms, total: 28.3 ms
Wall time: 52 ms


SGDClassifier()

In [137]:
print("train shape: " + str(X_train.shape))
print("score on test: " + str(sgd.score(X_test, y_test)))
print("score on train: "+ str(sgd.score(X_train, y_train)))

train shape: (3896, 677)
score on test: 0.7317365269461078
score on train: 0.8287987679671458


In [151]:
%%time

#logistic regression with stochastic gradient decent
sgd1=SGDClassifier()
sgd1.fit(X_train1, y_train1)

print("train shape: " + str(X_train1.shape))
print("score on test: " + str(sgd1.score(X_test1, y_test1)))
print("score on train: "+ str(sgd1.score(X_train1, y_train1)))

train shape: (17657, 1781)
score on test: 0.9024841437632135
score on train: 0.9054765815257405
CPU times: user 45.9 ms, sys: 8.87 ms, total: 54.7 ms
Wall time: 79.6 ms


## K-nearest neighbors

In [138]:
%%time

from sklearn.neighbors import KNeighborsClassifier

#knn = KNeighborsClassifier(n_neighbors=5,algorithm = 'ball_tree')
knn = KNeighborsClassifier(algorithm = 'brute', n_jobs=-1)

knn.fit(X_train, y_train)

print("train shape: " + str(X_train.shape))
print("score on test: " + str(knn.score(X_test, y_test)))
print("score on train: "+ str(knn.score(X_train, y_train)))

train shape: (3896, 677)
score on test: 0.6239520958083832
score on train: 0.7474332648870636
CPU times: user 1.43 s, sys: 401 ms, total: 1.83 s
Wall time: 2.16 s


In [153]:
%%time
#knn = KNeighborsClassifier(n_neighbors=5,algorithm = 'ball_tree')
knn1 = KNeighborsClassifier(algorithm = 'brute', n_jobs=-1)

knn1.fit(X_train1, y_train1)

print("train shape: " + str(X_train1.shape))
print("score on test: " + str(knn1.score(X_test1, y_test1)))
print("score on train: "+ str(knn1.score(X_train1, y_train1)))

train shape: (17657, 1781)
score on test: 0.8665433403805497
score on train: 0.895735402389987
CPU times: user 36.9 s, sys: 11 s, total: 48 s
Wall time: 43.4 s


## Neural network pre-programmed

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#allReviewEn[['']]

train, test = train_test_split(allReviewEn, test_size = 0.3, random_state=42)

# clean the indexing
train.reset_index(drop=True),test.reset_index(drop=True)

# save train and test in csv files 
train[['review_text', 'label']].to_csv('all_en_train.csv', index=False)
test[['review_text', 'label']].to_csv('all_en_test.csv', index=False)

### Using Torchtest to processs text data

import numpy as np 

import torch 
import torchtext

from torchtext.legacy.data import Field, BucketIterator, TabularDataset, LabelField

import nltk 
nltk.download('punkt') # for punkt tokenizer

from nltk import word_tokenize 

# torchtext field parameter specifies how data should be processed, here tokenized
TEXT = Field(tokenize = word_tokenize)

LABEL = LabelField(dtype = torch.float) # convert 

datafields = [ ('review_text', TEXT), ('label', LABEL)] 

# specify what data that will work with, split to train and text, map to field 
trn, tst = TabularDataset.splits(path = '/Users/andreamock/Documents/review_datasets',
                               train = 'all_en_train.csv', test = 'all_en_test.csv', format = 'csv',
                               skip_header = True, fields = datafields)


# training examples 
trn[:5]

print(f'Number of training examples: {len(trn)}')
print(f'Number of testing examples: {len(tst)}')

# each example has label and text
trn[5].__dict__.keys()

trn[1].review_text # text has been tokenized in individual words

trn[1].label

# limit size of feature vectors to 15000, use one-encoding to get the top 15000 words in vocab
TEXT.build_vocab(trn, max_size = 15000)

LABEL.build_vocab(trn)

print(f'Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}')
print(f'Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}')
# two additional tokens were added to vocab, one for unknown words and another for padding to make sentences equal lengths

print(TEXT.vocab.freqs.most_common(50)) 

print(TEXT.vocab.itos[:10]) # integer to string mapping 0 and 1 to unknown and padding

batch_size = 64 

# returns a batch of examples where each example is of similar length (thus minimizing padding for each example)
train_iterator, test_iterator = BucketIterator.splits(
    (trn,tst), batch_size = batch_size, sort_key = lambda x: len(x.review_text), sort_within_batch = False
)

## Designing an RNN for binary text classification 

import torch.nn as nn

class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        # input_dim = input dimensions of words 
        # embedding_dim = dimension of word embeddings, dense word representation for training RNN
        # hidden_dim = dimension of hidden state of RNN
        # output_dim = output dimensions of RNN output
        
        super().__init__()
        #  convert one-hot encoded sentences to dense format using embeddings to represent each word
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # input to rnn is current word's embedding and previous hidden state, one word per time instance (memory cell)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        # fully connected layer to classify as positive or negative 
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, text):
        # input sentence (list of indexes of one hot encoded words) is represented using its embedding
        embedded = self.embedding(text)
        
        embedded_dropout = self.dropout(embedded)
        
        # output = concatentation of hidden state for every time step (ie word) [sentence length, batch size, hiddendim]
        # hidden = final hidden state fed into linear layer
        output, (hidden, _) = self.rnn(embedded_dropout)
        
        hidden_1D = hidden.squeeze(0) # get rid of unnecessary dimension 
        
        assert torch.equal(output[-1, :, :], hidden_1D) # confirm that it is indeed last hidden state 
        
        return self.fc(hidden_1D) # last hidden state fed into fully connected layer

# setting dimensions 
input_dim = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 256
output_dim=1

model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)

model # see what our model looks like

# train with optimizer
import torch.optim as optim 

optimizer = optim.Adam(model.parameters(), lr=1e-6)

# binary cross entropy with logits (cross-entropy for binary classification, 
# w/ sigmoid activation func to predict in range of 0 and 1)
criterion = nn.BCEWithLogitsLoss()

def train(model, iterator, optimizer, criterion): # helper function for training process
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:  # iterator over all batches of training data
        
        optimizer.zero_grad() # zero out gradients of optimizer
                
        predictions = model(batch.review_text).squeeze(1) # make predictions, squeeze to be 1d instead of [, ]
        
        loss = criterion(predictions, batch.label) # calculate loss
        
        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.label).float() # how many were correct
        
        acc = correct.sum() / len(correct)
        
        loss.backward() # backward pass on rnn
        
        optimizer.step()
        
        epoch_loss += loss.item() # keep track of epoch loss and accuracy
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

num_epochs = 5

for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    print(f' Epoch: {epoch+1}, Train loss: {train_loss}, Train Acc: {train_acc*100:.2f}%')

Now we can test the accuracy on our test data.

# don't want to update the parameters when evaluating the accuracy
epoch_loss = 0
epoch_acc = 0

model.eval()

with torch.no_grad():

    for batch in test_iterator:

        predictions = model(batch.review_text).squeeze(1)

        loss = criterion(predictions, batch.label)

        rounded_preds = torch.round(torch.sigmoid(predictions))
        
        correct = (rounded_preds == batch.label).float() 
        acc = correct.sum() / len(correct)

        epoch_loss += loss.item()
        epoch_acc += acc.item()

test_loss = epoch_loss / len(test_iterator)
test_acc  = epoch_acc / len(test_iterator)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')