# PostNord Trustpilot Reviews

## Structure of the code

1. Combine header and text column
2. Anonymisation
    - Remove names
    - Remove dates
    - Create unique ids for each person
    - Use review and url to remove repeated comments
    - Remove urls
3. Cleaning and processing
    - If a comment was only punctuation => remove that row
4. Balance data (around 17,000 per category)
    - Randomise data (in categories)
    - Keep the first 17,000 of each category

## Initial code

In [254]:
#!pip install -U spacy
#!python -m spacy download da_core_news_md
#!pip install scikit-learn nltk tensorflow

In [379]:
# system tools
import os
import sys
import random
import spacy
import da_core_news_md
import requests as req

# simple text processing tools
import re
import tqdm
import unicodedata
from bs4 import BeautifulSoup
import nltk
nltk.download('punkt')

# data wranling
import pandas as pd
import numpy as np

# tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Dense,
                                     Flatten,
                                     Conv1D,
                                     MaxPooling1D,
                                     Embedding,
                                     LSTM,
                                     Dropout)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.regularizers import L2
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB

from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.models import Model
from sklearn.linear_model import LogisticRegression



# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix, classification_report)
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# visualisations 
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to /home/ucloud/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [256]:
# define path
path = os.path.join("..", "in", "postnord_trustpilot_reviews.csv")

In [257]:
# read csv
df = pd.read_csv(path)
# fill empty columns with white space
df.fillna(" ", inplace = True)
# rename columns
df.columns = ['order', 'name', 'date', 'rating', 'text', 'profile_link', 'review_count', 'header']

In [258]:
# make deep copy of the 20 first lines in the data
sm = df.copy(deep = True)

## Combination of header and text column

Hvis headeren er identisk med teksten eller hvis headerens længde er identisk med det tilsvarende vindue i teksten
    
```df['review'] = df['text']```

Ellers: `df['review'] = df['header'] + " " + df['text']`

_Kan også omvendes_

In [259]:
# remove dots from the end of the header
sm['header'] = sm['header'].str.replace('…', '', regex = False)

In [260]:
# create a list
review = []

# loop over the dataframe
for index, row in sm.iterrows():
    # txt is the text-column
    txt = row["text"]
    # head is the header-column
    head = row["header"]
    # search for the header text in the text-column
    x = re.search(f"^{re.escape(head)}", txt)
    # if the header text occurs in the text-column
    if x:
        # append the text column to the list
        review.append(row['text'])
    # otherwise...
    else:
        # append the header column and the text column to the list with a white space in between
        review.append(row['header'] + " " + row['text'])

In [261]:
# create a new review-column from the list
sm['review'] = review

## Removal of duplicates

In [262]:
len(sm)

399980

In [263]:
sm = sm.drop_duplicates(subset=['profile_link', 'date', 'review'], keep='first')

In [264]:
len(sm)

399537

## Anonymisation

In [265]:
sm = sm.drop(columns=['name', 'date', 'profile_link', 'header', 'text', 'order', 'review_count'])

In [266]:
sm.head()

Unnamed: 0,rating,review
0,5,Hurtig levering.
1,5,Altid pakker til tiden
2,5,Som sædvanlig er min pakke leveret på bedste m...
3,5,"Hurtig behandling Alt fungerede, hurtig leveri..."
4,5,"Forbilledligt Pakkepost, når den er bedst!"


## Cleaning and processing

In [267]:
clean_review = []

for text in sm['review'].tolist():
    text = re.sub('[^\w\s]+', '', text)
    text = re.sub(' +', ' ', text)
    text = text.strip()
    text = text.lower()
    clean_review.append(text)

In [268]:
sm['review'] = clean_review

In [269]:
sm.head()

Unnamed: 0,rating,review
0,5,hurtig levering
1,5,altid pakker til tiden
2,5,som sædvanlig er min pakke leveret på bedste m...
3,5,hurtig behandling alt fungerede hurtig leverin...
4,5,forbilledligt pakkepost når den er bedst


In [270]:
sm['review'].replace('', np.nan, inplace=True)
print("# rows before removing empty rows: ", len(sm))
sm.dropna(subset=['review'], inplace=True)
print("# rows after removing empty rows: ", len(sm))

# rows before removing empty rows:  399537
# rows after removing empty rows:  398866


In [271]:
# removal of stopwords
url = "https://gist.githubusercontent.com/berteltorp/0cf8a0c7afea7f25ed754f24cfc2467b/raw/fa34ef448aff6adbb4b6bab9bda62a8b0f1ee597/stopord.txt"
res = req.get(url)

txt_path = os.path.join("..", "in", "stopwords.txt")

file = open(txt_path, "w")
file.write(res.text)
file.close()

In [272]:
with open(txt_path) as f:
    lines = f.readlines()
f.close()

In [273]:
stopwords = []

for l in lines:
    stopwords.append(l.strip())

In [274]:
good_words = ["god", "godt"]

In [275]:
for word in list(stopwords):
    if word in good_words:
        stopwords.remove(word)

In [276]:
sm['no_stopwords'] = sm['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))

In [277]:
sm.head()

Unnamed: 0,rating,review,no_stopwords
0,5,hurtig levering,hurtig levering
1,5,altid pakker til tiden,pakker tiden
2,5,som sædvanlig er min pakke leveret på bedste m...,sædvanlig pakke leveret bedste måde yderst til...
3,5,hurtig behandling alt fungerede hurtig leverin...,hurtig behandling fungerede hurtig leveringtak
4,5,forbilledligt pakkepost når den er bedst,forbilledligt pakkepost bedst


In [278]:
sm['no_stopwords'].replace('', np.nan, inplace=True)
print("# rows before removing empty rows", len(sm))
sm.dropna(subset=['no_stopwords'], inplace=True)
print("# rows after removing empty rows: ", len(sm))

# rows before removing empty rows 398866
# rows after removing empty rows:  397885


In [279]:
sm["review_length"] = sm.review.str.len()
sm["no_stopwords_length"] = sm.no_stopwords.str.len()
sm.sort_values("review_length")

Unnamed: 0,rating,review,no_stopwords,review_length,no_stopwords_length
334133,5,å,å,1,1
244318,5,v,v,1,1
41166,5,5,5,1,1
225933,1,1,1,1,1
233706,5,a,a,1,1
...,...,...,...,...,...
31004,1,advarsel om dato for levering i post nord sms ...,advarsel dato levering post nord sms d 48 modt...,2475,1388
157335,1,hvis man bare kunne give minus 5 stjerner ja h...,give minus 5 stjerner give minus 5 stjerner va...,2618,1485
269276,1,postnord trænger til at finde ud af hvad kunde...,postnord trænger kundeservice egentligt postno...,2828,1503
128294,1,post nord i er forfærdelige jeg skulle have en...,post nord forfærdelige brevpakke afkrydset afh...,2885,1724


## Balancing the data

In [280]:
# 17122 3-star reviews
sm['rating'].value_counts()

5    252952
1     63732
4     43021
2     21108
3     17072
Name: rating, dtype: int64

In [281]:
def balance(dataframe, random_state = 2):
    """
    Create a balanced sample from imbalanced datasets.
    
    dataframe: 
        Pandas dataframe with a column called 'review' and one called 'rating'
    n:         
        Number of samples from each label, defaults to XXXXX
        
    random_state:
        Random state, defaults to 2
    """
    # Use pandas select a random bunch of examples from each label
    out = (dataframe.groupby('rating', as_index=False)
            .apply(lambda x: x.sample(dataframe['rating'].value_counts()[3], random_state = 2))
            .reset_index(drop=True))
    
    return out

In [365]:
df_balanced = balance(sm, random_state = 7)

In [366]:
df_balanced['rating'].value_counts()

1    17072
2    17072
3    17072
4    17072
5    17072
Name: rating, dtype: int64

In [367]:
df_balanced.sort_values("no_stopwords_length")

Unnamed: 0,rating,review,no_stopwords,review_length,no_stopwords_length
9928,1,at i ikke j,j,11,1
81148,5,i får 5,5,7,1
45088,3,de kan få 3,3,11,1
68018,4,4 store,4,7,1
80988,5,kun 1 dag ikke noget at komme efter,1,35,1
...,...,...,...,...,...
6740,1,postnord bryder reglerne for at tjene penge en...,postnord bryder reglerne tjene penge engang ek...,1622,973
29672,2,xxxx onsdag som var varslet som afleveringsdag...,xxxx onsdag varslet afleveringsdag ringet dørt...,1832,987
13890,1,man bliver så træt den 256 bestiller jeg et dy...,træt 256 bestiller dyrt produkt forventet leve...,1742,997
11240,1,dårlig service i en covid19 periode folk har i...,dårlig service covid19 periode folk lært holde...,2076,1051


## Machine

In [368]:
X = df_balanced["no_stopwords"]
y = df_balanced["rating"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
y_train.value_counts()

5    13714
2    13694
1    13684
3    13626
4    13570
Name: rating, dtype: int64

In [369]:
y_train.value_counts().max()

13714

In [375]:
vectorizer = CountVectorizer(ngram_range = (1,3), 
                             lowercase = False,
                             max_df = 0.99,  
                             min_df = 0.01,
                             max_features = None,
                             encoding = 'utf-16')

X_train_feats = vectorizer.fit_transform(X_train)
X_test_feats = vectorizer.transform(X_test)

feature_names = vectorizer.get_feature_names_out()

In [384]:
classifier = LogisticRegression(random_state = 7, max_iter = (y_train.value_counts().max() * 5)).fit(X_train_feats, y_train)

In [385]:
y_pred = classifier.predict(X_test_feats)

classifier_metrics = classification_report(y_test, y_pred)
print(classifier_metrics)

              precision    recall  f1-score   support

           1       0.47      0.54      0.51      3388
           2       0.39      0.35      0.37      3378
           3       0.39      0.29      0.33      3446
           4       0.46      0.34      0.39      3502
           5       0.48      0.72      0.58      3358

    accuracy                           0.45     17072
   macro avg       0.44      0.45      0.43     17072
weighted avg       0.44      0.45      0.43     17072



## Deep learning

In [285]:
X = df_balanced["no_stopwords"].values
y = df_balanced["rating"].values

In [286]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [287]:
# define out-of-vobabulary token
t = Tokenizer(oov_token = "<UNK>")

In [288]:
t.fit_on_texts(X_train)

In [301]:
# set padding value – if the kernel goes outside the matrix, we ensure that the values are still the same length
t.word_index["<PAD>"] = 0
# turns the texts into a list of integer values
X_train_seqs = t.texts_to_sequences(X_train)
X_test_seqs = t.texts_to_sequences(X_test)
# sequence normalisation
#MAX_SEQUENCE_LENGTH = df_balanced.review.str.len().max()
MAX_SEQUENCE_LENGTH = int(df_balanced["no_stopwords"].apply(len).mean())
# add padding to sequences
X_train_pad = sequence.pad_sequences(X_train_seqs, maxlen=MAX_SEQUENCE_LENGTH)
X_test_pad = sequence.pad_sequences(X_test_seqs, maxlen=MAX_SEQUENCE_LENGTH)  

In [302]:
X_train_pad

array([[   0,    0,    0, ...,   79,  714,  347],
       [   0,    0,    0, ...,  140,   44, 1162],
       [   0,    0,    0, ...,  381,   14,   25],
       ...,
       [   0,    0,    0, ...,    0,   23,   15],
       [   0,    0,    0, ..., 1334,  398,  118],
       [   0,    0,    0, ...,    7, 7853,   11]], dtype=int32)

In [303]:
# define parameters for model

# overall vocabulary size
VOCAB_SIZE = len(t.word_index)
# number of dimensions for embeddings
EMBED_SIZE = 300
# number of epochs to train for
EPOCHS = 5
# batch size for training
BATCH_SIZE = 128

In [304]:
# clear models in memory
tf.keras.backend.clear_session()

In [305]:
# create the model
model = Sequential()

# embedding layer
model.add(Embedding(VOCAB_SIZE, 
                    EMBED_SIZE, 
                    input_length=MAX_SEQUENCE_LENGTH))

# first convolution layer and pooling
model.add(Conv1D(filters=128, 
                 kernel_size=4, 
                 padding='same',
                 activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# second convolution layer and pooling
model.add(Conv1D(filters=64, 
                 kernel_size=4, 
                 padding='same', 
                 activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# third convolution layer and pooling
model.add(Conv1D(filters=32, 
                 kernel_size=4, 
                 padding='same', 
                 activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# fully-connected classification layer
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
    
# compile model
model.compile(loss='binary_crossentropy',
              optimizer='adam', 
              metrics=['accuracy'])
# train model
H = model.fit(X_train_pad, y_train,
              epochs = EPOCHS,
              batch_size = BATCH_SIZE,
              validation_split = 0.1, # takes the remaining 10% of the training data, after having trained on 90%
              verbose = True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [306]:
# final evaluation of the model
scores = model.evaluate(X_test_pad, y_test, verbose = 1)
print(f"Accuracy: {scores[1]}")

Accuracy: 0.1984536051750183


In [312]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_balanced['no_stopwords'], df_balanced['rating'], test_size=0.2)

# Tokenize the text data
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert the text data to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences to ensure they all have the same length
X_train_padded = tf.keras.preprocessing.sequence.pad_sequences(X_train_seq, padding='post', maxlen=256)
X_test_padded = tf.keras.preprocessing.sequence.pad_sequences(X_test_seq, padding='post', maxlen=256)

# Define the deep learning model architecture
inputs = Input(shape=(256,))
x = Dense(256, activation='relu')(inputs)
x = Dropout(0.5)(x)
x = Dense(128, activation='relu')(inputs)
x = Dropout(0.5)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)
outputs = Dense(1, activation='sigmoid')(x)
model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_padded, y_train, epochs=5, batch_size=256, validation_split=0.2)

# Evaluate the model on the test set
model.evaluate(X_test_padded, y_test)

Epoch 1/5


  return t[start:end]


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[-242542912.0, 0.1991565078496933]

In [None]:
# do predictions with 0.5 decision boundary
predictions = (model.predict(X_test_pad) > 0.5).astype("int32")
# make classification report
labels = ["non-toxic", "toxic"]
report = classification_report(y_test, predictions, target_names = labels)

In [300]:
int(sm["no_stopwords"].apply(len).mean())

47