# Preprocessing and feature engineering

## Importing libraries and documents

In [343]:
import pandas as pd
import numpy as np
import regex as re
import string
from tqdm.notebook import tqdm
import collections

import seaborn as sns
sns.set(style="darkgrid")
sns.set(font_scale=1.3)
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import nltk
from sklearn import *

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [102]:
df = pd.read_csv("../reviews_scores.csv", index_col=0)
df.head(5)

Unnamed: 0,rating,title,review_body,date,country,sentiment_value,scores,compound,comp_score
0,5,Birthday week,An amazing once in a lifetime experience. You ...,2019 December,United States,positive,"{'neg': 0.0, 'neu': 0.73, 'pos': 0.27, 'compou...",0.9866,positive
1,5,You must visit it!,"Whatever I say, it can't describe this archite...",2020 June,Georgia,positive,"{'neg': 0.0, 'neu': 0.84, 'pos': 0.16, 'compou...",0.658,positive
2,5,Amazing Masterpiece,It is impossible to describe the greatness of ...,2019 December,United States,positive,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,negative
3,5,Incredible building,"An absolutely stunning building, still in the ...",2019 December,France,positive,"{'neg': 0.142, 'neu': 0.573, 'pos': 0.286, 'co...",0.6989,positive
4,5,Gaudi a true Genius!!,Gaudi was an Architectural Legend! Once this ...,2020 November,United Kingdom,positive,"{'neg': 0.123, 'neu': 0.773, 'pos': 0.104, 'co...",-0.2187,neutral


## Text preprocessing pipeline

In [3]:
def remove_specials(review):
    return re.sub('[^A-Za-z0-9]+', ' ', review)

def remove_digits(review):
    return re.sub('\d+', ' ', review)
    
def to_lower(review):
    return review.lower()

def tokenizer(review):
    review = nltk.word_tokenize(review)
    return review

def remove_stopwords(review):
    stop = stopwords.words('english')
    review = [x for x in review if x not in stop]
    return review
    
def remove_empty_tokens(review):
    review = [t for t in review if len(t) > 0]
    return review

def stemming(review):
    porter = PorterStemmer()
    stems = []
    for word in review:
        stems.append(porter.stem(word))
    return stems

def lemmatizer(review):
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in review:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

In [None]:
# Transforming the column with the review
df["review_body"] = df["review_body"].apply(remove_specials).apply(remove_digits).apply(to_lower)
.apply(tokenizer).apply(remove_stopwords).apply(remove_empty_tokens).apply(stemming).apply(lemmatizer)

In [113]:
df.head(5)

Unnamed: 0,rating,title,review_body,date,country,sentiment_value,scores,compound,comp_score
0,5,Birthday week,"[amaz, lifetim, experi, look, photograph, time...",2019 December,United States,positive,"{'neg': 0.0, 'neu': 0.73, 'pos': 0.27, 'compou...",0.9866,positive
1,5,You must visit it!,"[whatev, say, describ, architectur, miracl, ad...",2020 June,Georgia,positive,"{'neg': 0.0, 'neu': 0.84, 'pos': 0.16, 'compou...",0.658,positive
2,5,Amazing Masterpiece,"[imposs, describ, great, cathedr, gaudi, geniu...",2019 December,United States,positive,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,negative
3,5,Incredible building,"[absolut, stun, build, still, process, built, ...",2019 December,France,positive,"{'neg': 0.142, 'neu': 0.573, 'pos': 0.286, 'co...",0.6989,positive
4,5,Gaudi a true Genius!!,"[gaudi, architectur, legend, church, finish, a...",2020 November,United Kingdom,positive,"{'neg': 0.123, 'neu': 0.773, 'pos': 0.104, 'co...",-0.2187,neutral


## Feature selection

Feature selection is considered to be the main step in order to classify sentiments because it transforms unstructured data into structured data. A statistical method that builds BOW with the most relevant words taking into account the frequency of presence of the word in the text measure and `sentiment_value` and `comp_score` class label. 

In [163]:
# Creating a new column without list with words in col
df['review_str'] = [', '.join(map(str, word)) for word in df['review_body']]
df.head(1)

Unnamed: 0,rating,title,review_body,date,country,sentiment_value,scores,compound,comp_score,review_str
0,5,Birthday week,"[amaz, lifetim, experi, look, photograph, time...",2019 December,United States,positive,"{'neg': 0.0, 'neu': 0.73, 'pos': 0.27, 'compou...",0.9866,positive,"amaz, lifetim, experi, look, photograph, time,..."


### Creating review vectors with `CountVectorizer` & Frequency dictionary

In [473]:
#Initializing the model and transforming data
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=4000)
bow = cv.fit_transform(df["review_str"])

#Creating df with vectors of max_features per review
matrix = pd.DataFrame(bow.toarray(),
                      columns=cv.get_feature_names())

In [379]:
#Bringing the target label into the df
matrix[["human_label", "machine_label"]] = df[["sentiment_value", "comp_score"]]

#Encoding the class target
matrix["human_label"] = df.sentiment_value.replace({"positive":3, "neutral":2, "negative": 1})
matrix["machine_label"] = df.comp_score.replace({"positive":3, "neutral":2, "negative": 1})

In [380]:
# Saving file for modelling in next notebook
matrix.to_csv("../matrix_cv.csv")

In [132]:
#Creating a DataFrame for word frequencies for later plotting
cv_word_freq = dict(zip(cv.get_feature_names(), np.asarray(bow.sum(axis=0)).ravel()))
cv_word_counter = collections.Counter(cv_word_freq)
cv_word_counter_df = pd.DataFrame(cv_word_counter.most_common(500), columns = ['word', 'freq'])
cv_word_counter_df.head(5)

Unnamed: 0,word,freq
0,visit,29188
1,see,27334
2,ticket,25640
3,go,22226
4,time,20569


### Creating Bag of Words for subgroup negative reviews

In [144]:
# Creating subsets for negative reviews both machine and human approach
negative_df_h = df[df["sentiment_value"] == "negative"].copy()
negative_df_m = df[df["comp_score"] == "negative"].copy()

#### The human approach

In [146]:
#Initializing the model and transforming data
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=500)
bow_neg_h = cv.fit_transform(negative_df_h["review_str"])

In [150]:
#Creating a DataFrame for word frequencies for latter plotting
word_freq_h = dict(zip(cv.get_feature_names(), np.asarray(bow_neg_h.sum(axis=0)).ravel()))
word_counter_h = collections.Counter(word_freq_h)
word_counter_df_h = pd.DataFrame(word_counter_h.most_common(500), columns = ['word', 'freq'])
word_counter_df_h.head(5)

Unnamed: 0,word,freq
0,ticket,505
1,go,402
2,see,358
3,visit,305
4,time,285


### The machine approach

In [151]:
#Initializing the model and transforming data
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=500)
bow_neg_m = cv.fit_transform(negative_df_m["review_str"])

In [157]:
#Creating a DataFrame for word frequencies for latter plotting
word_freq_m = dict(zip(cv.get_feature_names(), np.asarray(bow_neg_m.sum(axis=0)).ravel()))
word_counter_m = collections.Counter(word_freq_m)
word_counter_df_m = pd.DataFrame(word_counter_m.most_common(500), columns = ['word', 'freq'])
word_counter_df_m.head(5)

Unnamed: 0,word,freq
0,see,1409
1,ticket,1336
2,visit,1320
3,go,1130
4,time,952


In [551]:
def cv_function(string):
    cv = CountVectorizer(max_features=4000)
    my_text = cv.fit_transform(string)
    my_columns = cv.get_feature_names()
    
    cv_test = CountVectorizer(max_features=4000)
    all_words = cv_test.fit_transform(df["review_str"])
    all_columns = cv_test.get_feature_names()
    test_matrix = pd.DataFrame(columns=all_columns)
    test_matrix.loc[0] = 0
    
    for word in my_columns:
        print(word)
        if word in all_columns:
            test_matrix.word = 1
    
    return test_matrix

In [553]:
test = cv_function(["Hello Mar you are beautiful"])

are
beautiful
hello
mar
you


In [561]:
test["beauti"]

0    0
Name: beauti, dtype: object

## Ensembling the pipeline

In [442]:
# Creating transformers from the preprocessing pipeline
from sklearn.preprocessing import FunctionTransformer

SpecialsRemover = FunctionTransformer(remove_specials)
DigitsRemover = FunctionTransformer(remove_digits)
LowerTransformer = FunctionTransformer(to_lower)
WordTokenizer = FunctionTransformer(tokenizer)
StopwordsRemover = FunctionTransformer(remove_stopwords)
EmptyTokensRemover = FunctionTransformer(remove_empty_tokens)
WordStemmer = FunctionTransformer(stemming)
WordLemmatizer = FunctionTransformer(lemmatizer)
Vectorizer = FunctionTransformer(cv_function)

# Creating the pipeline for preprocessing the new reviews
from sklearn.pipeline import Pipeline

preproc_pipe = Pipeline([
    ('specials_remover', SpecialsRemover), 
    ("digits_remover", DigitsRemover),
    ("to_lower", LowerTransformer),
    ("tokenizer", WordTokenizer),
    ("stopwords_remover", StopwordsRemover),
    ("empty_tokens", EmptyTokensRemover),
    ("stemmer", WordStemmer),
    ("lemmatizer", WordLemmatizer),
    ("cv", Vectorizer)])

# Modelling

In [184]:
# Creating df with both machine and human sentiments
machine = matrix.drop("human_label", axis=1)
human = matrix.drop("machine_label", axis=1)

## Splitting the data into train and test set

The two different approaches of the machine and human labels are going to be considered in the model. To split the data into train and test sets, the parameter `stratify` will be set to get a representative set of each label.

In [187]:
# Defining target variable and features for machine set
X_mac = machine.drop("machine_label", axis=1)
y_mac = machine["machine_label"]

In [191]:
# Machine set
from sklearn.model_selection import train_test_split
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_mac, y_mac, test_size=0.2, random_state=8, stratify=y_mac)

In [192]:
# Defining target variable and features for machine set
X_hum = human.drop("human_label", axis=1)
y_hum = human["human_label"]

In [193]:
# Machine set
from sklearn.model_selection import train_test_split
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(X_hum, y_hum, test_size=0.2, random_state=8, stratify=y_hum)

## Classification models

### Logistic Regression Classifier

#### Machine set

In [194]:
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(random_state = 8)
lr_clf.fit(X_train_m, y_train_m)
y_pred_m = lr_clf.predict(X_test_m)

In [195]:
from sklearn.metrics import confusion_matrix, accuracy_score
acc =  accuracy_score(y_test_m, y_pred_m)
cm = confusion_matrix(y_test_m, y_pred_m)

print(acc)
print(cm)

0.8846153846153846
[[  0   0   8]
 [  0   0   7]
 [  3   0 138]]


#### Human set

In [198]:
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(random_state = 8)
lr_clf.fit(X_train_h, y_train_h)
y_pred_h = lr_clf.predict(X_test_h)

In [199]:
from sklearn.metrics import confusion_matrix, accuracy_score
acc =  accuracy_score(y_test_h, y_pred_h)
cm = confusion_matrix(y_test_h, y_pred_h)

print(acc)
print(cm)

0.9487179487179487
[[  0   0   3]
 [  0   0   5]
 [  0   0 148]]


### Naive Bayes Classifier

#### Machine set

In [200]:
from sklearn.naive_bayes import MultinomialNB
nb_clf = MultinomialNB()
nb_clf.fit(X_train_m, y_train_m)
y_pred_m = nb_clf.predict(X_test_m)

In [201]:
acc =  accuracy_score(y_test_m, y_pred_m)
cm = confusion_matrix(y_test_m, y_pred_m)

print(acc)
print(cm)

0.8653846153846154
[[  0   1   7]
 [  0   0   7]
 [  2   4 135]]


#### Human set

In [202]:
from sklearn.naive_bayes import MultinomialNB
nb_clf = MultinomialNB()
nb_clf.fit(X_train_h, y_train_h)
y_pred_h = nb_clf.predict(X_test_h)

In [203]:
acc =  accuracy_score(y_test_h, y_pred_h)
cm = confusion_matrix(y_test_h, y_pred_h)

print(acc)
print(cm)

0.9294871794871795
[[  0   0   3]
 [  0   0   5]
 [  0   3 145]]


In [384]:
matrix

Unnamed: 0,aback,abandon,abbey,abil,abit,abl,abound,abroad,absenc,absolut,...,youth,youtub,yr,zero,zip,zone,zoo,zoom,human_label,machine_label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,1
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,3,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
55564,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
55565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
55566,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3


In [404]:
# Creating the pipeline for preprocessing the new reviews
from sklearn.pipeline import Pipeline
model_pipe = Pipeline([("Classifier", LogisticRegression())])


best_model = model_pipe.fit(matrix.drop(["human_label","machine_label"],axis=1), matrix.human_label)

In [212]:
positive_reviews = pd.Series(["Whatever I say, it can't describe this architectural miracle! The only advice that I can give you, is to have plenty of time and hire a guide!",
"It is impossible to describe the greatness of this Cathedral and Gaudi genius!Buy your tickets ahead of time, plan to spend a few hours at this place.",
"Gaudi was an Architectural Legend! Once this Church has been finished it should be an additional Wonder of the World! Every detail on the outside of this building truly well thought out, shame he is not alive to see its completion."])

In [499]:
positive_review = "Gaudi was an Architectural Legend! Once this Church has been finished it should be an additional Wonder of the World! Every detail on the outside of this building truly well thought out, shame he is not alive to see its completion."
negative_review = "Rip off!! The website crashes so they charge you extra to pay at the entrance. They know the website crashes and yet they do nothing to help!!"

In [207]:
negative_reviews = pd.Series(["An impressive building but terribly run. Approached the building, no signs about buying tickets so headed to the queue of people trying to enter, security guard told us tickets round the front. They wouldn’t sell us a €20 euro ticket and these were only available about 5 hours later in the day so had to pay €32 euros each for a ticket with audio guide. The audio guide don’t work properly. Was scanned by security in two locations. I’m not religious but how is it acceptable to charge a fortune to enter the house of god. Don’t visit just admire the view from outside which is much better than the view from inside the perimeter. Watch out for pickpockets, why Spain doesn’t clamp down on this I don’t know. Shameful",
"Rip off!! The website crashes so they charge you extra to pay at the entrance. They know the website crashes and yet they do nothing to help!!"])

In [522]:
preproc_pipe.transform("Hello Mar, you are amazing")

Unnamed: 0,amaz,hello,mar
0,0,1,0
1,0,0,1
2,1,0,0


In [517]:
pd.DataFrame(preproc_pipe.transform(negative_review).sum()).T

Unnamed: 0,charg,crash,entranc,extra,help,know,noth,pay,rip,websit,yet
0,1,2,1,1,1,1,1,1,1,2,1


In [504]:
prep_review

Unnamed: 0,charg,crash,entranc,extra,help,know,noth,pay,rip,websit,yet
0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,0,0
6,0,0,1,0,0,0,0,0,0,0,0
7,0,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,1,0
9,0,1,0,0,0,0,0,0,0,0,0


In [361]:
matrix.shape

(779, 502)

In [519]:
new_review = pd.DataFrame(prep_review.sum(axis = 0)).T

In [397]:
test_matrix = matrix.align(prep_review, join='outer', axis=1, fill_value=0)

In [462]:
matrix

Unnamed: 0,aback,abandon,abbey,abil,abit,abl,abound,abroad,absenc,absolut,...,youth,youtub,yr,zero,zip,zone,zoo,zoom,human_label,machine_label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,1
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,3,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
55564,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
55565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
55566,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3


In [502]:
prep_review

Unnamed: 0,charg,crash,entranc,extra,help,know,noth,pay,rip,websit,yet
0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,0,0
6,0,0,1,0,0,0,0,0,0,0,0
7,0,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,1,0
9,0,1,0,0,0,0,0,0,0,0,0


In [564]:
best_model.predict(cv.transform(pd.Series(["prep_review"])))

array([3])

In [520]:
best_model.predict(new_review)

ValueError: X has 11 features per sample; expecting 4000

In [496]:
pd.DataFrame(cv.transform(["yo whats uasodjaw terrible awful ugly"]).toarray()).sum(axis = 0).sum()

1

In [493]:
pd.DataFrame(cv.transform(["yo whats uasodjaw terrible awful ugly"]).toarray()).sum(axis = 0).sum()

1