In [27]:
import pandas as pd
import numpy as np

import re
import datetime as dt
import random
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk.tokenize import word_tokenize
import spacy
import re
import collections
import string
from wordcloud import WordCloud

from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Text preprocessing

In [19]:
df = pd.read_csv(os.path.join('text_processing', 'raw_data', 'expedia_eng_reviews.csv'))
corpus = list(df.loc[:, 'description'])

In [17]:
class PreprocessText:
    
    def __init__(self, raw_text):
        self.text = raw_text
        
    # expand abbreviations
    def decontract_words(self):
        # punctuation mistake 
        phrase = re.sub(r"’", "'", self.text)
        phrase = re.sub(r'\\', "'", self.text)

        # specific
        phrase = re.sub(r"won\'t", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)

        # general
        phrase = re.sub(r"n\'t", " not", phrase)
        phrase = re.sub(r"\'re", " are", phrase)
        phrase = re.sub(r"\'s", " is", phrase)
        phrase = re.sub(r"\'d", " would", phrase)
        phrase = re.sub(r"\'ll", " will", phrase)
        phrase = re.sub(r"\'t", " not", phrase)
        phrase = re.sub(r"\'ve", " have", phrase)
        phrase = re.sub(r"\'m", " am", phrase)
        return phrase
    
    def tokenize_words(self, decontract=True):
        if decontract:
            text = self.decontract_words()
        else:
            text = self.text
            
        tokenized_text = word_tokenize(text)
        return tokenized_text
    
    def preprocess_tokens(self, decontract=True):
        words = self.tokenize_words(decontract)
            
        # create list of punctuation characters
        punctuation = string.punctuation
        customized_punctuation = ['“','”', '...', '', '’']
        for punct in customized_punctuation:
            punctuation += punct
            
        # lower words and remove punctuation
        words = [word.lower().replace('\n', '') for word in words if word not in punctuation]
        words = [word for word in words if len(re.findall(r'\d+', word)) == 0]
        
        return words
    
    def remove_stop_words_tokens(self, keep_stop_words=['most', 'very', 'not'], decontract=True):
        words = self.preprocess_tokens(decontract)
        
        # define stop words
        nltk_stop_words = nltk.corpus.stopwords.words('english')
        for stop_word in keep_stop_words:
            nltk_stop_words.remove(stop_word)
            
        # return text without stop words
        text_without_stop_words = [t for t in words if t not in nltk_stop_words]
        
        return text_without_stop_words
    
    def stemm_tokenized_words(self, keep_stop_words=['most', 'very', 'not'], decontract=True):
        words = self.remove_stop_words_tokens(keep_stop_words, decontract)
        
        # initialize stemmers
        stemmer = PorterStemmer()
        stemmed_words = []
        for word in words:
            stemmed_words.append(stemmer.stem(word))
        
        return stemmed_words

In [20]:
preprocessed_corpus = [PreprocessText(raw_text) for raw_text in corpus]
preprocessed_corpus = [text_item.stemm_tokenized_words() for text_item in preprocessed_corpus]

# TD-IDF

#### What is TD-IDF?

TF-IDF is a statistical measure that evaluates how relevant a word is to a document in a collection of documents. This is done by multiplying two metrics: how many times a word appears in a document, and the inverse document frequency of the word across a set of documents.

It has many uses, most importantly in automated text analysis, and is very useful for scoring words in machine learning algorithms for Natural Language Processing (NLP).

TF-IDF (term frequency-inverse document frequency) was invented for document search and information retrieval. It works by increasing proportionally to the number of times a word appears in a document, but is offset by the number of documents that contain the word. So, words that are common in every document, such as this, what, and if, rank low even though they may appear many times, since they don’t mean much to that document in particular.

However, if the word Bug appears many times in a document, while not appearing many times in others, it probably means that it’s very relevant. For example, if what we’re doing is trying to find out which topics some NPS responses belong to, the word Bug would probably end up being tied to the topic Reliability, since most responses containing that word would be about that topic.

#### How is TD-IDF computed?

TF-IDF for a word in a document is calculated by multiplying two different metrics:

- The term frequency of a word in a document. There are several ways of calculating this frequency, with the simplest being a raw count of instances a word appears in a document. Then, there are ways to adjust the frequency, by length of a document, or by the raw frequency of the most frequent word in a document.
- The inverse document frequency of the word across a set of documents. This means, how common or rare a word is in the entire document set. The closer it is to 0, the more common a word is. This metric can be calculated by taking the total number of documents, dividing it by the number of documents that contain a word, and calculating the logarithm.
So, if the word is very common and appears in many documents, this number will approach 0. Otherwise, it will approach 1.

Multiplying these two numbers results in the TF-IDF score of a word in a document. The higher the score, the more relevant that word is in that particular document.

To put it in more formal mathematical terms, the TF-IDF score for the word t in the document d from the document set D is calculated as follows:

$$tf(t, d) = \frac{f_d(t)}{\max_{w \in d} f_d(w)}$$


$$idf(t, D) = \ln(\frac{|D|}{|\{d \in D: t \in d\}|})$$


$$tfidf(t, d, D) = tf(t, d) * idf(t, D)$$


where $f_d(t)$ is the frequency of term t in document d, and D is the corpus of documents.

#### Why does it work?

Machine learning with natural language is faced with one major hurdle – its algorithms usually deal with numbers, and natural language is, well, text. So we need to transform that text into numbers, otherwise known as text vectorization. It’s a fundamental step in the process of machine learning for analyzing data, and different vectorization algorithms will drastically affect end results, so you need to choose one that will deliver the results you’re hoping for.

Once you’ve transformed words into numbers, in a way that’s machine learning algorithms can understand, the TF-IDF score can be fed to algorithms such as Naive Bayes and Support Vector Machines, greatly improving the results of more basic methods like word counts.

#### Basic word frequency count

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog."]
# create the transform
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_)

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}


In [25]:
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape)
print(type(vector))
print(vector.toarray())

(1, 8)
<class 'scipy.sparse.csr.csr_matrix'>
[[1 1 1 1 1 1 1 2]]


In [26]:
text2 = ["the puppy"]
vector = vectorizer.transform(text2)
print(vector.toarray())

[[0 0 0 0 0 0 0 1]]


#### Word frequency using TDIF-Vectorizer

Below is an example of using the TfidfVectorizer to learn vocabulary and inverse document frequencies across 3 small documents and then encode one of those documents.

In [28]:
# list of text documents
text = ["The quick brown fox jumped over the lazy dog.",
		"The dog.",
		"The fox"]
# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocab
vectorizer.fit(text)

TfidfVectorizer()

In [29]:
# summarize
print(vectorizer.vocabulary_)

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}


In [30]:
print(vectorizer.idf_)

[1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.        ]


In [31]:
# encode document
vector = vectorizer.transform([text[0]])
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

(1, 8)
[[0.36388646 0.27674503 0.27674503 0.36388646 0.36388646 0.36388646
  0.36388646 0.42983441]]


A vocabulary of 8 words is learned from the documents and each word is assigned a unique integer index in the output vector.
The inverse document frequencies are calculated for each word in the vocabulary, assigning the lowest score of 1.0 to the most frequently observed word: “the” at index 7.


Finally, the first document is encoded as an 8-element sparse array and we can review the final scorings of each word with different values for “the“, “fox“, and “dog” from the other words in the vocabulary. The scores are normalized to values between 0 and 1 and the encoded document vectors can then be used directly with most machine learning algorithms.

In [33]:
vectors = vectorizer.transform([text[0], text[1]])
print(vectors.shape)
print(type(vectors))

(2, 8)
<class 'scipy.sparse.csr.csr_matrix'>


#### Application to Expedia data

In [35]:
clean_corpus_before_idf = [' '.join(words) for words in preprocessed_corpus]

In [36]:
vectorizer = TfidfVectorizer()
# tokenize and build vocab
vectorizer.fit(clean_corpus_before_idf)

TfidfVectorizer()

In [37]:
print('Number of words: %.f' % len(vectorizer.vocabulary_))

Number of words: 5835


In [38]:
print(vectorizer.idf_[:10])
print(len(vectorizer.idf_))

[9.34152942 9.34152942 9.34152942 8.08876645 8.93606431 8.24291713
 8.24291713 9.34152942 5.09303418 9.34152942]
5835


In [41]:
# each row corresponds to one review and each column to one word 
corpus_tdif_matrix = vectorizer.transform(clean_corpus_before_idf)
print(corpus_tdif_matrix.toarray().shape)

(8388, 5835)


In practice using TF-IDF-vectors, that have been calculated with the entire corpus (training and test subsets combined), while training the model might introduce some data leakage and hence yield in too optimistic performance measures. This is because the IDF-part of the training set's TF-IDF features will then include information from the test set already.
Calculating them completely separately for the training and test set is not a good idea either, because besides testing the quality of your model then you will be also testing the quality of your IDF-estimation. And because the test set is usually small this will be a poor estimation and will worsen your performance measures.

Therefore the solution would be (analogously to the common mean imputation of missing values) to perform TF-IDF-normalization on the training set seperately and then use the IDF-vector from the training set to calculate the TF-IDF vectors of the test set.