# Making necessary imports

In [1]:
import string
import pandas as pd

In [2]:
# NLP library imports
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Fetch the NLTK data packages requested by this notebook, in the same order.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed NLTK version.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zigfridzvezdin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zigfridzvezdin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zigfridzvezdin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Discovering [Stemming](https://en.wikipedia.org/wiki/Stemming) and [Lemmatisation](https://en.wikipedia.org/wiki/Lemmatisation)


To understand how it works, see the [Porter algorithm](https://fr.wikipedia.org/wiki/Racinisation#Algorithme_de_Porter) (article in French).

In [3]:
# Build one instance of each stemming algorithm compared in the cells below.
porter, lancaster = PorterStemmer(), LancasterStemmer()

## Visualizing the effects of two different stemmers on basic words

In [4]:
# Words to run through both stemmers.
word_list = [
    "friend", "friendship", "friends", "friendships", "stabil",
    "destabilize", "misunderstanding", "railroad", "moonlight", "football",
]

# Three columns, each padded to a width of 20 characters.
row_template = "{0:20}{1:20}{2:20}"

# Header row, then one row per word with the output of each stemmer.
print(row_template.format("Word", "Porter Stemmer", "lancaster Stemmer"))
for word in word_list:
    print(row_template.format(word, porter.stem(word), lancaster.stem(word)))

Word                Porter Stemmer      lancaster Stemmer   
friend              friend              friend              
friendship          friendship          friend              
friends             friend              friend              
friendships         friendship          friend              
stabil              stabil              stabl               
destabilize         destabil            dest                
misunderstanding    misunderstand       misunderstand       
railroad            railroad            railroad            
moonlight           moonlight           moonlight           
football            footbal             footbal             


## Effects on a whole sentence

In [5]:
def stemSentence(sentence, stemmer):
    """Tokenize ``sentence`` and return its stemmed tokens as a single string.

    Args:
        sentence: Text passed to ``word_tokenize``.
        stemmer: Object whose ``stem(word)`` method is applied to each token.

    Returns:
        The stemmed tokens in their original order, each one followed by a
        single space (so a non-empty result ends with a trailing space).
    """
    stemmed_words = (stemmer.stem(token) for token in word_tokenize(sentence))
    # Every stem is followed by a space, including the last one.
    return "".join(stemmed + " " for stemmed in stemmed_words)

In [6]:
# Run the same English sentence through both stemmers to compare their output.
sentence = "Pythoners are very intelligent and work very pythonly and now they are pythoning their way to success."

# Lancaster first, then Porter.
for stemmer in (lancaster, porter):
    print(stemSentence(sentence, stemmer))

python ar very intellig and work very python and now they ar python their way to success . 
python are veri intellig and work veri pythonli and now they are python their way to success . 


In [7]:
# Same comparison on a French sentence, to see how these two stemmers handle it.
sentence = "Ce matin je suis allé acheter une galette à la boulangerie puis je me suis régalé avant de venir en cours."

# Lancaster first, then Porter.
for stemmer in (lancaster, porter):
    print(stemSentence(sentence, stemmer))

ce matin je sui allé achet un galet à la boulangery pui je me sui régalé av de venir en cour . 
Ce matin je sui allé achet une galett à la boulangeri pui je me sui régalé avant de venir en cour . 


## A stemmer that supports other languages (for example, French)

In [8]:
def frenchStemSentence(sentence):
    """Stem every token of ``sentence`` with NLTK's French Snowball stemmer.

    The tokenize/stem/join logic is delegated to ``stemSentence`` (defined
    earlier in this notebook) instead of being duplicated here, so both
    helpers always format their result the same way.

    Args:
        sentence: Text to tokenize and stem.

    Returns:
        Whatever ``stemSentence`` returns for ``sentence`` and a French
        ``SnowballStemmer``: the stemmed tokens joined into one string.
    """
    frenchStemmer = SnowballStemmer("french", ignore_stopwords=False)
    return stemSentence(sentence, frenchStemmer)

# Try the French stemmer on a short sample sentence; as the cell's last
# expression, the returned string is displayed.
frenchStemSentence("cette phrase est à la fois amusante et surprenante")

'cet phras est à la fois amus et surpren '

In [9]:
# Reuses `sentence`, which was last assigned the French example in the
# Porter/Lancaster comparison cell above (relies on cells being run in order).
frenchStemSentence(sentence)

'ce matin je suis allé achet une galet à la boulanger puis je me suis régal avant de ven en cour . '

## Having a look at lemmatization

In [10]:
# Instantiate the WordNet lemmatizer; the `lemmatize` function defined below
# reads this module-level name.
lemmatizer = WordNetLemmatizer()

# Create the lemmatizing function
def lemmatize(sentence):
    """Tokenize ``sentence`` and return its lemmatized tokens joined by spaces.

    Each token goes through three successive ``lemmatizer.lemmatize`` passes,
    with ``pos='a'``, then ``pos='v'``, then ``pos='n'``; every pass works on
    the result of the previous one.

    Note: a later cell of this notebook defines another ``lemmatize`` that
    takes a list of tokens instead of a sentence and replaces this one.

    Args:
        sentence: Text passed to ``word_tokenize``.

    Returns:
        The lemmatized tokens, in order, separated by single spaces.
    """
    lemmas = []
    for token in word_tokenize(sentence):
        for pos_tag in ('a', 'v', 'n'):
            token = lemmatizer.lemmatize(token, pos=pos_tag)
        lemmas.append(token)
    return " ".join(lemmas)

# Lemmatize a sample sentence and display the result
lemmer = lemmatize("Such an analysis can reveal features that are not easily visible from the variations in the individual genes and can lead to a picture of expression that is more biologically transparent")
# Bare last expression: without it the cell only assigns and shows nothing.
lemmer

# Applying one of those modifications to our dataset

 **Preparing both functions**

In [11]:
# Lemmatization
# WordNet lemmatizer instance read by the token-list `lemmatize` defined right below.
lemmatizer = WordNetLemmatizer()
def lemmatize(tokens):
    """Lemmatize an iterable of already-tokenized words.

    Each token goes through three successive ``lemmatizer.lemmatize`` passes,
    with ``pos='a'``, then ``pos='v'``, then ``pos='n'``; every pass works on
    the result of the previous one.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A new list with one lemmatized token per input token, in order.
    """
    lemmas = []
    for token in tokens:
        for pos_tag in ('a', 'v', 'n'):
            token = lemmatizer.lemmatize(token, pos=pos_tag)
        lemmas.append(token)
    return lemmas

# Stemming
# French Snowball stemmer instance read by the `stem` function defined right below.
frenchStemmer=SnowballStemmer("french")
def stem(tokens):
    """Stem an iterable of already-tokenized words with ``frenchStemmer``.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A new list with one stemmed token per input token, in order.
    """
    return list(map(frenchStemmer.stem, tokens))


In [12]:
# Load the cleaned reviews; the path is relative to the current working directory.
# NOTE(review): the df.head() output further down shows an "Unnamed: 0" column,
# which usually means the CSV was saved together with its index. Consider
# index_col=0 here, but first confirm those values are unique row labels.
df = pd.read_csv('AllReviewsClean.csv')

**And finally applying them to our DataFrame**

In [13]:
# Boolean row selectors, one per language handled here.
english_rows = df['language'] == 'en'
french_rows = df['language'] == 'fr'

# English reviews: tokenize, then lemmatize.
df.loc[english_rows, 'tokens'] = df.loc[english_rows, 'review'].apply(
    lambda text: lemmatize(word_tokenize(text))
)
# French reviews: tokenize, then stem.
df.loc[french_rows, 'tokens'] = df.loc[french_rows, 'review'].apply(
    lambda text: stem(word_tokenize(text))
)

# Preview the first rows, including the new `tokens` column.
df.head()

Unnamed: 0,date,review,rating,title,language,trip_date,hotel,source,tokens
0,1549152000000000000,enjoyed wonderful stay longleat centerparcs ti...,5,Sub-Tropical Swimming Paradise,en,1546300800000000000,Longleat Forest,TripAdvisor,"[enjoy, wonderful, stay, longleat, centerparcs..."
1,1549238400000000000,long weekend break booked 8 friday 1st feb giv...,1,Appalling customer service,en,1548979200000000000,Longleat Forest,TripAdvisor,"[long, weekend, break, book, 8, friday, 1st, f..."
2,1549238400000000000,stayed fri fir area 45 minute walk car park go...,5,Snow,en,1546300800000000000,Longleat Forest,TripAdvisor,"[stay, fri, fir, area, 45, minute, walk, car, ..."
3,1549238400000000000,center parcs longleat numerous times always en...,1,Disaster in management,en,1548979200000000000,Longleat Forest,TripAdvisor,"[center, parcs, longleat, numerous, time, alwa..."
4,1549238400000000000,like many people would seem break weekend happ...,1,Almost visit,en,1548979200000000000,Longleat Forest,TripAdvisor,"[like, many, people, would, seem, break, weeke..."


In [14]:
# Write the frame, now including the `tokens` column, to a CSV at a relative path.
# NOTE(review): index=True writes the index as an extra leading column on top of
# the existing "Unnamed: 0" column, and the list-valued `tokens` cells are
# presumably stored as their string form — confirm downstream readers expect both.
df.to_csv("AllReviewsCleanTokens.csv", index=True)