# Making necessary imports

In [1]:
import string
import pandas as pd

In [2]:
# NLP library imports
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zigfridzvezdin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zigfridzvezdin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zigfridzvezdin/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Applying the transformation we've seen to our dataset

In [3]:
# Loading dataFrame
df = pd.read_json('session_3_notebook_2.json')

# Preparing transformations for preprocessing function
caracters_to_remove = list(string.punctuation)
transformation_car_dict = {initial:" " for initial in caracters_to_remove}

with_accent = ['é', 'è', 'ê', 'à', 'ù', 'ç', 'ô', 'î']
without_accent = ['e', 'e', 'e', 'a', 'u', 'c', 'o', 'i']
transformation_accent_dict = {before:after for before, after in zip(with_accent, without_accent)}

stopW = stopwords.words('french')
stopW += ['les', 'a', 'tout']


# Preprocessing function to apply to the content column
def preprocessing(review):
  
    # Tokenization
    tokens = word_tokenize(review)
    
    # Deleting words with  only one caracter
    tokens = [token for token in tokens if len(token)>2]
    
    # stopwords + lowercase
    tokens = [token.lower() for token in tokens if token.lower() not in stopW]   
    
    # Removing accents
    tokens = [token.translate(str.maketrans(transformation_accent_dict)) for token in tokens]
    
    # Deleting specific caracters
    tokens = [token.translate(str.maketrans(transformation_car_dict)) for token in tokens]
        
    return tokens
  

# Creating a new column swith tokenized reviews
df['tokens'] = df['content'].apply(preprocessing)

# Displaying part of the result
df.tail()

Unnamed: 0,content,language,name,stars,title,tokens
2016-10-11 07:54:28,"calme, reposant, confortable, dépaysant.",fr,jerome,5,bon séjour,"[calme, reposant, confortable, depaysant]"
2017-09-15 11:17:50,moi je vais parler aujourd'hui du service comm...,fr,sophie duhamel,1,Non professionnel,"[vais, parler, aujourd hui, service, commercia..."
2018-05-12 10:02:58,"pas grand chose ne marche, ni l’internet, ni w...",fr,manuele civico,1,Pas grand chose ne marche !,"[grand, chose, marche, internet, wifi, trop, i..."
2018-08-08 07:52:09,"parc très agréable, difficile de s'y retrouver...",fr,alain,3,Les TROIS FORETS,"[parc, tres, agreable, difficile, s y, retrouv..."
2018-11-22 17:23:07,"nous avons passé un excellent week-end, tout é...",fr,cor boonen,4,Week-end,"[passe, excellent, week end, bien, entretenu]"


# Discovering [Stemming](https://en.wikipedia.org/wiki/Stemming) and [Lemmatisation](https://en.wikipedia.org/wiki/Lemmatisation)


If you want to understand how the [Porter Algorithm](https://fr.wikipedia.org/wiki/Racinisation#Algorithme_de_Porter) works.

In [4]:
#create Stemmer objects
porter = PorterStemmer()
lancaster=LancasterStemmer()

## Visualizing the effects of two different stemmers on basic words

In [5]:
#A list of words to be stemmed
word_list = ["friend", "friendship", "friends", "friendships","stabil","destabilize","misunderstanding","railroad","moonlight","football"]
print("{0:20}{1:20}{2:20}".format("Word","Porter Stemmer","lancaster Stemmer"))
for word in word_list:
    print("{0:20}{1:20}{2:20}".format(word,porter.stem(word),lancaster.stem(word)))

Word                Porter Stemmer      lancaster Stemmer   
friend              friend              friend              
friendship          friendship          friend              
friends             friend              friend              
friendships         friendship          friend              
stabil              stabil              stabl               
destabilize         destabil            dest                
misunderstanding    misunderstand       misunderstand       
railroad            railroad            railroad            
moonlight           moonlight           moonlight           
football            footbal             footbal             


## Effects on a total sentence

In [6]:
def stemSentence(sentence, stemmer):
    
    token_words = word_tokenize(sentence)
    stem_sentence = []
    
    for word in token_words:
        stem_sentence.append(stemmer.stem(word))
        stem_sentence.append(" ")
    
    return "".join(stem_sentence)

In [7]:
# And compare differences
sentence="Pythoners are very intelligent and work very pythonly and now they are pythoning their way to success."

print(stemSentence(sentence, lancaster))
print(stemSentence(sentence, porter))

python ar very intellig and work very python and now they ar python their way to success . 
python are veri intellig and work veri pythonli and now they are python their way to success . 


In [8]:
# Look at what is happening on a french sentence
sentence="Ce matin je suis allé acheter une galette à la boulangerie puis je me suis régalé avant de venir en cours."

print(stemSentence(sentence, lancaster))
print(stemSentence(sentence, porter))

ce matin je sui allé achet un galet à la boulangery pui je me sui régalé av de venir en cour . 
Ce matin je sui allé achet une galett à la boulangeri pui je me sui régalé avant de venir en cour . 


## A stemmer to use on different languages (for example french..)

In [9]:
def frenchStemSentence(sentence):
    frenchStemmer=SnowballStemmer("french", ignore_stopwords=False)
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(frenchStemmer.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

frenchStemSentence("cette phrase est à la fois amusante et surprenante")

'cet phras est à la fois amus et surpren '

## Having a look at lemmatization

In [10]:
# Initiate lemmatizer
lemmatizer = WordNetLemmatizer()

# Create lematizing function
def lemmatize(sentence):
    tokens=word_tokenize(sentence)
    tokens = [lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(token,pos='a'),pos='v'),pos='n') for token in tokens]
    return " ".join(tokens)

# And display results
lemmer = lemmatize("Such an analysis can reveal features that are not easily visible from the variations in the individual genes and can lead to a picture of expression that is more biologically transparent")

In [11]:
lemmer

'Such an analysis can reveal feature that be not easily visible from the variation in the individual gene and can lead to a picture of expression that be more biologically transparent'

# Applying one of those modification to our dataset

 **Preparing both functions**

In [12]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize(tokens):
    tokens = [lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(token,pos='a'),pos='v'),pos='n') for token in tokens]
    return tokens  

# Stemming
frenchStemmer=SnowballStemmer("french")
def stem(tokens):
    tokens = [frenchStemmer.stem(token) for token in tokens]
    return tokens  


**Selecting which one to apply, given the language used in your reviews**

In [13]:
# Are your reviews in English ? (here it is unfortunately not the case)
english = False

**And finally applying it to our dataFrame**

In [14]:
# Making appropriate modification
if english:
    df['inflected'] = df['tokens'].apply(lemmatize)

else:
    df['inflected'] = df['tokens'].apply(stem)

# And displaying results
df.head()

Unnamed: 0,content,language,name,stars,title,tokens,inflected
2014-12-23 18:56:04,expérience moyenne,fr,james james,1,4 jours au bois les francs,"[experience, moyenne]","[experient, moyen]"
2014-12-23 20:05:44,une rivière sauvage annoncée et non présenté,fr,victor miguel,3,Moyen,"[riviere, sauvage, annoncee, non, presente]","[rivier, sauvag, annonce, non, present]"
2014-12-24 09:11:08,très beau domaine mais pêchant par de nombreux...,fr,frederic lefebvre,4,Lac de l'ailette - Centerparcs,"[tres, beau, domaine, pechant, nombreux, petit...","[tre, beau, domain, pech, nombreux, petit, det..."
2014-12-24 10:14:25,"pas de problème pour parcourir le site, une ré...",fr,christian briola,5,un site plutôt pratique et agréable.,"[probleme, parcourir, site, reservation, effec...","[problem, parcour, sit, reserv, effectue, quel..."
2014-12-24 11:47:54,"location premium 6 personnes vue sur le lac, t...",fr,patricia jamet,3,"beau parc, belle vue, mais quelques remarques ...","[location, premium, personnes, vue, lac, trop,...","[locat, premium, person, vu, lac, trop, beau, ..."


# Final modification

In [15]:
# Why not doing the same on title 
df['title'] = df['title'].apply(preprocessing).apply(stem)
df.reset_index(drop = True, inplace = True)

# Finally keeping only necessary columns
del(df['content'])
del(df['tokens'])

df.head(10)

Unnamed: 0,language,name,stars,title,inflected
0,fr,james james,1,"[jour, bois, franc]","[experient, moyen]"
1,fr,victor miguel,3,[moyen],"[rivier, sauvag, annonce, non, present]"
2,fr,frederic lefebvre,4,"[lac, l ailet, centerparc]","[tre, beau, domain, pech, nombreux, petit, det..."
3,fr,christian briola,5,"[sit, plutot, pratiqu, agreabl]","[problem, parcour, sit, reserv, effectue, quel..."
4,fr,patricia jamet,3,"[beau, parc, bel, vu, quelqu, remarqu, ]","[locat, premium, person, vu, lac, trop, beau, ..."
5,fr,duboc,5,"[rien, signal]","[aucun, souc, reserv, pai, prix, interess, res..."
6,fr,donin de rosiere,3,"[cottag, rafraich, urgenc]","[c et, deuxiem, fois, allion, centrerparc, pre..."
7,fr,semeteys,5,"[tre, facil, utilis]","[sit, tre, facil, reserv, c est, fait, tre, ra..."
8,fr,robert weber,4,"[problem, particuli, sauf, cet, odeur, tabac]","[parf, san, cet, desagre, odeur, tabac, vrai, ..."
9,fr,gualicia,4,[prec],"[lor, recherch, bien, clair, sit, reserv, sup,..."
