This notebook presents the text-preparation process applied to the review data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import sys
import nltk

In [2]:
train = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv('../data/raw/test.csv')
train.head()

Unnamed: 0,review,sentiment
0,I caught this little gem totally by accident b...,positive
1,I can't believe that I let myself into this mo...,negative
2,*spoiler alert!* it just gets to me the nerve ...,negative
3,If there's one thing I've learnt from watching...,negative
4,"I remember when this was in theaters, reviews ...",negative


We should remove URL addresses and, more importantly, HTML tags.

In [3]:
def clean_url(text):
    """Remove URLs (anything starting with ``http``) from ``text``.

    The match stops at whitespace *or* at ``<``. Reviews frequently have an
    HTML tag glued directly to a URL (``http://site.com<br />``); a plain
    ``\\S+`` would swallow the ``<br`` part and leave an orphan `` />`` that
    the HTML cleaner can no longer recognise as a tag.

    Args:
        text: The raw review string.

    Returns:
        str: ``text`` with every URL replaced by the empty string.
    """
    return re.sub(r'http[^\s<]+', '', text)

def clean_html(text):
    """Replace every HTML tag in ``text`` with a single space.

    A space (rather than the empty string) is substituted so that words
    separated only by markup, e.g. ``great<br /><br />movie``, are not fused
    into one token. Any surplus spaces this introduces are harmless for
    whitespace-based tokenization and can be collapsed afterwards.

    Args:
        text: The review string, possibly containing ``<...>`` tags.

    Returns:
        str: ``text`` with each non-greedy ``<...>`` match turned into ``' '``.
    """
    return re.sub(r'<.*?>', ' ', text)

In [4]:
# Strip URLs first, then HTML tags, from the review text of both splits.
for split_df in (train, test):
    reviews_without_urls = split_df['review'].apply(clean_url)
    split_df['review'] = reviews_without_urls.apply(clean_html)

train.head()

Unnamed: 0,review,sentiment
0,I caught this little gem totally by accident b...,positive
1,I can't believe that I let myself into this mo...,negative
2,*spoiler alert!* it just gets to me the nerve ...,negative
3,If there's one thing I've learnt from watching...,negative
4,"I remember when this was in theaters, reviews ...",negative


Keeping only letters (every other character is replaced with a space)

In [5]:
def clean_non_alphabetic(text):
    """Swap every character outside ASCII ``a-z`` / ``A-Z`` for a space.

    Each replaced character becomes its own space, so the string keeps its
    length and runs of spaces are not collapsed here.
    """
    non_letter = re.compile(r'[^a-zA-Z]')
    return non_letter.sub(' ', text)

# Blank out every non-letter character in the reviews of both splits.
for split_df in (train, test):
    split_df['review'] = split_df['review'].apply(clean_non_alphabetic)

Collapsing repeated whitespace and converting all words to lowercase

In [6]:
def clean_multiple_spaces(text):
    """Collapse each run of whitespace characters into one space.

    Leading and trailing whitespace is reduced to a single space, not
    stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

def to_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# Lower-case the reviews, then squeeze whitespace runs, for both splits.
for split_df in (train, test):
    lowered_reviews = split_df['review'].apply(to_lower)
    split_df['review'] = lowered_reviews.apply(clean_multiple_spaces)

Now for more sophisticated processing: tokenizing, removing stopwords, lemmatizing and stemming, and removing very short words.

In [7]:
from nltk.tokenize import word_tokenize

# Split each cleaned review into a list of word tokens.
# NOTE(review): word_tokenize relies on NLTK tokenizer data; no
# nltk.download call for it is visible in this notebook -- confirm it is
# available on a fresh environment.
for split_df in (train, test):
    split_df['tokens'] = split_df['review'].apply(word_tokenize)

train.head()

Unnamed: 0,review,sentiment,tokens
0,i caught this little gem totally by accident b...,positive,"[i, caught, this, little, gem, totally, by, ac..."
1,i can t believe that i let myself into this mo...,negative,"[i, can, t, believe, that, i, let, myself, int..."
2,spoiler alert it just gets to me the nerve so...,negative,"[spoiler, alert, it, just, gets, to, me, the, ..."
3,if there s one thing i ve learnt from watching...,negative,"[if, there, s, one, thing, i, ve, learnt, from..."
4,i remember when this was in theaters reviews s...,negative,"[i, remember, when, this, was, in, theaters, r..."


In [8]:
from nltk.corpus import stopwords

# Make sure the stop-word corpus is present locally before reading it.
nltk.download('stopwords')

# Keep the English stop words in a set; remove_stopwords tests membership
# against it for every token.
stop_words = {word for word in stopwords.words('english')}


def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    The relative order of the surviving tokens is preserved.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Overwrite the token lists of both splits with their stop-word-free version.
for split_df in (train, test):
    split_df['tokens'] = split_df['tokens'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bstepniewski/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
from nltk.stem import WordNetLemmatizer

# One lemmatizer instance, shared by every lemmatize() call.
# NOTE(review): WordNetLemmatizer needs NLTK's WordNet data; only
# 'stopwords' is downloaded in this notebook -- confirm it is installed.
lemma = WordNetLemmatizer()

def lemmatize(tokens):
    """Lemmatize each token with the shared ``lemma`` object, as a verb.

    Every token is looked up with ``pos='v'``, regardless of its actual part
    of speech. The result is a new list with the same length and order as
    ``tokens``.
    """
    lemmatized = []
    for token in tokens:
        lemmatized.append(lemma.lemmatize(word=token, pos='v'))
    return lemmatized

# Derive a lemmatized token column from the stop-word-free tokens.
for split_df in (train, test):
    split_df['tokens_lemmatized'] = split_df['tokens'].apply(lemmatize)

train.head()

Unnamed: 0,review,sentiment,tokens,tokens_lemmatized
0,i caught this little gem totally by accident b...,positive,"[caught, little, gem, totally, accident, back,...","[catch, little, gem, totally, accident, back, ..."
1,i can t believe that i let myself into this mo...,negative,"[believe, let, movie, accomplish, favor, frien...","[believe, let, movie, accomplish, favor, frien..."
2,spoiler alert it just gets to me the nerve so...,negative,"[spoiler, alert, gets, nerve, people, remake, ...","[spoiler, alert, get, nerve, people, remake, u..."
3,if there s one thing i ve learnt from watching...,negative,"[one, thing, learnt, watching, george, romero,...","[one, thing, learn, watch, george, romero, cre..."
4,i remember when this was in theaters reviews s...,negative,"[remember, theaters, reviews, said, horrible, ...","[remember, theaters, review, say, horrible, we..."


In [10]:
from nltk.stem import PorterStemmer

# One Porter stemmer instance, shared by every stem() call.
stemmer = PorterStemmer()
def stem(tokens):
    """Stem each token with the shared ``stemmer`` object.

    Returns a new list with the same length and order as ``tokens``.
    """
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed


# Derive a stemmed token column from the same stop-word-free tokens.
for split_df in (train, test):
    split_df['tokens_stemmed'] = split_df['tokens'].apply(stem)

train.head()

Unnamed: 0,review,sentiment,tokens,tokens_lemmatized,tokens_stemmed
0,i caught this little gem totally by accident b...,positive,"[caught, little, gem, totally, accident, back,...","[catch, little, gem, totally, accident, back, ...","[caught, littl, gem, total, accid, back, reviv..."
1,i can t believe that i let myself into this mo...,negative,"[believe, let, movie, accomplish, favor, frien...","[believe, let, movie, accomplish, favor, frien...","[believ, let, movi, accomplish, favor, friend,..."
2,spoiler alert it just gets to me the nerve so...,negative,"[spoiler, alert, gets, nerve, people, remake, ...","[spoiler, alert, get, nerve, people, remake, u...","[spoiler, alert, get, nerv, peopl, remak, use,..."
3,if there s one thing i ve learnt from watching...,negative,"[one, thing, learnt, watching, george, romero,...","[one, thing, learn, watch, george, romero, cre...","[one, thing, learnt, watch, georg, romero, cre..."
4,i remember when this was in theaters reviews s...,negative,"[remember, theaters, reviews, said, horrible, ...","[remember, theaters, review, say, horrible, we...","[rememb, theater, review, said, horribl, well,..."


In [11]:
def clean_short_words(tokens, min_length=3):
    """Drop tokens shorter than ``min_length`` characters.

    Args:
        tokens: Iterable of string tokens.
        min_length: Minimum number of characters a token needs in order to be
            kept. The default of 3 reproduces the previously hard-coded rule
            (``len(t) > 2``), so existing single-argument calls behave the
            same.

    Returns:
        list: The surviving tokens, in their original order.
    """
    return [token for token in tokens if len(token) >= min_length]

# Remove very short tokens from both derived token columns, in both splits.
for token_column in ('tokens_lemmatized', 'tokens_stemmed'):
    for split_df in (train, test):
        split_df[token_column] = split_df[token_column].apply(clean_short_words)

Finally, joining the tokens back into strings

In [12]:
def join_tokens(tokens):
    """Concatenate the tokens into one string, separated by single spaces."""
    separator = ' '
    return separator.join(tokens)

# Rebuild plain-text review columns from the lemmatized and stemmed tokens.
text_and_token_columns = (
    ('review_lemmatized', 'tokens_lemmatized'),
    ('review_stemmed', 'tokens_stemmed'),
)
for text_column, token_column in text_and_token_columns:
    for split_df in (train, test):
        split_df[text_column] = split_df[token_column].apply(join_tokens)

train.head()

Unnamed: 0,review,sentiment,tokens,tokens_lemmatized,tokens_stemmed,review_lemmatized,review_stemmed
0,i caught this little gem totally by accident b...,positive,"[caught, little, gem, totally, accident, back,...","[catch, little, gem, totally, accident, back, ...","[caught, littl, gem, total, accid, back, reviv...",catch little gem totally accident back revival...,caught littl gem total accid back reviv theatr...
1,i can t believe that i let myself into this mo...,negative,"[believe, let, movie, accomplish, favor, frien...","[believe, let, movie, accomplish, favor, frien...","[believ, let, movi, accomplish, favor, friend,...",believe let movie accomplish favor friends ask...,believ let movi accomplish favor friend ask ea...
2,spoiler alert it just gets to me the nerve so...,negative,"[spoiler, alert, gets, nerve, people, remake, ...","[spoiler, alert, get, nerve, people, remake, u...","[spoiler, alert, get, nerv, peopl, remak, use,...",spoiler alert get nerve people remake use term...,spoiler alert get nerv peopl remak use term lo...
3,if there s one thing i ve learnt from watching...,negative,"[one, thing, learnt, watching, george, romero,...","[one, thing, learn, watch, george, romero, cre...","[one, thing, learnt, watch, georg, romero, cre...",one thing learn watch george romero creepshow ...,one thing learnt watch georg romero creepshow ...
4,i remember when this was in theaters reviews s...,negative,"[remember, theaters, reviews, said, horrible, ...","[remember, theaters, review, say, horrible, we...","[rememb, theater, review, said, horribl, well,...",remember theaters review say horrible well thi...,rememb theater review said horribl well think ...
