In [163]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

dataset = pd.read_csv('data/IMDB_Dataset.csv')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\24746\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\24746\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\24746\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<h2>Step 1: Text Cleaning</h2>
<h5>Remove special chars, numbers, and extra spaces</h5>

In [164]:
# remove head and tail space and lowercasing
dataset['review'] = dataset['review'].fillna('').str.lower().str.strip()

In [165]:
# remove emoticons and symbols
emoticon_pattern = re.compile("["  
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002500-\U00002BEF"  # Chinese characters
    u"\U00002702-\U000027B0"
    "]+", flags=re.UNICODE)

dataset['review'] = dataset['review'].str.replace(emoticon_pattern, '', regex=True)

In [166]:
# remove numbers and special chars
dataset['review'] = dataset['review'].str.replace(r'[^a-z\s]', '', regex=True)

In [167]:
# tokenization
dataset['sentences'] = dataset['review'].map(sent_tokenize)
dataset['words'] = dataset['review'].map(word_tokenize)

In [168]:
# remove stop words
stop_words = set(stopwords.words('english'))
dataset['words'] = dataset['words'].map(lambda words: [word for word in words if word not in stop_words])

In [169]:
# apply stemmer
stemmer = PorterStemmer()
dataset['stemmed_words'] = dataset['words'].map(lambda words: [stemmer.stem(word) for word in words])

In [170]:
# apply lemmatizer
lemmatizer = WordNetLemmatizer()
dataset['lemmatized_words'] = dataset['stemmed_words'].map(lambda words: [lemmatizer.lemmatize(word) for word in words])

In [171]:
dataset.to_csv('data/processed_dataset.csv', index=False)
print(dataset.head())

                                              review sentiment  \
0  one of the other reviewers has mentioned that ...  positive   
1  a wonderful little production br br the filmin...  positive   
2  i thought this was a wonderful way to spend ti...  positive   
3  basically theres a family where a little boy j...  negative   
4  petter matteis love in the time of money is a ...  positive   

                                           sentences  \
0  [one of the other reviewers has mentioned that...   
1  [a wonderful little production br br the filmi...   
2  [i thought this was a wonderful way to spend t...   
3  [basically theres a family where a little boy ...   
4  [petter matteis love in the time of money is a...   

                                               words  \
0  [one, reviewers, mentioned, watching, oz, epis...   
1  [wonderful, little, production, br, br, filmin...   
2  [thought, wonderful, way, spend, time, hot, su...   
3  [basically, theres, family, little, boy

In [None]:
from readability import Readability

def calculate_readability(text):
    try:
        r = Readability(text)
        flesch = r.flesch().score
        fk_grade = r.flesch_kincaid().grade_level 
        return flesch, fk_grade
    except Exception as e:
        return None, None

dataset[['c', 'fk_grade_level']] = dataset['review'].apply(
    lambda x: pd.Series(calculate_readability(x))
)

dataset.to_csv('data/processed_with_readability.csv', index=False)

print(dataset.head())

                                              review sentiment  \
0  one of the other reviewers has mentioned that ...  positive   
1  a wonderful little production br br the filmin...  positive   
2  i thought this was a wonderful way to spend ti...  positive   
3  basically theres a family where a little boy j...  negative   
4  petter matteis love in the time of money is a ...  positive   

                                           sentences  \
0  [one of the other reviewers has mentioned that...   
1  [a wonderful little production br br the filmi...   
2  [i thought this was a wonderful way to spend t...   
3  [basically theres a family where a little boy ...   
4  [petter matteis love in the time of money is a...   

                                               words  \
0  [one, reviewers, mentioned, watching, oz, epis...   
1  [wonderful, little, production, br, br, filmin...   
2  [thought, wonderful, way, spend, time, hot, su...   
3  [basically, theres, family, little, boy

In [174]:
print(dataset['fk_grade_level'])

0        118
1         65
2         64
3         51
4         89
        ... 
49995     73
49996     45
49997     89
49998     82
49999     49
Name: fk_grade_level, Length: 50000, dtype: object


In [176]:
print(dataset['flesch_score'])

0       -225.955000
1        -94.417222
2        -84.720000
3        -46.472782
4       -152.411522
            ...    
49995   -101.285625
49996    -33.745000
49997   -151.576419
49998   -134.029526
49999    -38.555000
Name: flesch_score, Length: 50000, dtype: float64
