In [23]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

from functools import lru_cache
from multiprocessing import Pool

import warnings
warnings.filterwarnings('ignore')

In [24]:
MORPH = SnowballStemmer('english')

In [2]:
train_df = pd.read_csv('../csv/train.csv')

In [3]:
train_df.head()

Unnamed: 0,Review,Rating
0,Dog days is one of most accurate films i've ev...,10
1,In the Hollywood west those trail hands were a...,8
2,After watching the Next Action Star reality TV...,7
3,Considering the lack of art with in African ci...,10
4,mature intelligent and highly charged melodram...,10


In [4]:
test_df = pd.read_csv('../csv/test.csv')

In [5]:
test_df.head()

Unnamed: 0,Review,Rating
0,This is fantastic! Everything from the Score -...,10
1,This movie was amazing!!!! From beginning to e...,10
2,"The first time I've seen this DVD, I was not o...",10
3,One of the flat-out drollest movies of all-tim...,10
4,"When I first got wind of this picture, it was ...",9


In [25]:
@lru_cache(maxsize=100000)
def get_normal_form (i):
    return MORPH.stem(i.lower())

def normalize_text(text):
    normalized = [get_normal_form(word) for word in re.findall(r'[a-zA-Z]{3,}', text)]
    return ' '.join([word for word in normalized if word not in stop_words])

stop_words = set(stopwords.words('english')) 

In [26]:
# normalize words in 'Review' column

with Pool(processes=2) as pool:
    train_df['review_parse'] = pool.map(normalize_text, train_df.Review)
    test_df['review_parse'] = pool.map(normalize_text, test_df.Review)
    pool.terminate()

In [27]:
train_df.head()

Unnamed: 0,Review,Rating,review_parse
0,Dog days is one of most accurate films i've ev...,10,dog day one accur film ever seen describ life ...
1,In the Hollywood west those trail hands were a...,8,hollywood west trail hand rough bunch came tow...
2,After watching the Next Action Star reality TV...,7,watch next action star realiti seri pleas see ...
3,Considering the lack of art with in African ci...,10,consid lack art african cinema black american ...
4,mature intelligent and highly charged melodram...,10,matur intellig high charg melodrama unbelivebl...


In [28]:
test_df.head()

Unnamed: 0,Review,Rating,review_parse
0,This is fantastic! Everything from the Score -...,10,fantast everyth score final credit role movi m...
1,This movie was amazing!!!! From beginning to e...,10,movi amaz begin end movi pack fun laugh music ...
2,"The first time I've seen this DVD, I was not o...",10,first time seen dvd onli happi becaus fact fir...
3,One of the flat-out drollest movies of all-tim...,10,one flat drollest movi time sim rutherford bes...
4,"When I first got wind of this picture, it was ...",9,first got wind pictur call shepherd suppos fil...


In [29]:
train_df[['review_parse', 'Rating']].to_pickle('../data/train.pkl')
test_df[['review_parse', 'Rating']].to_pickle('../data/test.pkl')