In [4]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

from functools import lru_cache
from multiprocessing import Pool

import warnings
warnings.filterwarnings('ignore')

In [5]:
MORPH = WordNetLemmatizer()

In [6]:
train_df = pd.read_csv('../csv/train.csv')

In [7]:
train_df.head()

Unnamed: 0,Review,Rating
0,Dog days is one of most accurate films i've ev...,10
1,In the Hollywood west those trail hands were a...,8
2,After watching the Next Action Star reality TV...,7
3,Considering the lack of art with in African ci...,10
4,mature intelligent and highly charged melodram...,10


In [8]:
test_df = pd.read_csv('../csv/test.csv')

In [9]:
test_df.head()

Unnamed: 0,Review,Rating
0,This is fantastic! Everything from the Score -...,10
1,This movie was amazing!!!! From beginning to e...,10
2,"The first time I've seen this DVD, I was not o...",10
3,One of the flat-out drollest movies of all-tim...,10
4,"When I first got wind of this picture, it was ...",9


In [10]:
@lru_cache(maxsize=100000)
def get_normal_form (i):
    return MORPH.lemmatize(i.lower())

def normalize_text(text):
    del_stopwords = [word for word in re.findall(r'[a-zA-Z]{3,}', text) if word not in stop_words]
    normalized = [get_normal_form(word) for word in del_stopwords]
    return ' '.join(normalized)

stop_words = set(stopwords.words('english')) 

In [11]:
# normalize words in 'Review' column

with Pool(processes=2) as pool:
    train_df['review_parse'] = pool.map(normalize_text, train_df.Review)
    test_df['review_parse'] = pool.map(normalize_text, test_df.Review)
    pool.terminate()

In [12]:
train_df.head()

Unnamed: 0,Review,Rating,review_parse
0,Dog days is one of most accurate films i've ev...,10,dog day one accurate film ever seen describing...
1,In the Hollywood west those trail hands were a...,8,hollywood west trail hand rough bunch came tow...
2,After watching the Next Action Star reality TV...,7,after watching next action star reality series...
3,Considering the lack of art with in African ci...,10,considering lack art african cinema black amer...
4,mature intelligent and highly charged melodram...,10,mature intelligent highly charged melodrama un...


In [13]:
test_df.head()

Unnamed: 0,Review,Rating,review_parse
0,This is fantastic! Everything from the Score -...,10,this fantastic everything score final credit r...
1,This movie was amazing!!!! From beginning to e...,10,this movie amazing from beginning end movie pa...
2,"The first time I've seen this DVD, I was not o...",10,the first time seen dvd happy fact first time ...
3,One of the flat-out drollest movies of all-tim...,10,one flat drollest movie time sim rutherford be...
4,"When I first got wind of this picture, it was ...",9,when first got wind picture called shepherd su...


In [14]:
train_df[['review_parse', 'Rating']].to_pickle('../data/train.pkl')
test_df[['review_parse', 'Rating']].to_pickle('../data/test.pkl')