In [1]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import nltk
from nltk.corpus import stopwords # Import the stop word list
import numpy as np
import re

# Read the train data:

In [None]:
train = pd.read_csv("./Data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
# Here, "header=0" indicates that the first line of the file contains column names,
# "delimiter=\t" indicates that the fields are separated by tabs, and quoting=3 tells Python to ignore doubled quotes,
# otherwise you may encounter errors trying to read the file.

In [None]:
train.head()

In [None]:
print train.shape

In [None]:
train.columns.values

In [None]:
train["review"][0]

# Clean the data using 
- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to deal with HTML stuff.
- re to deal with numbers and punctuations

In [None]:
rev_01 = train["review"][0]
rev_01_pbs = bs(train["review"][0])

In [None]:
print rev_01
print rev_01_pbs.get_text()

## Just keep the letters:

In [None]:
# Use regular expressions to do a find-and-replace
letters_only = re.sub("[^a-zA-Z]",  " ", rev_01_pbs.get_text())
# [] indicates group membership and ^ means "not". In other words,
# the re.sub() statement above says, "Find anything that is NOT a lowercase letter (a-z) or an upper case letter (A-Z),
# and replace it with a space."
print letters_only

##  Convert to lower case and split them into individual words (tokenization):

In [None]:
lower_case = letters_only.lower()
words = lower_case.split()

In [None]:
print words
print len(words)

## Take out the stop words using

- [NLTK](http://www.nltk.org/)

In [None]:
print stopwords.words("english") 

In [None]:
words_stop_clean = [w for w in words if not w in stopwords.words("english")]
print len(words_stop_clean)

## Clean Function:


In [2]:
def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = bs(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))

In [None]:
clean_rev_01 = review_to_words(train["review"][0])
print clean_rev_01

## Clean the entire train-data:

In [None]:
num_reviews = train.size
train["clean review"] = 'empty'
for i in xrange(0, num_reviews):
    train["clean review"][i] = review_to_words(train["review"][i])

In [None]:
train.head(0)

## save the dataFrame with the new clean column:

In [3]:
train.to_csv("./Data/clean-data.csv", sep='\t')

NameError: name 'train' is not defined

## Apply the cleaning to test-data:

In [4]:
test = pd.read_csv("./Data/testData.tsv", header=0, delimiter="\t", quoting=3)

In [5]:
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [9]:
num_reviews = test.size
test["clean review"] = 'empty'
for i in xrange(0, num_reviews):
    test["clean review"][i] = review_to_words(test["review"][i])

KeyError: 25000

In [11]:
test.head(25000)

Unnamed: 0,id,review,clean review
0,"""12311_10""","""Naturally in a film who's main themes are of ...",naturally film main themes mortality nostalgia...
1,"""8348_2""","""This movie is a disaster within a disaster fi...",movie disaster within disaster film full great...
2,"""5828_4""","""All in all, this is a movie for kids. We saw ...",movie kids saw tonight child loved one point k...
3,"""7186_2""","""Afraid of the Dark left me with the impressio...",afraid dark left impression several different ...
4,"""12128_7""","""A very accurate depiction of small time mob l...",accurate depiction small time mob life filmed ...
5,"""2913_8""","""...as valuable as King Tut's tomb! (OK, maybe...",valuable king tut tomb ok maybe valuable worth...
6,"""4396_1""","""This has to be one of the biggest misfires ev...",one biggest misfires ever script nice could en...
7,"""395_2""","""This is one of those movies I watched, and wo...",one movies watched wondered watch find interes...
8,"""10616_1""","""The worst movie i've seen in years (and i've ...",worst movie seen years seen lot movies acting ...
9,"""9074_9""","""Five medical students (Kevin Bacon, David Lab...",five medical students kevin bacon david labrac...


In [12]:
test.to_csv("./Data/clean-test-data.csv", sep='\t')