In [1]:
import re
import nltk

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import pandas as pd
import numpy as np

In [2]:
df_temp = pd.read_csv('training.1600000.processed.noemoticon.csv', index_col= None, encoding = "ISO-8859-1", 
                      names=['value','views'], usecols=[5,0])

In [3]:
print(df_temp.head())

   value                                              views
0      0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1      0  is upset that he can't update his Facebook by ...
2      0  @Kenichan I dived many times for the ball. Man...
3      0    my whole body feels itchy and like its on fire 
4      0  @nationwideclass no, it's not behaving at all....


In [4]:
from sklearn.utils import shuffle
df_temp = shuffle(df_temp)

In [5]:
print(df_temp[10:21])

         value                                              views
924636       4  @SEOAly You're welcome, &amp; I hope you're fe...
1190240      4  Just made fluffy gluten free pancakes with str...
12879        0  @LiziBeeSays I was downtown ALL day yesterday!...
1302102      4  @Dannymcfly dannnnny jones  fancy replying? he...
72921        0  Why using lucene when iterating is faster? I l...
1306774      4              @yayKIMO you are very welcome. &lt;3 
1400018      4  @MIMI_loves_YOU yay  lol i got swifty 2 say he...
1271792      4  @jennluvs2sing ok, at least i made u smile.  n...
1271922      4  @KimSherrell Oh, I just said that I was  looki...
1461855      4  is impressed with the current Friendfeed inter...
1292714      4  boys boys boys.we like boys in cars.buy us dri...


In [6]:
print(df_temp.shape[0])

1600000


In [7]:
temp_X = df_temp["views"]
temp_y = df_temp['value']
print(temp_X.shape[0])

from sklearn.model_selection import train_test_split

X_train, X_cv, y_train, y_cv = train_test_split(temp_X, temp_y, test_size=0.3)
print(X_train.shape[0])
print(X_cv.shape[0])
print(X_train.head())
print(y_train.head())

1600000
1120000
480000
1236210    Can't stop watching the trailer!! http://bit.l...
1329629    @TheReal_Q He's in Riyadh for God's sake  you ...
489002          I'm back! My iPhone had no service up there 
1233608    @TracieHoward I'm down with you then!   Left h...
1574536    Whoo-hoo I just joined today &amp; i'm so happy! 
Name: views, dtype: object
1236210    4
1329629    4
489002     0
1233608    4
1574536    4
Name: value, dtype: int64


In [8]:
def review_to_word_list(review, remove_stopwords=False, use_stemmer=False, use_lemmatizer=False):
    # 1. Remove HTML. First, we'll remove the HTML tags. For this purpose, 
    # we'll use the Beautiful Soup
    review_text = BeautifulSoup(review,"lxml").get_text()
    
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
        
    # 5. Optionally use Porter Stemmer
    if use_stemmer:
        porter_stemmer = PorterStemmer()
        words = [porter_stemmer.stem(w) for w in words]
    
    # 6. Optionally use Lemmatizer
    if use_lemmatizer:
        wordnet_lemmatizer = WordNetLemmatizer()
        words = [wordnet_lemmatizer.lemmatize(w) for w in words]
    #
    # 7. Return a list of words
    return words

In [9]:
def review_to_sentences(review, tokenizer, remove_stopwords=False, use_stemmer=False, use_lemmatizer=False):
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())

    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(review_to_word_list(raw_sentence,remove_stopwords,use_stemmer, use_lemmatizer))

    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [10]:
# Get the number of reviews based on the dataframe column size
num_reviews  = X_train.size
# print(num_reviews)

# Initialize an empty list to hold the clean reviews
clean_train_reviews = []
X_train_val = X_train.values
# print(X_train_val[0])


print ("Cleaning and parsing the training set movie reviews...\n")
for i in range(0, num_reviews):
    # If the index is evenly divisible by 5000, print a message
    if ((i + 1) % 112000 == 0):
        print ("Review %d of %d\n" % (i + 1, num_reviews))
    clean_train_reviews.append(review_to_word_list(X_train_val[i], True, True, True))

print ("Complete cleaning review")

Cleaning and parsing the training set movie reviews...

Review 112000 of 1120000

Review 224000 of 1120000

Review 336000 of 1120000

Review 448000 of 1120000

Review 560000 of 1120000

Review 672000 of 1120000

Review 784000 of 1120000

Review 896000 of 1120000

Review 1008000 of 1120000

Review 1120000 of 1120000

Complete cleaning review


In [11]:
# Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer="word", tokenizer=lambda doc: doc, lowercase=False, preprocessor=None, 
                             stop_words=None, max_features=700)
train_data_features = vectorizer.fit_transform(clean_train_reviews)

print('Complete initialization of Bag of Words')

Complete initialization of Bag of Words


In [12]:
# Numpy arrays are easy to work with, so convert the result to an array
train_data_features = train_data_features.toarray()
print(train_data_features.shape)

(1120000, 700)


In [13]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
# print(vocab)

In [26]:
# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training setfor tag, count in zip(vocab, dist):
    # print (count, tag)

In [13]:
# print(y_train)

print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100, n_jobs = 6) # number of cores

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit(train_data_features, y_train.values)

print('Training complete.')

Training the random forest...
Training complete.


In [14]:
# Create an empty list and append the clean reviews one by one
num_reviews = X_cv.size
# print(X_cv)

clean_crossval_reviews = [] 
X_cv_val = X_cv.values

print ("Cleaning and parsing the test set movie reviews...\n")
for i in range(0,num_reviews):
    if( (i+1) % 80000 == 0 ):
        print ("Review %d of %d\n" % (i+1, num_reviews))
    clean_crossval_reviews.append(review_to_word_list(X_cv_val[i], True, True, True))
print("completing cross validation review")

Cleaning and parsing the test set movie reviews...

Review 80000 of 480000

Review 160000 of 480000

Review 240000 of 480000

Review 320000 of 480000

Review 400000 of 480000

Review 480000 of 480000

completing cross validation review


In [15]:
# Get a bag of words for the test set, and convert to a numpy array
cv_data_features = vectorizer.transform(clean_crossval_reviews)
cv_data_features = cv_data_features.toarray()
print('Conversion to numpy array completed.')

Conversion to numpy array completed.


In [24]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(forest,cv_data_features, y_cv.values)

print(scores)

[ 0.71786426  0.7208      0.719592  ]
