In [101]:
import nltk
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [102]:
train = pd.read_csv('data/labeledTrainData.tsv', sep='\t')
test = pd.read_csv('data/review.csv')

In [103]:
#Cleaning and parsing the data
def clean_data(review):
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    stops = set(stopwords.words("english"))
    words = [w for w in review_text.lower().split() if not w in stops]
    return (words)

In [104]:
# Create the bag of words
def get_features(review_data):
    cleaned_data = []

    for i in range(0, len(review_data)):
        cleaned_data.append(" ".join(clean_data(review_data[i])))
        
    vectorizer = CountVectorizer(analyzer="word", max_features=2800)
    features = vectorizer.fit_transform(cleaned_data)
    features = features.toarray()
    return features

In [105]:
X_train, X_test, y_train, y_test = train_test_split(train['review'], train['sentiment'], test_size=0.3)

In [106]:
# Feature extraction
train_data_features = get_features(train['review'])

In [91]:
model = RandomForestClassifier(n_estimators=100)
model.fit(train_data_features, train['sentiment'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [93]:
import pickle

#Dump the classifier
classifier_dump = open('dump/classifier1.pickle', 'wb')
new_classifier = pickle.dump(model, classifier_dump)
classifier_dump.close()

#Load the classifier
classifier_f = open('dump/classifier1.pickle', 'rb')
classifier = pickle.load(classifier_f)
classifier_f.close()

In [64]:
new_data_features = get_features(test['review'])
predict = classifier.predict(new_data_features)
pred_df = pd.DataFrame(data = {'review': test['review'], 'prediction': predict})
pred_df.to_csv('data/result.csv', index=False)