In [1]:
import re
from nltk.corpus import stopwords
import nltk.data
import logging
import numpy as np
from bs4 import BeautifulSoup
from gensim.models import word2vec
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

In [7]:
_ENGLISH_STOPWORDS = None  # filled lazily by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review2words(raw_review):
    """Turn one raw (HTML) review into a cleaned, space-joined string of words.

    Steps: strip markup with BeautifulSoup (lxml parser), replace every
    character that is not an ASCII letter with a space, lower-case, split on
    whitespace, and drop English stop words.

    Args:
        raw_review: The review text, possibly containing HTML tags.

    Returns:
        str: The surviving lower-case words joined by single spaces
        (an empty string when nothing survives).
    """
    review_text = BeautifulSoup(raw_review, "lxml").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # The stop-word set is cached: this function runs once per review, and
    # rebuilding the set from the corpus on every call is wasted work.
    stops = _english_stopwords()
    return " ".join(w for w in letters_only.lower().split() if w not in stops)

def review2sentences(review, tokenizer):
    """Split a review into sentences and clean each one with ``review2words``.

    Args:
        review: Raw review text; surrounding whitespace is stripped first.
        tokenizer: Object whose ``tokenize(text)`` returns the sentences of
            ``text`` (presumably an NLTK punkt tokenizer - confirm against
            the caller).

    Returns:
        list[str]: One cleaned, space-joined string per non-empty sentence.
    """
    # Fixed: the original called `review_to_words`, which the visible cells
    # never define (NameError at call time); the helper is `review2words`.
    return [
        review2words(raw_sentence)
        for raw_sentence in tokenizer.tokenize(review.strip())
        if len(raw_sentence) > 0
    ]

In [19]:
# The files are tab-separated with a header row. quoting=3 is csv.QUOTE_NONE,
# so quote characters inside the review text are read as literal characters.
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv('testData.tsv', header=0, delimiter="\t", quoting=3)
# Re-enabled: a later cell reads `unlabeled_train`, so with this load commented
# out a fresh-kernel "Run All" stops with a NameError.
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [None]:
# Clean every labelled training review, preserving row order.
clean_train_reviews = [
    review2words(train["review"][row])
    for row in range(train["review"].size)
]
# Same cleaning for the test reviews.
clean_test_reviews = [
    review2words(test["review"][row])
    for row in range(len(test["review"]))
]

In [24]:
# Clean the unlabelled reviews with the same helper used for train/test.
# Expects `unlabeled_train` (unlabeledTrainData.tsv) to have been loaded.
# Fixed: the original called `review_to_words`, which the visible cells never
# define (NameError); the cleaning helper is `review2words`.
unlabeled_clean_train_reviews = [
    review2words(unlabeled_train["review"][i])
    for i in range(unlabeled_train["review"].size)
]

In [28]:
# TF-IDF over unigrams and bigrams. The vocabulary and IDF weights are learned
# from the labelled plus unlabelled training reviews; the test reviews are only
# transformed, never fitted on.
# (A commented-out earlier variant - TF-IDF fitted on train+test reviews with a
# LogisticRegression scored by 20-fold ROC-AUC cross-validation - used to live
# in this cell; it was dead code and has been dropped.)
tfv = TfidfVectorizer(
    min_df=2,
    max_df=0.95,
    max_features=200000,
    ngram_range=(1, 2),
    sublinear_tf=True,
)
vectorizer = tfv.fit(clean_train_reviews + unlabeled_clean_train_reviews)
train_data_features, test_data_features = (
    vectorizer.transform(reviews)
    for reviews in (clean_train_reviews, clean_test_reviews)
)
# Created here but not fitted in this cell.
model1 = MultinomialNB(alpha=0.0005)

In [32]:
# Import from the public package: the private
# `sklearn.feature_selection.univariate_selection` module path is not available
# in newer scikit-learn releases, while `sklearn.feature_selection` exports the
# same names.
from sklearn.feature_selection import SelectKBest, chi2, f_classif

# Keep the 70000 features with the highest chi-squared score against the
# sentiment labels. The selector is fitted on the training matrix only and the
# same column selection is then applied to the test matrix.
fselect = SelectKBest(chi2, k=70000)
train_data_features = fselect.fit_transform(train_data_features, train["sentiment"])
test_data_features = fselect.transform(test_data_features)
# Created here but not fitted in this cell.
model1 = MultinomialNB(alpha=0.0005)


In [35]:
from sklearn.linear_model import SGDClassifier

# Linear model trained with SGD on the selected features; the modified Huber
# loss is used so that predict_proba is available.
model2 = SGDClassifier(
    loss='modified_huber',
    n_iter=5,
    random_state=0,
    shuffle=True,
).fit(train_data_features, train["sentiment"])

# Column 1 of predict_proba is the probability of the second class in
# model2.classes_ (presumably sentiment == 1 - confirm the label encoding).
result = model2.predict_proba(test_data_features)[:, 1]

# Write the submission file; quoting=3 (csv.QUOTE_NONE) emits no quote characters.
output = pd.DataFrame({"id": test["id"], "sentiment": result})
output.to_csv('Bag_of_Words_model.csv', quoting=3, index=False)
print(len(result), len(output))

25000 25000


In [9]:
# Bag-of-words baseline: raw term counts, vocabulary capped at 5000 terms.
# Note that this rebinds `vectorizer`, `train_data_features`,
# `test_data_features` and `result`.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)
# The sparse count matrices are converted to dense arrays before being handed
# to the forest.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# random_state added: the forest was the only unseeded model next to the seeded
# SGD and Word2Vec models, so its predictions changed from run to run.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest = forest.fit(train_data_features, train["sentiment"])
result = forest.predict(test_data_features)

In [15]:
# Pair each test id with its prediction and write the submission file.
# NOTE: the SGD cell writes to this same path, so whichever cell runs last
# determines the file's contents.
output = pd.DataFrame({"id": test["id"], "sentiment": result})
output.to_csv('Bag_of_Words_model.csv', quoting=3, index=False)

In [None]:
# Send INFO-level log records, with timestamps, to the root logger's handler.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# NOTE(review): `sentences`, `getCleanReviews` and `getAvgFeatureVecs` are not
# defined in the visible cells - confirm they exist before running this cell.
model = word2vec.Word2Vec(sentences, min_count=40, size=300, workers=4, window=10, seed=1, sample=1e-3)
model.init_sims(replace=True)
# Bare expression in the middle of a cell: evaluated, but not displayed.
model.syn0.shape

# 300 matches the `size` given to Word2Vec above.
trainDataVecs = getAvgFeatureVecs(getCleanReviews(train), model, 300)
testDataVecs = getAvgFeatureVecs(getCleanReviews(test), model, 300)

# random_state added: the forest was unseeded while Word2Vec (seed=1) and the
# SGD model (random_state=0) are seeded, so predictions varied between runs.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest = forest.fit(trainDataVecs, train["sentiment"])
result = forest.predict(testDataVecs)
# The frame is only built here; this cell does not write it to disk.
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})