In [12]:
import bs4
import numpy as np
import pandas as pd
import re

In [13]:
# Both inputs are tab-separated files whose first line is the header row.
# quoting=2 is the numeric value of csv.QUOTE_NONNUMERIC.
train = pd.read_csv("data/labeledTrainData.tsv", sep="\t", header=0, quoting=2)
test = pd.read_csv("data/testData.tsv", sep="\t", header=0, quoting=2)

In [14]:
# Preview the first rows of the training frame (columns: id, sentiment, review).
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [15]:
# Snapshot the frames exactly as loaded (reviews still contain HTML markup),
# one JSON object per row.
train.to_json(path_or_buf='data/imdbHTMLReviewsTrainData.json', orient='records')
test.to_json(path_or_buf='data/imdbHTMLReviewsTestData.json', orient='records')

In [16]:
def clean_html_tags(text):
    """Return ``text`` with all HTML markup removed.

    The markup is parsed with BeautifulSoup's ``lxml`` parser and only the
    text nodes are kept.  Text nodes are stripped and joined with a single
    space, so words that were separated only by a tag (for example
    ``"great<br /><br />The"``) stay separate words instead of being fused
    into ``"greatThe"``.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    str
        The visible text of ``text``.
    """
    soup = bs4.BeautifulSoup(text, "lxml")
    # separator=" " keeps a word boundary wherever a tag used to be;
    # strip=True drops whitespace-only nodes and trims each text fragment.
    return soup.get_text(separator=" ", strip=True)

In [17]:
# Replace each review with its tag-free text; the 'review' column of both
# frames is overwritten.
train['review'] = train['review'].map(clean_html_tags)
test['review'] = test['review'].map(clean_html_tags)

In [18]:
# Persist the current state of both frames, one JSON object per row.
train.to_json(path_or_buf='data/imdbReviewsTrainData.json', orient='records')
test.to_json(path_or_buf='data/imdbReviewsTestData.json', orient='records')

In [19]:
import json

# Reload the record-oriented JSON dumps as plain Python lists of dicts.
# From here on `train` and `test` are lists, no longer DataFrames.
# NOTE(review): these are the 'imdbHTMLReviews*' files, i.e. the dumps written
# before HTML cleaning; the 'imdbReviews*' files written after cleaning are
# never read back in this notebook -- confirm which data set is intended.
with open('data/imdbHTMLReviewsTrainData.json') as ftr:
    train = json.load(ftr)
with open('data/imdbHTMLReviewsTestData.json') as ftr:
    test = json.load(ftr)

In [20]:
# Inspect the first test record to check the structure of the loaded JSON.
test[0]

{'id': '12311_10',
 'review': "Naturally in a film who's main themes are of mortality, nostalgia, and loss of innocence it is perhaps not surprising that it is rated more highly by older viewers than younger ones. However there is a craftsmanship and completeness to the film which anyone can enjoy. The pace is steady and constant, the characters full and engaging, the relationships and interactions natural showing that you do not need floods of tears to show emotion, screams to show fear, shouting to show dispute or violence to show anger. Naturally Joyce's short story lends the film a ready made structure as perfect as a polished diamond, but the small changes Huston makes such as the inclusion of the poem fit in neatly. It is truly a masterpiece of tact, subtlety and overwhelming beauty."}

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re


class BaseTransformer(object):
    """Common base for feature transformers.

    Provides the ``_params`` dict in which a transformer records the settings
    it has chosen.
    """

    def __init__(self):
        # Empty until a subclass fills it in.
        self._params = {}


class BaseTextTransformer(BaseTransformer):
    """Base class for transformers that work on text features."""

    def __init__(self):
        super().__init__()
        # Filled in by _detect_language(); None means "not detected yet".
        self.language = None

    def _detect_language(self, text):
        """Record the language of ``text`` on ``self.language`` and return it.

        Detection is a placeholder: ``text`` is not inspected and the result
        is always ``'english'``.
        """
        self.language = 'english'
        return self.language

    @staticmethod
    def validate(feature_value):
        """Tell whether ``feature_value`` can be handled by this transformer.

        Placeholder implementation: every value is accepted.

        Declared without ``self`` so that it is a genuine static method and
        can be called both as ``BaseTextTransformer.validate(value)`` and as
        ``instance.validate(value)``.
        """
        return True

    # TODO: a disabled `_clean_text` / `_clean_text_feature` prototype
    # (lower-casing, regex filtering, stop-word removal) used to sit here as
    # a bare string literal.  It never ran, and its pattern `[^[^\W\d_]]` is
    # not a valid "non-letter" character class; reintroduce it as real,
    # tested code if explicit cleaning is needed.


class TfidfTransformer(BaseTextTransformer):
    """TF-IDF text vectorizer whose settings are chosen when it is fitted.

    Wraps a scikit-learn ``TfidfVectorizer``.  At fit time the language of the
    text feature is detected, and the vectorizer is configured with a
    vocabulary size cap and the NLTK stop-word list for that language.

    Parameters
    ----------
    max_features : int, optional
        Upper bound on the vocabulary size passed to the vectorizer
        (default 5000).
    """

    def __init__(self, max_features=5000):
        super().__init__()
        self.max_features = max_features
        # Created with default settings; the real ones are applied in fit().
        self.vectorizer = TfidfVectorizer()

    def _detect_parameters(self, text_feature):
        """Choose the vectorizer settings for ``text_feature``.

        Stores the chosen settings in ``self._params`` and returns that dict.
        Needs the NLTK stop-word corpus for the detected language.
        """
        self._detect_language(text_feature)
        self._params = {
            'max_features': self.max_features,
            'stop_words': stopwords.words(self.language),
        }
        return self._params

    @property
    def params(self):
        """Settings chosen by the last fit (the internal dict, not a copy)."""
        return self._params

    def fit(self, text_feature):
        """Configure the vectorizer for ``text_feature`` and fit it.

        Returns ``self`` so calls can be chained.
        """
        self.vectorizer.set_params(**self._detect_parameters(text_feature))
        self.vectorizer.fit(text_feature)
        return self

    def transform(self, text_feature):
        """Vectorize ``text_feature`` with the already fitted vectorizer."""
        return self.vectorizer.transform(text_feature)

    def fit_transform(self, text_feature):
        """Fit on ``text_feature`` and return its vectorized form.

        Equivalent to ``fit(text_feature).transform(text_feature)`` but lets
        the vectorizer do both in a single pass over the documents.
        """
        self.vectorizer.set_params(**self._detect_parameters(text_feature))
        return self.vectorizer.fit_transform(text_feature)

In [70]:
def json_select(data, path):
    """Collect the value stored under ``path`` in every element of ``data``.

    ``data`` is iterated once and each element is indexed with ``path``; the
    results are returned as a list in iteration order.  Whatever exception
    indexing raises (e.g. ``KeyError`` for a dict without that key) is
    propagated.
    """
    selected = []
    for record in data:
        selected.append(record[path])
    return selected

In [71]:
# Target labels: the 'sentiment' value of every training record.
y_train = json_select(train, 'sentiment')

In [72]:
# Extract the review texts once per split instead of rebuilding the lists
# for every call.
train_reviews = json_select(train, 'review')
test_reviews = json_select(test, 'review')

# Fit on the training reviews only, then vectorize both splits with the
# same fitted transformer.
transformer = TfidfTransformer().fit(train_reviews)
X_train = transformer.transform(train_reviews)
X_test = transformer.transform(test_reviews)

In [73]:
# Peek at the first ten training labels.
y_train[:10]

[1, 1, 0, 0, 1, 1, 0, 0, 0, 1]

In [74]:
from xgboost import XGBClassifier

# Train the classifier with n_estimators=700; all other settings are left
# at the library defaults.
clf = XGBClassifier(n_estimators=700).fit(X_train, y_train)
clf  # display the fitted estimator

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=700, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [75]:
# Submission table: the id of every test record next to its predicted label.
answer_df = pd.DataFrame({
    'id': json_select(test, 'id'),
    'sentiment': clf.predict(X_test),
})

In [76]:
# Write the predictions to disk; index=False leaves the row index out of the file.
answer_df.to_csv("baseline_answer.csv", index=False)