In [None]:
import os
import numpy as np
import re
from sklearn.datasets import load_files
from sklearn.metrics import accuracy_score, roc_auc_score

# Load the labelled IMDB training reviews from disk.
# NOTE(review): expanduser() only expands a leading '~', so it leaves this
# absolute, machine-specific path unchanged.
path_to_movies = os.path.expanduser('/home/analyticsvidhya/IMDB/')
train_dir = os.path.join(path_to_movies, 'train')
reviews_train = load_files(train_dir)
# Documents and their class labels, kept in matching order.
text_train = reviews_train.data
y_train = reviews_train.target

In [None]:
def to_vw_format(document, label=None):
    """Render one text document as a line in Vowpal Wabbit input format.

    The document is lower-cased and reduced to its word tokens of at least
    three characters, which are written as features of the ``text`` namespace.

    Args:
        document: The text of the document (a ``str``).
        label: Optional label written before the ``|`` separator. ``None``
            produces an unlabelled example.

    Returns:
        A newline-terminated string of the form ``<label> |text <tokens>``.
    """
    # Compare against None explicitly so that a legitimate label of 0 is kept
    # instead of being treated as "no label".
    label_part = '' if label is None else str(label)
    # Raw string: a bare '\w' in a normal literal is an invalid escape sequence.
    tokens = re.findall(r'\w{3,}', document.lower())
    return label_part + ' |text ' + ' '.join(tokens) + '\n'
# Preview the conversion on a single review. The text and its label must come
# from the same index, otherwise the example pairs a review with another
# review's sentiment.
to_vw_format(str(text_train[1]), 1 if y_train[1] == 1 else -1)


# Split the training data into train (first 70%) and validation (remainder).
# The same cut-off index is used for texts and labels so they stay aligned.
# NOTE(review): a positional split assumes the documents are already shuffled
# (sklearn's load_files shuffles by default) - confirm if the loader changes.
train_size = int(0.7 * len(text_train))
train, train_labels = text_train[:train_size], y_train[:train_size]
valid, valid_labels = text_train[train_size:], y_train[train_size:]

# Convert both splits to Vowpal Wabbit format and save them to disk.
# Targets equal to 1 are written as label 1, every other target as -1.
for vw_path, texts, labels in (
    ('movie_reviews_train.vw', train, train_labels),
    ('movie_reviews_valid.vw', valid, valid_labels),
):
    with open(vw_path, 'w') as vw_file:
        for text, target in zip(texts, labels):
            vw_file.write(to_vw_format(str(text), 1 if target == 1 else -1))

## Training 

In [None]:
# Fitting a logistic regression for predicting the sentiment of a review.
# Trains on movie_reviews_train.vw (-d) with logistic loss and saves the
# resulting model to movie_reviews_model.vw (-f), which the testing cell loads.
!vw -d movie_reviews_train.vw --loss_function logistic -f movie_reviews_model.vw

## Testing 

In [None]:
# Score the validation set with the saved model: -i loads the model, -t runs
# in test-only mode (no learning), and -p writes one prediction per example to
# movie_valid_pred.txt.
# NOTE(review): no --link option is given, so the predictions are presumably
# raw scores rather than probabilities - confirm before interpreting them.
!vw -i movie_reviews_model.vw -t -d movie_reviews_valid.vw -p movie_valid_pred.txt --quiet

In [None]:
# Read one numeric prediction per line from the Vowpal Wabbit output file.
with open('movie_valid_pred.txt') as pred_file:
    valid_prediction = [float(line) for line in pred_file]

# A positive score is treated as class 1, anything else as class 0.
valid_pred_classes = [int(score > 0) for score in valid_prediction]
valid_accuracy = accuracy_score(valid_labels, valid_pred_classes)
valid_auc = roc_auc_score(valid_labels, valid_prediction)

print("Accuracy: {}".format(round(valid_accuracy, 5)))
print("AUC: {}".format(round(valid_auc, 5)))

In [None]:
# Retrain the logistic model with bigram features (--ngram 2) and save it to
# movie_reviews_model_bigram.vw.
!vw -d movie_reviews_train.vw --loss_function logistic --ngram 2 -f movie_reviews_model_bigram.vw --quiet

## Model interpretability 

In [None]:
# Same bigram training run, but --invert_hash writes a human-readable model
# to movie_reviews_readable_model_bigram.vw so the learned features can be
# inspected. No -f is given, so no binary model file is saved by this command.
!vw -d movie_reviews_train.vw --loss_function logistic --ngram 2 --invert_hash movie_reviews_readable_model_bigram.vw

## Regularization

In [None]:
# Bigram logistic model trained with both L1 (--l1) and L2 (--l2) penalties.
# NOTE(review): -f reuses movie_reviews_model_bigram.vw, so this run overwrites
# the unregularized bigram model saved by the earlier bigram training cell -
# confirm that is intended or pick a distinct file name.
!vw -d movie_reviews_train.vw --l1 0.00005 --l2 0.00005 --loss_function logistic --ngram 2 -f movie_reviews_model_bigram.vw