In [None]:
!pip3 install gensim sklearn numpy pandas
import datetime
import numpy as np
import pandas as pd
import re
import time
from gensim.sklearn_api import D2VTransformer, W2VTransformer
from sklearn.base import BaseEstimator, MetaEstimatorMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import LinearSVC

# Maximum number of rows taken from the dataset.
# Use None to process every row, or an integer to cap the dataset size.
DATASET_LIMIT = 500

# Dataset used for training/testing: keep exactly one assignment active.
DATASET_FILE = "IMDB_Dataset.csv"
# DATASET_FILE = "SST_dataset.csv"

# Path of the text file holding the EWE vectors.
EWE_FILE = "ewe_uni.txt"

def load_ewe(file, dim=300, encoding='utf-8'):
  """Load EWE word vectors from a text file into a dictionary.

  Each non-blank line is expected to hold a word followed by `dim`
  space-separated numbers. Words whose vector contains at least one
  component equal to 0 are skipped (and counted as ignored).

  Args:
    file: Path of the vector file.
    dim: Number of components expected after the word on every line.
    encoding: Text encoding used to read the file.

  Returns:
    dict mapping each kept word to its vector as a list of floats.

  Raises:
    AssertionError: If a line does not carry exactly `dim` components.
    ValueError: If a component cannot be parsed as a float.
  """
  vectors = {}
  good_count = 0
  bad_count = 0
  with open(file, 'r', encoding=encoding) as data_file:
    # Iterate lazily; embedding files can be too large for readlines().
    for line in data_file:
      # Drop the line terminator and any trailing padding so that a
      # trailing space does not produce a bogus extra item.
      stripped = line.rstrip()
      if not stripped:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
      items = stripped.split(' ')
      # First item is the word, the remaining items are its components.
      word = items[0]
      rest = items[1:]
      assert len(rest) == dim, (
        "expected %d components for %r, found %d" % (dim, word, len(rest)))
      vecs = [float(v) for v in rest]
      # Any zero component marks the vector as unusable.
      if 0.0 in vecs:
        bad_count += 1
        continue
      vectors[word] = vecs
      good_count += 1
      if good_count % 10000 == 0:
        print("EWE: Loaded", good_count, "words.")
  print("EWE: Loaded", good_count, "words.")
  print("EWE: Ignored", bad_count, "words with zero-length vectors.")
  return vectors

class WordTokenizer(BaseEstimator, MetaEstimatorMixin):
  """Pipeline step that splits every input string into a list of tokens.

  Tokens are the matches of a word-boundary pattern requiring at least two
  word characters. The EWE and W2V transformers in this file iterate over
  the words of each document, so this step is placed before them."""

  def fit(self, X, y=None):
    # Stateless: there is nothing to learn from the data.
    return self

  def transform(self, X):
    # Bind the compiled pattern's findall once, then apply it per document.
    find_tokens = re.compile(r"(?u)\b\w\w+\b").findall
    return [find_tokens(text) for text in X]

class EWETransformerDocLevel(W2VTransformer):
  """Doc-level transformer that applies pre-loaded EWE vectors.

  Every document (a list of tokens) is mapped to the column-wise mean of the
  EWE vectors of its lower-cased tokens. Tokens missing from the vector
  dictionary are skipped.

  The vectors are read from the module-level dictionary `ewe_vectors`
  (word -> list of floats), which must be defined before `transform` is
  called, e.g. `ewe_vectors = load_ewe(EWE_FILE)`.

  The constructor keeps the W2VTransformer signature so the class can be
  configured like its parent; `size` is used as the length of the zero
  vector returned for documents without any known word."""

  def __init__(self, size=300, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1,
    workers=4, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=1,
    trim_rule=None, sorted_vocab=1, batch_words=10000):
    # Keyword arguments keep the call correct even if the parent's
    # parameter order differs from the one listed here.
    super().__init__(
      size=size, alpha=alpha, window=window, min_count=min_count,
      max_vocab_size=max_vocab_size, sample=sample, seed=seed,
      workers=workers, min_alpha=min_alpha, sg=sg, hs=hs,
      negative=negative, cbow_mean=cbow_mean, hashfxn=hashfxn, iter=iter,
      null_word=null_word, trim_rule=trim_rule, sorted_vocab=sorted_vocab,
      batch_words=batch_words)

  def fit(self, X, y=None):
    """No-op fit.

    `transform` only looks words up in `ewe_vectors`, so training the
    parent's Word2Vec model here would be wasted work."""
    return self

  def transform(self, docs):
    """Return one averaged EWE vector per document.

    Args:
      docs: Iterable of token lists.

    Returns:
      numpy array with one row per document. Documents without any known
      word yield a zero vector of length `self.size` instead of NaN.
    """
    doc_vecs = []
    for doc in docs:
      # Collect the vectors of all in-vocabulary words of this document.
      word_vectors_per_doc = []
      for word in doc:
        word_lower = word.lower()
        # Out-of-vocabulary words are skipped.
        if word_lower in ewe_vectors:
          word_vectors_per_doc.append(ewe_vectors[word_lower])
      if word_vectors_per_doc:
        # Column-wise mean over the word matrix.
        doc_vecs.append(np.mean(word_vectors_per_doc, axis=0))
      else:
        # The mean of an empty matrix is a scalar NaN, which would make the
        # result ragged and feed NaN to the classifier; use zeros instead.
        print("Zero word vectors found for:")
        print(doc)
        doc_vecs.append(np.zeros(self.size))
    return np.array(doc_vecs)

class W2VTransformerDocLevel(W2VTransformer):
  """Extends Gensim's Word2Vec sklearn-wrapper class to further transform
  word-vectors into doc-vectors by averaging the words in each document.
  Suggestion for W2V integration taken from following page:
  https://github.com/alex2awesome/gensim-sklearn-tutorial/blob/master/notebooks/gensim-in-sklearn-pipelines.ipynb

  NOTE(review): words are lower-cased at lookup time only; whether the
  fitted vocabulary is lower-cased depends on the tokens passed to `fit`
  -- confirm this matches the intended behaviour."""

  def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1,
    workers=4, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
    trim_rule=None, sorted_vocab=1, batch_words=10000):
    # Keyword arguments keep the call correct even if the parent's
    # parameter order differs from the one listed here.
    super().__init__(
      size=size, alpha=alpha, window=window, min_count=min_count,
      max_vocab_size=max_vocab_size, sample=sample, seed=seed,
      workers=workers, min_alpha=min_alpha, sg=sg, hs=hs,
      negative=negative, cbow_mean=cbow_mean, hashfxn=hashfxn, iter=iter,
      null_word=null_word, trim_rule=trim_rule, sorted_vocab=sorted_vocab,
      batch_words=batch_words)

  def transform(self, docs):
    """Return one averaged word vector per document.

    Args:
      docs: Iterable of token lists.

    Returns:
      numpy array with one row per document. Documents without any
      in-vocabulary word yield a zero vector of length `self.size`
      instead of NaN.
    """
    doc_vecs = []
    for doc in docs:
      # Collect the vectors of all in-vocabulary words of this document.
      word_vectors_per_doc = []
      for word in doc:
        word_lower = word.lower()
        # Out-of-vocabulary words are skipped.
        if word_lower in self.gensim_model.wv:
          word_vectors_per_doc.append(self.gensim_model.wv[word_lower])
      if word_vectors_per_doc:
        # Column-wise mean over the word matrix.
        doc_vecs.append(np.mean(word_vectors_per_doc, axis=0))
      else:
        # The mean of an empty matrix is a scalar NaN, which would make the
        # result ragged and feed NaN to the classifier; use zeros instead.
        doc_vecs.append(np.zeros(self.size, dtype=np.float32))
    # Return the array itself (the original built it but returned the list).
    return np.array(doc_vecs)

In [None]:
# Start timer
start_time = time.time()
print("Processing started at", datetime.datetime.now())

# Load dataset and apply dataset limit if not set to None.
# Slicing with [:None] keeps every row, so DATASET_LIMIT = None means "all".
df = pd.read_csv(DATASET_FILE)
print(df.shape)
reviews = df['review'][:DATASET_LIMIT]
sentiments = df['sentiment'][:DATASET_LIMIT]

# Load EWE vector dictionary.
# EWETransformerDocLevel reads the module-level name `ewe_vectors`, so this
# line must be enabled together with the EWE pipeline below.
#ewe_vectors = load_ewe(EWE_FILE)

# Build EWE pipeline with tokenizer
#ewe = EWETransformerDocLevel(size=300, iter=50)
#ewe_pipeline = Pipeline([
#  ("tokenize", WordTokenizer()),
#  ("ewe", ewe)
#])

# Build Word2Vec pipeline with tokenizer
tokenize = WordTokenizer()
w2v = W2VTransformerDocLevel(size=100, iter=50)
w2v_pipeline = Pipeline([
  ("tokenize", tokenize),
  ("w2v", w2v)
])

# Build ngrams pipeline including unigrams and bigrams
vect = CountVectorizer(ngram_range=(1,2))
tf_idf = TfidfTransformer()
ngram_tf_idf_pipeline = Pipeline([
  ("vect", vect),
  ("tf_idf", tf_idf)
])

# Build feature union of all desired pipelines. Comment out items to disable.
features = FeatureUnion([
  ("ngram_tf_idf_pipeline", ngram_tf_idf_pipeline),
  #("ewe_pipeline", ewe_pipeline),
  ("w2v_pipeline", w2v_pipeline)
])

# Set up Linear SVC classifier, which seems good for sentiment analysis.
classifier = LinearSVC(C=1.0, class_weight="balanced", verbose=1)

# Build main pipeline
pipeline = Pipeline([
  ("features", features),
  ("classifier", classifier)
])

# Set up f1 scoring (weighted average over the classes)
scorer = make_scorer(f1_score, average='weighted')

# Train, test and score model on dataset using 10-fold cross-validation
scores = cross_val_score(pipeline, reviews, sentiments, scoring=scorer, cv=10)
# The scorer is weighted F1, not accuracy, so label the result accordingly.
print("\nWeighted F1:",scores.mean())

# Stop timer
end_time = time.time()
time_elapsed = end_time - start_time
print("Processing ended at", datetime.datetime.now())
print(time_elapsed, "seconds elapsed.")