In [31]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk.data
from bs4 import BeautifulSoup
from gensim.models import word2vec
import logging
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Timestamped INFO-level logging; gensim reports its Word2Vec training progress through it.
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

# Fetch both NLTK resources this notebook reads: the stop-word lists used by
# filterWords and the Punkt sentence model loaded on the next line. Without the
# 'punkt' download, nltk.data.load raises LookupError on a machine that has
# never fetched it. Both downloads are no-ops when the data is already present.
nltk.download('stopwords')
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nikita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
# Load the labelled train/test splits; the relative paths resolve against the
# notebook's working directory.
train = pd.read_csv(filepath_or_buffer="../P1_train.csv")
test = pd.read_csv(filepath_or_buffer="../P1_test.csv")

In [33]:
# Class-balance check: number of training rows per label.
train["label"].value_counts()

1    736
2    661
0    263
Name: label, dtype: int64

In [34]:
# Class-balance check: number of test rows per label.
test["label"].value_counts()

1    303
2    298
0     82
Name: label, dtype: int64

In [35]:
# Lazily-filled cache for the English stop-word set (see _englishStopwords).
_ENGLISH_STOPWORDS = None


def _englishStopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def filterWords(review):
    """Turn raw text into a list of lowercase, non-stop-word tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lowercased and split on whitespace, and English stop words are
    dropped.

    Args:
        review: The text to clean (a ``str``).

    Returns:
        list[str]: The remaining words, in their original order.
    """
    text = re.sub("[^a-zA-Z]", " ", review)
    # The stop-word list never changes between calls, so it is fetched from a
    # cache instead of being re-read and re-hashed for every review.
    stops = _englishStopwords()
    return [word for word in text.lower().split() if word not in stops]

def tokenize(review, tokenizer):
    """Split a review into sentences and clean each one into a word list.

    Args:
        review: The raw review text; surrounding whitespace is stripped first.
        tokenizer: Object whose ``tokenize(text)`` method returns the
            sentences of ``text``.

    Returns:
        list[list[str]]: One ``filterWords`` result per non-empty sentence,
        in sentence order.
    """
    rawSentences = tokenizer.tokenize(review.strip())
    # Zero-length sentences are skipped; everything else goes through filterWords.
    return [filterWords(chunk) for chunk in rawSentences if len(chunk) > 0]

In [36]:
# Word2Vec training corpus: one cleaned word list per sentence, collected
# across every training review.
sentences = [
    wordList
    for review in train["sentence"]
    for wordList in tokenize(review, tokenizer)
]

In [37]:
# Hyperparameters for the embedding / feature-vector step.
downsampling = 0.001    # forwarded to Word2Vec as `sample`
min_word_count = 1      # forwarded to Word2Vec as `min_count`
num_features = 100      # length of each averaged review vector

In [38]:
# Train the embedding on the tokenized training sentences. The vector width is
# passed explicitly so the model always agrees with num_features, which the
# averaging helpers use to size their arrays; previously the two only matched
# because gensim's default width is also 100.
# NOTE: `size` is the gensim 3.x spelling; gensim >= 4.0 renamed it `vector_size`.
model = word2vec.Word2Vec(sentences, size=num_features, min_count=min_word_count, sample=downsampling)

INFO - 04:33:11: collecting all words and their counts
INFO - 04:33:11: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 04:33:11: collected 11351 word types from a corpus of 41409 raw words and 1660 sentences
INFO - 04:33:11: Loading a fresh vocabulary
INFO - 04:33:11: effective_min_count=1 retains 11351 unique words (100% of original 11351, drops 0)
INFO - 04:33:11: effective_min_count=1 leaves 41409 word corpus (100% of original 41409, drops 0)
INFO - 04:33:11: deleting the raw counts dictionary of 11351 items
INFO - 04:33:11: sample=0.001 downsamples 9 most-common words
INFO - 04:33:11: downsampling leaves estimated 40861 word corpus (98.7% of prior 41409)
INFO - 04:33:11: estimated required memory for 11351 words and 100 dimensions: 14756300 bytes
INFO - 04:33:11: resetting layer weights
INFO - 04:33:16: training model with 3 workers on 11351 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
INFO - 04:33:16: worker thread finishe

In [39]:
def getFeatureVecs(words, model, num_features):
    """Average the word vectors of all in-vocabulary words in ``words``.

    Args:
        words: Iterable of tokens for one review.
        model: Trained Word2Vec model; only ``model.wv`` is used, for the
            membership test and the vector lookup.
        num_features: Length of the word vectors (and of the result).

    Returns:
        numpy.ndarray: The mean vector of the known words. When none of the
        words is in the vocabulary (or ``words`` is empty) an all-zero
        float32 vector is returned instead of dividing by zero, which
        would yield NaNs that scikit-learn estimators reject.
    """
    keyedVectors = model.wv
    featureVec = np.zeros(num_features, dtype="float32")
    totalWords = 0
    for word in words:
        # Direct membership test on the keyed vectors: avoids rebuilding a
        # set of the whole vocabulary on every call.
        if word in keyedVectors:
            totalWords += 1
            # Look the vector up through model.wv; indexing the model itself
            # (model[word]) is deprecated in gensim.
            featureVec = featureVec + keyedVectors[word]
    if totalWords == 0:
        return featureVec
    return featureVec / totalWords

def getAvgFeatures(train_sentences, model, num_features):
    """Build a matrix holding one averaged word vector per tokenized review.

    Args:
        train_sentences: Sized collection of word lists, one per review.
        model: Word-vector model handed through to ``getFeatureVecs``.
        num_features: Number of columns, i.e. the word-vector length.

    Returns:
        numpy.ndarray: float32 array of shape
        ``(len(train_sentences), num_features)``; row ``i`` is the
        ``getFeatureVecs`` result for review ``i``.
    """
    reviewCount = len(train_sentences)
    featureMatrix = np.zeros((reviewCount, num_features), dtype="float32")
    for row, words in zip(featureMatrix, train_sentences):
        # `row` is a view into featureMatrix, so this writes the matrix in place.
        row[:] = getFeatureVecs(words, model, num_features)
    return featureMatrix

In [40]:
# Clean every training review into a word list, then average its word vectors
# into one fixed-length feature row per review.
train_sentences = [filterWords(sentence) for sentence in train['sentence']]
trainDataVecs = getAvgFeatures(train_sentences, model, num_features)

  


In [41]:
# Same cleaning and averaging for the test reviews, using the embedding that
# was trained on the training split.
clean_test_reviews = [filterWords(review) for review in test["sentence"]]
testDataVecs = getAvgFeatures(clean_test_reviews, model, num_features)

  


In [42]:
# Fit a random forest on the averaged training vectors and predict the test labels.
# random_state is fixed so the forest's bootstrap sampling and feature selection
# are repeatable across kernel restarts; without it every run gives slightly
# different scores for the same input features.
forest = RandomForestClassifier(n_estimators = 1400, random_state = 42)
forest = forest.fit(trainDataVecs, train["label"])
result = forest.predict(testDataVecs)

In [43]:
# Fraction of test rows whose predicted label matches the true label.
print(accuracy_score(y_true=test['label'], y_pred=result))

0.5446559297218155


In [44]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
# NOTE(review): the truncated warning saved with this cell's output looks like
# scikit-learn's undefined-metric warning — presumably at least one class was
# never predicted; confirm with a confusion matrix.
print(f1_score(y_true=test['label'], y_pred=result, average='macro'))

0.38527426067747766


  'precision', 'predicted', average, warn_for)


In [48]:
# Attach the predictions positionally. Wrapping them in pd.Series first would
# make pandas align on index labels (0..n-1), which silently produces NaNs or
# shuffled labels whenever `test` does not carry the default RangeIndex.
test['predicted_label'] = result

In [49]:
# Persist the test frame, including the predicted_label column, as CSV.
# The DataFrame index is written too, as the first (unnamed) column.
test.to_csv(path_or_buf='testing_output_word2vec.csv', sep=',')

In [None]:
# Reference: https://www.kaggle.com/varun08/sentiment-analysis-using-word2vec