In [2]:
import svmlight
from typing import List, Dict
import math
import os
from collections import Counter

### Load files

In [3]:
def load_words(path):
    with open(path,'r') as f:
        lines = f.readlines()
        data = list(map(lambda w: w.replace('\n',''), lines))
    return data

In [4]:
def get_word_counts(files):
    
    word_counts = Counter()
    for file in files:
        words = load_words(file.path)
        word_counts.update(words)
    return word_counts
import scandir as os
negative_word_counts = get_word_counts(os.scandir("./NEG"))
positive_word_counts = get_word_counts(os.scandir("./POS"))
all_word_counts = negative_word_counts + positive_word_counts
import os

### Calculate smoothed log probs

In [5]:
def get_log_probs(positive_word_counts, negative_word_counts):
    total_pos = sum(positive_word_counts.values())
    total_neg = sum(negative_word_counts.values())
    total_unique_words = sum(all_word_counts.values())
    # {word: {sentiment: log_prob}}
    probs = {}

    for word, count in all_word_counts.items():
        word_prob = {
            "POS": math.log((positive_word_counts[word]+1)/(total_pos+total_unique_words)),
            "NEG": math.log((negative_word_counts[word]+1)/(total_neg+total_unique_words))
        }
        probs[word] = word_prob
    return probs
get_log_probs(positive_word_counts, negative_word_counts)

ValueError: math domain error

In [None]:
def get_log_class_probs(posdir, negdir):
    pos_docs = len(os.listdir(posdir))
    neg_docs = len(os.listdir(negdir))
    total = pos_docs + neg_docs
    return {
        "POS": math.log(pos_docs/total),
        "NEG": math.log(neg_docs/total)
    }    
get_log_class_probs(posdir="./POS", negdir="./NEG")

In [None]:
def nb_classify_file(filename, class_probs, word_log_probs):
    posSum = 0
    negSum = 0
    words = load_words(filename)
    for word in words:
        if word in word_log_probs:
            posSum += word_log_probs[word]["POS"]
            negSum += word_log_probs[word]["NEG"]
    posSum += class_probs["POS"]
    negSum += class_probs["NEG"]
    return "POS" if (posSum > negSum) else "NEG"
    

In [None]:
def nb_classify_files(files, class_probs, word_log_probs):
    return {file : naive_bayes_file(file) for file in files}

## Using sklearn

In [20]:
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

import numpy as np
import re

from nltk.stem.porter import PorterStemmer

In [26]:
reviews = load_files(container_path=".", categories=["POS", "NEG"])


array([[ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11]])

In [30]:
def substr(s, i,j):
    return s[i:j]
np.argwhere(reviews.filenames != './NEG/cv000')
reviews.filenames[2]

'./POS/cv132_5618.tag'

In [None]:
# Split dataset

feature_train, feature_test, label_train, label_test = \
train_test_split(reviews.data, reviews.target, test_size=0.1)

Unigrams

In [None]:
def unigrams(feature_train, label_train, feature_test, label_test):
    # Train
    cv = CountVectorizer(stop_words=None)
    feature_train_counts = cv.fit_transform(feature_train)
    nb_classifier = MultinomialNB(alpha=1).fit(feature_train_counts, label_train)
    # Evaluate

    X = cv.transform(feature_test)
    predicted = nb_classifier.predict(X)
    return np.mean(predicted == label_test)

In [None]:
def nb_train_and_evaluate(feature_train, label_train, feature_test, label_test):
    # Train
    cv = CountVectorizer(stop_words=None)
    feature_train_counts = cv.fit_transform(feature_train)
    nb_classifier = MultinomialNB(alpha=1).fit(feature_train_counts, label_train)
    # Evaluate

    X = cv.transform(feature_test)
    predicted = nb_classifier.predict(X)
    return np.mean(predicted == label_test)

In [None]:
nb_train_and_evaluate(feature_train, label_train, feature_test, label_test)

Bigrams

In [None]:
def bigrams(feature_train, label_train, feature_test, label_test):
    # Train
    cv = CountVectorizer(ngram_range=(2,2), stop_words=None)
    feature_train_bigram_counts = cv.fit_transform(feature_train)
    nb_classifier = MultinomialNB(alpha=1).fit(feature_train_bigram_counts, label_train)

    # Evaluate
    X = cv.transform(feature_test)
    predicted = nb_classifier.predict(X)
    return np.mean(predicted == label_test)

In [None]:
bigrams(feature_train, label_train, feature_test, label_test)

Stemming + Unigrams

In [None]:
porter_stemmer = PorterStemmer()
def stemming_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words

def stemming_unigrams(feature_train, label_train, feature_test, label_test):
    #Train
    cv = CountVectorizer(ngram_range=(1,1), stop_words=None, tokenizer=stemming_tokenizer)
    feature_train_bigram_counts = cv.fit_transform(feature_train)
    nb_classifier = MultinomialNB(alpha=1).fit(feature_train_bigram_counts, label_train)

    # Evaluate
    X = cv.transform(feature_test)
    predicted = nb_classifier.predict(X)
    return np.mean(predicted == label_test)

In [None]:
stemming_unigrams(feature_train, label_train, feature_test, label_test)

Cross-validation

In [None]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3)

def cross_validate_nb(reviews, func):
    splits = skf.split(reviews.data, reviews.target)
    scores = []
    for train_index, test_index in splits:
        X_train, X_test = np.asarray(reviews.data)[train_index], np.asarray(reviews.data)[test_index]
        y_train, y_test = reviews.target[train_index], reviews.target[test_index]
        scores.append(func(X_train, y_train, X_test, y_test))
    return sum(scores)/len(scores)

In [None]:
cross_validate_nb(reviews, unigrams)

In [None]:
cross_validate_nb(reviews, bigrams)

In [None]:
cross_validate_nb(reviews, stemming_unigrams)

# SVM

In [1]:
import svmlight
from sklearn.datasets import dump_svmlight_file
from sklearn.feature_extraction.text import CountVectorizer

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

import numpy as np
import re

from nltk.stem.porter import PorterStemmer
reviews = load_files(container_path=".", categories=["POS", "NEG"])
reviews.data
reviews.filenames
feature_train, feature_test, label_train, label_test = \
train_test_split(reviews.data, reviews.target, test_size=0.1)

cv = CountVectorizer(stop_words=None)
feature_train_counts = cv.fit_transform(feature_train)
label_train[label_train==0]=-1
dump_svmlight_file(feature_train_counts, label_train, "svm_out", zero_based=False) 

In [2]:
import subprocess
subprocess.call(["svm_learn", "svm_out", "model"])

0

In [16]:
feature_test_counts = cv.transform(feature_test)
label_test[label_test==0]=-1

dump_svmlight_file(feature_test_counts, label_test, "svm_test_out", zero_based=False)

In [17]:
subprocess.call(["svm_classify", "svm_test_out","model", "predictions"])

0

In [18]:
%cat predictions

0.98287322
0.074604071
-0.57099994
-2.2587009
-0.29174799
5.377817
-0.091312796
2.2776926
-0.30844986
-0.30637832
-0.64817363
-0.6400294
-0.52056378
-0.3128901
-1.1536328
0.62604578
-1.1160028
2.689419
0.16954575
-0.085207517
-0.58443436
-0.28622555
-1.0793179
1.2037501
2.093639
-0.21668892
-0.83525818
-0.6233429
-1.0580845
-0.050027912
-0.24510117
-1.6731023
-1.0322898
-0.86676932
0.072943901
0.033111139
1.2852925
0.092368036
-0.053854434
-0.83750909
-0.46447945
-0.096754284
-0.74283587
0.15314441
-0.5117
-0.001580273
0.24633156
0.33567507
1.4891685
-0.47314745
-0.60390055
0.60901112
-0.90891548
0.145736
0.30570762
0.49148189
-0.30791946
0.39665084
2.0180885
1.3799447
-0.49014522
0.55708774
-0.34992737
3.114812
0.21280077
-0.8389494
-0.068284577
-0.4080433
-1.3044649
0.026515026
-0.81939518
-0.56585431
-1.1041063
0.74567713
0.28800595
-1.7545375
-0.95639584
-0.24865282
-0.32979116
-0.033853944
0.029821385