In [1]:
import re
import pandas as pd
import numpy as np
import time

from collections import defaultdict
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.sparse import csr_matrix, diags

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Shared configuration read by the hand-rolled vectorizer helpers in this notebook.
# Using default token pattern from sklearn: runs of two or more word characters,
# so single-character tokens are dropped.
token_pattern = re.compile(r"(?u)\b\w\w+\b")
# Inclusive (min_n, max_n) n-gram sizes: unigrams, bigrams and trigrams.
ngram_range = (1, 3)
# Minimum corpus-wide count an n-gram needs in order to be kept by build_vocab.
min_freq = 10

In [3]:
# Load the tab-separated review file, then discard every row containing a missing value.
df = pd.read_csv("yelp_review_sentiment_2classes.tsv", delimiter="\t")
df = df.dropna()

## CountVectorizer Implementation

Something weird seems to happen with sklearn's CountVectorizer:

Result from CountVectorizer. 
'beer', 'beer copyright', 'burger', 'copyright'  
6 4 5 5  

The correct one should be  
'beer', 'pizza', 'beer copyright', 'burger', 'copyright'  
6 4 4 5 5  
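'pizza' is missing from the transformation.

Explanation: sklearn's `min_df` is a *document*-frequency threshold, not a total-count threshold. 'pizza' occurs 4 times in total but in only 3 documents, so `min_df=4` removes it, whereas `build_vocab` below filters on the total number of occurrences and therefore keeps it.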

In [162]:
en_stopwords = stopwords.words('english') + ["-PRON-", "-pron-", "PRON", "pron"]
# Frozen-set copy of the stopword list, used for O(1) membership tests during
# tokenisation. The list itself is kept because it is also handed to sklearn's
# `stop_words=` argument. Re-run this cell if `en_stopwords` is modified.
_en_stopword_set = frozenset(en_stopwords)

def preprocessing(text, ngram_range):
    """Tokenise `text`, drop stopwords and expand the remainder into n-grams.

    Args:
        text: Raw document string.
        ngram_range: Inclusive (min_n, max_n) pair forwarded to `build_ngrams`.

    Returns:
        List of space-joined n-gram strings, grouped by n-gram size.

    Note:
        Tokens are matched case-sensitively and are not lowercased, so a
        capitalised stopword such as "The" is not removed.
    """
    tokens = [
        token for token in token_pattern.findall(text)
        if token not in _en_stopword_set
    ]
    return build_ngrams(tokens, ngram_range)


def build_ngrams(text, ngram_range):
    """Expand a token sequence into space-joined n-grams.

    Args:
        text: Sequence of tokens (already filtered).
        ngram_range: Inclusive (min_n, max_n) pair of n-gram sizes.

    Returns:
        List with every contiguous window of each requested size, smallest
        size first and, within a size, in order of starting position.
    """
    smallest, largest = ngram_range[0], ngram_range[1]
    total = len(text)
    # Pairing each start offset with its matching end offset; zip stops as soon
    # as a window would run past the end of the sequence.
    return [
        " ".join(text[begin:finish])
        for size in range(smallest, largest + 1)
        for begin, finish in zip(range(total), range(size, total + 1))
    ]


def build_vocab(doc, ngram_range, min_freq):
    """Count n-grams over a corpus and index those that are frequent enough.

    Args:
        doc: Iterable of raw document strings.
        ngram_range: Inclusive (min_n, max_n) pair of n-gram sizes.
        min_freq: Minimum total number of occurrences (summed over all
            documents) an n-gram needs in order to be kept.

    Returns:
        Tuple `(vocabs, vocab_counts)`: `vocabs` maps each kept n-gram to a
        column index, `vocab_counts` maps it to its total count.
    """
    raw_counts = defaultdict(int)
    for text in tqdm(doc):
        for term in preprocessing(text, ngram_range):
            raw_counts[term] += 1

    # Keep first-seen order so column indices are stable for a given corpus.
    vocab_counts = {
        term: freq for term, freq in raw_counts.items() if freq >= min_freq
    }
    vocabs = {term: position for position, term in enumerate(vocab_counts)}
    return vocabs, vocab_counts


def build_count_vector(doc, vocabs, ngram_range=None):
    """Turn a corpus into a sparse document-term count matrix.

    Args:
        doc: Sized iterable of raw document strings.
        vocabs: Mapping from n-gram to column index, as returned by `build_vocab`.
        ngram_range: Inclusive (min_n, max_n) pair of n-gram sizes. It should
            match the range used to build `vocabs`. When omitted, the
            notebook-level `ngram_range` is used.

    Returns:
        `csr_matrix` of shape `(len(doc), len(vocabs))` with float counts.
        N-grams missing from `vocabs` are ignored.
    """
    if ngram_range is None:
        # The parameter shadows the notebook-level setting, so fetch the global
        # explicitly; it is read at call time, as the previous version did.
        ngram_range = globals()["ngram_range"]

    data = []
    indices = []
    indptr = [0]
    for text in tqdm(doc):
        feature_counts = defaultdict(int)
        for word in preprocessing(text, ngram_range):
            column = vocabs.get(word)
            if column is not None:
                feature_counts[column] += 1
        data.extend(feature_counts.values())
        indices.extend(feature_counts.keys())
        # Row boundary: everything appended so far belongs to rows <= current.
        indptr.append(len(data))

    return csr_matrix((data, indices, indptr), shape=(len(doc), len(vocabs)), dtype=float)

In [165]:
# Build the vocabulary from the full review corpus using the notebook-level settings.
vocabs, vocab_counts = build_vocab(df["text"], ngram_range, min_freq)
# Toy-corpus variant, kept for comparison with the CountVectorizer experiment
# further down (same six sentences, n-grams 1-3, threshold 4).
# vocabs, vocab_counts = build_vocab((
#     "the pizza pizza beer copyright",
#     "the pizza burger beer copyright",
#     "the the pizza beer beer copyright",
#     "the burger beer beer copyright",
#     "the coke burger coke copyright",
#     "the coke burger burger",
# ), (1, 3), 4)

100%|██████████| 249978/249978 [01:47<00:00, 2317.17it/s]


In [144]:
# Number of n-grams kept in the vocabulary.
# NOTE(review): the recorded output (6) does not match the full-corpus run; presumably it
# was produced with the commented-out toy corpus — re-run after build_vocab to confirm.
len(vocabs)

6

In [97]:
# Vectorise the whole corpus; the matrix is only displayed here, not stored in a variable.
build_count_vector(df["text"], vocabs)

100%|██████████| 249978/249978 [02:00<00:00, 2081.78it/s]


<249978x234784 sparse matrix of type '<class 'numpy.float64'>'
	with 18601591 stored elements in Compressed Sparse Row format>

In [98]:
# Vocabulary size; equals the number of columns of the count matrix built from `vocabs`.
len(vocabs)

234784

In [104]:
# Spot-check: total corpus count of one bigram that survived the min_freq filter.
vocab_counts["fat boy"]

10

In [158]:
# Fit sklearn's CountVectorizer on a six-sentence toy corpus for comparison with build_vocab.
# Note that `min_df=4` keeps a term only if it appears in at least 4 *documents*;
# it is not a threshold on the total number of occurrences.
# tfidf_vec = TfidfVectorizer(ngram_range=(1, 3), min_df=10, stop_words=en_stopwords)
cnt_vec = CountVectorizer(ngram_range=(1, 3), min_df=4, stop_words=en_stopwords)
cnt_vec.fit((
    "the pizza pizza beer copyright",
    "the pizza burger beer copyright",
    "the the pizza beer beer copyright",
    "the burger beer beer copyright",
    "the coke burger coke copyright",
    "the coke burger burger",
))

CountVectorizer(min_df=4, ngram_range=(1, 3),
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])

In [159]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(cnt_vec, "get_feature_names_out"):
    feature_names = cnt_vec.get_feature_names_out().tolist()
else:
    feature_names = cnt_vec.get_feature_names()
print(feature_names)
# Column sums of the count matrix = total occurrences of each kept term.
print(cnt_vec.transform((
    "the pizza pizza beer copyright",
    "the pizza burger beer copyright",
    "the the pizza beer beer copyright",
    "the burger beer beer copyright",
    "the coke burger coke copyright",
    "the coke burger burger",
)).toarray().sum(axis=0))

['beer', 'beer copyright', 'burger', 'copyright']
[6 4 5 5]


In [170]:
# Keep the toy-corpus count matrix in `res`; it is the input to normalize() further down.
res = cnt_vec.transform((
    "the pizza pizza beer copyright",
    "the pizza burger beer copyright",
    "the the pizza beer beer copyright",
    "the burger beer beer copyright",
    "the coke burger coke copyright",
    "the coke burger burger",
))
res

<6x4 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

## TfIdfVectorizer Implementation

In [208]:
def build_tfidf_vector(doc, vocabs):
    """Build a sparse TF-IDF matrix (not row-normalised) for a corpus.

    Term frequencies come from `build_count_vector`. The inverse document
    frequency uses the smoothed formula `ln((1 + n) / (1 + df)) + 1`, the one
    scikit-learn documents for `smooth_idf=True`.

    Args:
        doc: Sized iterable of raw document strings.
        vocabs: Mapping from n-gram to column index.

    Returns:
        Sparse matrix of shape `(len(doc), len(vocabs))` where each count is
        multiplied by the idf weight of its column.
    """
    term_vectors = build_count_vector(doc, vocabs)
    n_docs, n_terms = term_vectors.shape
    # Each stored entry is one (document, term) pair, so counting column ids
    # gives the number of documents containing each term.
    document_freq = np.bincount(term_vectors.indices, minlength=n_terms)
    # Add-one smoothing on numerator and denominator prevents division by zero.
    # The trailing "+ 1" keeps the weight strictly positive, so a term present
    # in every document (log(1) == 0) is not zeroed out.
    idf = np.log((n_docs + 1) / (document_freq + 1)) + 1.0
    # Scaling columns by a vector is done as a product with a sparse diagonal
    # matrix, which avoids densifying the term matrix.
    diag_idf = diags(idf, offsets=0, format="csr")
    return term_vectors @ diag_idf

In [209]:
# TF-IDF matrix for the whole corpus; displayed only, not stored in a variable.
build_tfidf_vector(df["text"], vocabs)

100%|██████████| 249978/249978 [02:01<00:00, 2059.15it/s]


<249978x234784 sparse matrix of type '<class 'numpy.float64'>'
	with 18601591 stored elements in Compressed Sparse Row format>

## Normalizer Implementation

In [253]:
def normalize(X):
    """Scale every row of a sparse matrix to unit L2 norm.

    Args:
        X: scipy sparse matrix (any format convertible to CSR) or a 2-D array.

    Returns:
        New float64 `csr_matrix` with the same shape and sparsity pattern as
        `X`. Rows whose norm is zero are left as zeros instead of producing NaN.
    """
    mat = csr_matrix(X, dtype=np.float64)
    n_rows = mat.shape[0]
    # Row id of every stored value, derived from the CSR row pointer.
    row_ids = np.repeat(np.arange(n_rows), np.diff(mat.indptr))
    square_sums = np.bincount(row_ids, weights=mat.data ** 2, minlength=n_rows)
    norms = np.sqrt(square_sums)
    # Guard against division by zero for empty rows or rows of stored zeros.
    norms[norms == 0] = 1.0
    # Passing the shape explicitly keeps trailing all-zero columns, which would
    # be dropped if the shape were inferred from the largest column index.
    return csr_matrix(
        (mat.data / norms[row_ids], mat.indices, mat.indptr),
        shape=mat.shape,
        dtype=np.float64,
    )

In [254]:
# Try the normaliser on the small CountVectorizer matrix stored in `res`.
normalize(res)

<6x4 sparse matrix of type '<class 'numpy.float64'>'
	with 17 stored elements in Compressed Sparse Row format>

## train_test_split Implementation

In [289]:
def split_data(X, y, test_size=0.1, random_state=None):
    """Randomly split features and labels into train and test parts.

    Args:
        X: pandas object with the features (indexed positionally via `.iloc`).
        y: pandas object with the labels, same length as `X`.
        test_size: Fraction of the rows, between 0 and 1, put in the test
            part. The test row count is rounded down.
        random_state: Optional seed for a reproducible shuffle. When None, the
            global NumPy random state is used.

    Returns:
        `((X_train, y_train), (X_test, y_test))`.

    Raises:
        ValueError: If `X` and `y` differ in length or `test_size` is outside
            the range [0, 1].
    """
    total_data = len(X)
    if len(y) != total_data:
        raise ValueError(
            f"X and y must have the same length, got {total_data} and {len(y)}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be within [0, 1], got {test_size}")

    total_test = int(total_data * test_size)
    sample_idx = np.arange(total_data)
    if random_state is None:
        np.random.shuffle(sample_idx)
    else:
        # A dedicated generator keeps the split reproducible without touching
        # the global random state.
        np.random.RandomState(random_state).shuffle(sample_idx)

    test_idx = sample_idx[:total_test]
    train_idx = sample_idx[total_test:]
    return (X.iloc[train_idx], y.iloc[train_idx]), (X.iloc[test_idx], y.iloc[test_idx])

## Cross validation

In [128]:
def save_division(a, b):
    """Return `a / b`, or 0.0 when the divisor `b` equals zero."""
    return 0. if b == 0 else a / b


def confusion_matrix(true, pred):
    """Tally binary classification outcomes for labels 0 and 1.

    Args:
        true: Iterable of ground-truth labels.
        pred: Iterable of predicted labels, paired with `true` by position.

    Returns:
        Tuple `(tp, fp, tn, fn)` of integer counts. Pairs in which either
        label is neither 0 nor 1 are not counted at all.
    """
    # Materialise once so each outcome can be counted with its own pass.
    pairs = list(zip(true, pred))

    def tally(actual_label, predicted_label):
        return sum(
            1 for actual, predicted in pairs
            if actual == actual_label and predicted == predicted_label
        )

    true_pos = tally(1, 1)
    false_pos = tally(0, 1)
    true_neg = tally(0, 0)
    false_neg = tally(1, 0)
    return true_pos, false_pos, true_neg, false_neg


def precision_micro(true, pred):
    """Precision of label 1: tp / (tp + fp), or 0.0 when nothing was predicted as 1.

    Despite the name, only the 0/1 counts from `confusion_matrix` are used.
    """
    true_pos, false_pos, _, _ = confusion_matrix(true, pred)
    predicted_pos = true_pos + false_pos
    return save_division(true_pos, predicted_pos)


def recall_micro(true, pred):
    """Recall of label 1: tp / (tp + fn), or 0.0 when no true label equals 1.

    Despite the name, only the 0/1 counts from `confusion_matrix` are used.
    """
    true_pos, _, _, false_neg = confusion_matrix(true, pred)
    actual_pos = true_pos + false_neg
    return save_division(true_pos, actual_pos)


def f1_micro(true, pred):
    """Harmonic mean of `precision_micro` and `recall_micro` (0.0 if both are zero)."""
    p = precision_micro(true, pred)
    r = recall_micro(true, pred)
    numerator = 2 * p * r
    denominator = p + r
    return save_division(numerator, denominator)


In [129]:
def _take_rows(data, idx):
    """Select rows `idx` from a NumPy array, scipy sparse matrix, pandas object or list."""
    if hasattr(data, "iloc"):
        return data.iloc[idx]
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)
    return data[idx]


def cross_validate(model, X, y, folds=5, metrics=None):
    """Evaluate `model` with contiguous (unshuffled) k-fold cross validation.

    Args:
        model: Estimator exposing `fit(X, y)` and `predict(X)`. It is re-fitted
            in place on every fold.
        X: Feature container with a `.shape` attribute; rows are samples.
            NumPy arrays, scipy sparse matrices and pandas objects all work.
        y: Labels aligned with the rows of `X`.
        folds: Number of folds. Exactly this many folds are produced; when the
            sample count is not divisible, the first folds get one extra row.
        metrics: Optional iterable of callables `metric(true, pred)`. Defaults
            to `f1_micro`, `precision_micro` and `recall_micro`.

    Returns:
        Dict of lists with one entry per fold: `fit_time`, `score_time`, and
        `test_<name>` / `train_<name>` for every metric, where `<name>` is the
        metric's `__name__`.

    Raises:
        ValueError: If `folds` is below 2 or exceeds the number of samples.
    """
    n_samples = X.shape[0]
    if not 2 <= folds <= n_samples:
        raise ValueError(
            f"folds must be between 2 and the number of samples ({n_samples}), got {folds}"
        )
    if metrics is None:
        metrics = (f1_micro, precision_micro, recall_micro)

    # Spread the remainder over the first folds so no extra short fold appears.
    fold_sizes = np.full(folds, n_samples // folds, dtype=int)
    fold_sizes[: n_samples % folds] += 1
    fold_stops = np.cumsum(fold_sizes)
    fold_starts = fold_stops - fold_sizes

    scores = {"fit_time": [], "score_time": []}
    for metric in metrics:
        scores[f"test_{metric.__name__}"] = []
        scores[f"train_{metric.__name__}"] = []

    all_idx = np.arange(n_samples)
    for start, end in zip(fold_starts, fold_stops):
        # Selecting rows by index works for dense and sparse inputs alike,
        # which concatenating slices does not.
        test_idx = all_idx[start:end]
        train_idx = np.concatenate([all_idx[:start], all_idx[end:]])

        X_train, y_train = _take_rows(X, train_idx), _take_rows(y, train_idx)
        X_test, y_test = _take_rows(X, test_idx), _take_rows(y, test_idx)

        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        scores["fit_time"].append(time.perf_counter() - start_time)

        # Score time covers prediction on both the train and the test part.
        start_time = time.perf_counter()
        preds_train = model.predict(X_train)
        preds_test = model.predict(X_test)
        scores["score_time"].append(time.perf_counter() - start_time)

        for metric in metrics:
            name = metric.__name__
            scores[f"train_{name}"].append(metric(y_train, preds_train))
            scores[f"test_{name}"].append(metric(y_test, preds_test))

    return scores

In [130]:
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression

# The metric helpers in this notebook only count labels 0 and 1 and silently
# ignore every other label, so the demo uses the two-class subset of the digits
# data instead of all ten classes.
digits = load_digits(n_class=2)
X, y = digits.data, digits.target
clf = LogisticRegression(random_state=42)

cross_validate(clf, X, y, 3)

{'fit_time': [0.10642695426940918, 0.12486886978149414, 0.16100716590881348],
 'score_time': [0.0006372928619384766,
  0.0007798671722412109,
  0.0009598731994628906],
 'test_f1_micro': [1.0, 1.0, 1.0],
 'train_f1_micro': [1.0, 1.0, 1.0],
 'test_precision_micro': [1.0, 1.0, 1.0],
 'train_precision_micro': [1.0, 1.0, 1.0],
 'test_recall_micro': [1.0, 1.0, 1.0],
 'train_recall_micro': [1.0, 1.0, 1.0]}

In [4]:
# Scratch check: comparing an array with a scalar is element-wise and yields a boolean mask.
x = np.array([1, 0, 0, 1])
x == 1

array([ True, False, False,  True])