# Bag Of Words
## S1: "I LIKE it"    
## S2: "I don't like it"
## S3: "I like it a lot"

||I|like|it|don't|a|lot|(I like)| (like it)|...|
|--|--|--|--|--|--|--|--|--|--|
|S1|1|1|1|0|0|0|1|1|..|
|S2|1|1|1|1|0|0|0|1|..|
|S3|1|1|1|0|1|1|1|1|..|

### Difficulties creating the matrix
* Capitalization
* Punctuation
* Typos/Slang/Too many Spaces
* Encoding (ASCII/utf-8)

### Difficulties with doc-term (feature) matrix
Turn words into tokens
* n-grams (sequences of n adjacent words; 2-gram = every pair of neighbouring words, e.g. "I like", "like it")
* Stop words (very common words to leave out), e.g. "a" "and"
* Stemming (use only the stem of words, cleaned -> clean)
* Lemmatization (use language rules to simplify matrix)

Package recommendation: `nltk`

In [85]:
import numpy as np
import re
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression

In [86]:
# Project Gutenberg plain-text URLs for the two novels used in this notebook.
# NOTE(review): neither URL is referenced by the cells visible below, which read
# local chapter files (sea*.txt / moby*.txt) instead.
url_moby_dick = 'https://www.gutenberg.org/files/2701/old/moby10b.txt'
url_sea_wolf = 'https://www.gutenberg.org/cache/epub/1074/pg1074.txt'

## File import

```python
i = 1
# 1) 'sea'+str(i)+'.txt'   -> sea1.txt
# 2) f'sea{i}.txt'         -> sea1.txt
# 3) 'sea{}.txt'.format(i) -> sea1.txt
```

In [88]:
# Read Sea-Wolf chapters 1-10 from sea1.txt ... sea10.txt.
# Each chapter is stored as the list of its lines (line endings kept).
seawolf_chapters = []
for chapter_no in range(1, 11):
    # Explicit encoding: without it open() uses the platform default, which
    # differs between operating systems and can garble non-ASCII characters.
    with open(f'sea{chapter_no}.txt', 'r', encoding='utf-8') as my_file:
        seawolf_chapters.append(my_file.readlines())

In [89]:
# Read Moby-Dick chapters 1-10 from moby1.txt ... moby10.txt.
# Each chapter is stored as the list of its lines (line endings kept).
mobydick_chapters = []
for chapter_no in range(1, 11):
    # Explicit encoding: without it open() uses the platform default, which
    # differs between operating systems and can garble non-ASCII characters.
    with open(f'moby{chapter_no}.txt', 'r', encoding='utf-8') as my_file:
        mobydick_chapters.append(my_file.readlines())

In [90]:
# import moby dick chapters 1-10

In [91]:
# Flatten the Sea-Wolf chapters (a list of line lists) into one list of lines.
corpus = [line for chapter_lines in seawolf_chapters for line in chapter_lines]

In [92]:
# Flatten the per-chapter line lists into one list; the displayed length is the
# number of Sea-Wolf rows needed when building y_train.
corpus_seawolf = []
for chapter_lines in seawolf_chapters:
    corpus_seawolf.extend(chapter_lines)
len(corpus_seawolf)

3038

In [93]:
# Flatten the per-chapter line lists into one list; the displayed length is the
# number of Moby-Dick rows needed when building y_train.
corpus_mobydick = []
for chapter_lines in mobydick_chapters:
    corpus_mobydick.extend(chapter_lines)
len(corpus_mobydick)

1955

In [94]:
# Combined corpus for the BoW: all Sea-Wolf lines first, then all Moby-Dick lines.
# `+` builds a new list, so neither source list is modified.
# (A plain `corpus = corpus_seawolf` would NOT copy anything: both names would
# refer to the same list object, and extending one would change the other.)
corpus = corpus_seawolf + corpus_mobydick

## Test sentences

In [95]:
test_texts = []

In [96]:
test_texts.append("""Louis has also given me additional information about Death Larsen, which
tallies with the captain’s brief description.""") 

In [97]:
test_texts.append("""Be it said, that
though I had felt such a strong repugnance to his smoking in the bed
the night before, yet see how elastic our stiff prejudices grow when
love once comes to bend them.""")

In [98]:
test_texts.append("""To crawl is piggish; but to not crawl, to be as
the clod and rock, is loathsome to contemplate.""")

In [99]:
test_texts.append("""Upon opening my eyes then, and coming
out of my own pleasant and self-created darkness into the imposed and
coarse outer gloom of the unilluminated twelve-o'clock-at-night, I
experienced a disagreeable revulsion.""")

In [100]:
test_texts.append("""For the Preacher loved life, and did
not want to die, saying, ‘For a living dog is better than a dead lion.’""")

### Seawolf = 1 ; Moby-Dick = 0

In [101]:
# Labels follow the corpus order: Sea-Wolf lines (1) first, then Moby-Dick lines (0).
# The counts are taken from the data instead of being hard-coded, so the labels
# stay aligned with the corpus if the chapter files change.
y_train = pd.Series([1] * len(corpus_seawolf) + [0] * len(corpus_mobydick))

## Manual feature extraction (tokenization)

In [102]:
# Tokenize every corpus line: keep runs of at least 4 ASCII letters, lower-cased.
# (The previous pattern had a stray `+` inside the character class, which made
# literal plus signs count as word characters.)
wordlists = [
    [word.lower() for word in re.findall(r'[A-Za-z]{4,}', sentence)]
    for sentence in corpus
]

In [103]:
# Flatten the per-line token lists into one long list (duplicates still included).
wordset = []
for tokens in wordlists:
    wordset.extend(tokens)

In [104]:
len(wordset)

26889

In [105]:
# Vocabulary: the distinct words in sorted order (the columns of the doc-term matrix).
wordset = sorted(set(wordset))

In [106]:
len(wordset)

6826

In [115]:
# function to calculate entry in Bag of Words doc-term matrix
def calculate_bag_of_words(wordset, sentence):
    """Count how often each vocabulary word occurs in one tokenized sentence.

    Parameters
    ----------
    wordset : iterable of str
        The vocabulary. Its order defines the key order of the result.
    sentence : iterable of str
        The tokens of a single document/line.

    Returns
    -------
    dict
        Maps every word in ``wordset`` to its number of occurrences in
        ``sentence`` (0 if absent). Tokens outside the vocabulary are ignored.
    """
    tf_dict = dict.fromkeys(wordset, 0)
    for word in sentence:
        # Membership is tested on the dict (O(1)) rather than on the vocabulary
        # list, and counts are accumulated instead of rescanning the sentence.
        if word in tf_dict:
            tf_dict[word] += 1
    return tf_dict

In [108]:
# Build one bag-of-words row (dict of vocabulary word -> count) per tokenized
# line, then stack the rows into the doc-term matrix.
bow = []
for tokens in wordlists:
    bow.append(calculate_bag_of_words(wordset, tokens))
df_bow = pd.DataFrame(bow)

In [111]:
df_bow

Unnamed: 0,aback,abandon,abandoned,abbreviated,abdomen,abed,ablaze,able,ablutions,aboard,...,yonson,young,your,yours,yourself,youth,yted,zealand,zephyr,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4989,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4990,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4991,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Logistic regression

In [112]:
logreg_manual = LogisticRegression()
logreg_manual.fit(df_bow ,y_train)

LogisticRegression()

In [113]:
# Accuracy on the same rows the model was fitted on (no held-out data),
# so this is an optimistic figure rather than a generalization estimate.
logreg_manual.score(df_bow, y_train)

0.930702984177849

In [114]:
# Tokenize the test sentences: keep runs of at least 4 ASCII letters, lower-cased.
# (The previous pattern had a stray `+` inside the character class, which made
# literal plus signs count as word characters.)
X_test_wordlists = [
    [word.lower() for word in re.findall(r'[A-Za-z]{4,}', sentence)]
    for sentence in test_texts
]

In [2]:
X_test = [calculate_bag_of_words(wordset, sentence) for sentence in X_test_wordlists]

NameError: name 'X_test_wordlists' is not defined

In [1]:
X_test

NameError: name 'X_test' is not defined

In [117]:
X_test = pd.DataFrame(X_test)

In [118]:
X_test.head()

Unnamed: 0,aback,abandon,abandoned,abbreviated,abdomen,abed,ablaze,able,ablutions,aboard,...,yonson,young,your,yours,yourself,youth,yted,zealand,zephyr,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [119]:
logreg_manual.predict(X_test)

array([1, 0, 1, 0, 1])

In [120]:
logreg_manual.predict_proba(X_test)

array([[0.00441198, 0.99558802],
       [0.65968954, 0.34031046],
       [0.07079281, 0.92920719],
       [0.95427309, 0.04572691],
       [0.07370315, 0.92629685]])

## Sklearn CountVectorizer

In [127]:
# Bag of words via scikit-learn: lower-case the text, drop English stop words,
# treat every run of ASCII letters as a token, and use single words (1-grams) only.
vectorizer = CountVectorizer(
    lowercase=True,
    stop_words='english',
    token_pattern='[A-Za-z]+',
    ngram_range=(1, 1),
)
X_cv = vectorizer.fit_transform(corpus)

# Dense DataFrame view of the fitted matrix, one column per vocabulary term.
feature_names = vectorizer.get_feature_names_out()
df_bow_sklearn = pd.DataFrame(X_cv.toarray(), columns=feature_names)
df_bow_sklearn.head()

Unnamed: 0,aback,abandon,abandoned,abbreviated,abdomen,abed,ablaze,able,ablutions,aboard,...,yokohama,yon,yonder,yonson,young,youth,yted,zealand,zephyr,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [128]:
X_test = vectorizer.transform(test_texts)
df_test = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names_out())

In [129]:
logreg_cv = LogisticRegression()

In [130]:
logreg_cv.fit(df_bow_sklearn, y_train)

LogisticRegression()

In [131]:
# Accuracy on the same rows the model was fitted on (no held-out data),
# so this is an optimistic figure rather than a generalization estimate.
logreg_cv.score(df_bow_sklearn, y_train)

0.9319046665331464

In [132]:
logreg_cv.predict(df_test)

array([1, 0, 1, 0, 1])

In [133]:
logreg_cv.predict_proba(df_test)

array([[0.01705228, 0.98294772],
       [0.84101127, 0.15898873],
       [0.10964161, 0.89035839],
       [0.72803982, 0.27196018],
       [0.04451704, 0.95548296]])

## TF-IDF
### Term frequency
### Inverse document frequency
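$$\text{tf-idf}(t_i,d_j) = \frac{f_{t_i,d_j}}{\sum_{t'\in d_j}f_{t',d_j}} \cdot \Bigl(\log\Bigl[\frac{N+1}{n_{t_i} + 1}\Bigr]+1\Bigr)$$

where $f_{t,d}$ is the number of times term $t$ occurs in document $d$, $N$ is the total number of documents, and $n_{t_i}$ is the number of documents that contain term $t_i$.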

In [134]:
tfidf = TfidfVectorizer(lowercase=True, stop_words='english')

In [135]:
X_tf = tfidf.fit_transform(corpus)
df_tf = pd.DataFrame(X_tf.toarray(), columns=tfidf.get_feature_names_out())

In [136]:
X_test = tfidf.transform(test_texts)
df_test = pd.DataFrame(X_test.toarray(), columns=tfidf.get_feature_names_out())

In [137]:
logreg_tfidf = LogisticRegression()

In [138]:
logreg_tfidf.fit(df_tf, y_train)

LogisticRegression()

In [139]:
logreg_tfidf.predict(df_test)

array([1, 0, 1, 1, 1])

In [140]:
logreg_tfidf.predict_proba(df_test)

array([[0.15857505, 0.84142495],
       [0.56891604, 0.43108396],
       [0.2778443 , 0.7221557 ],
       [0.49659482, 0.50340518],
       [0.16212461, 0.83787539]])