# 1. Bag of Words

In [1]:
import numpy as np

In [3]:
texts = ['i hate you','i love you','i really hate you','i like you']

In [4]:
vocabulary = list(set(' '.join(texts).split()))
vocabulary

['love', 'really', 'like', 'you', 'i', 'hate']

In [5]:
X = np.zeros((len(texts),len(vocabulary)))
for no, i in enumerate(texts):
    for text in i.split():
        X[no, vocabulary.index(text)] += 1

In [6]:
X

array([[0., 0., 0., 1., 1., 1.],
       [1., 0., 0., 1., 1., 0.],
       [0., 1., 0., 1., 1., 1.],
       [0., 0., 1., 1., 1., 0.]])

# 2.  Term Frequency–Inverse Document Frequency,TFIDF

In [7]:
import numpy as np

In [9]:
texts = ['i hate you','i love you','i really hate you','i like you']

In [10]:
vocabulary = list(set(' '.join(texts).split()))
vocabulary

['you', 'like', 'love', 'i', 'really', 'hate']

In [13]:
idf = {}
for i in vocabulary:
    idf[i] = 0
    for k in texts:
        if i in k.split():
            idf[i] += 1
    idf[i] = np.log(idf[i] / len(texts))

idf

{'hate': -0.6931471805599453,
 'i': 0.0,
 'like': -1.3862943611198906,
 'love': -1.3862943611198906,
 'really': -1.3862943611198906,
 'you': 0.0}

In [15]:
X = np.zeros((len(texts),len(vocabulary)))
for no, i in enumerate(texts):
    for text in i.split():
        X[no, vocabulary.index(text)] += 1
    for text in i.split():
        X[no, vocabulary.index(text)] = X[no, vocabulary.index(text)] * idf[text]
X

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.69314718],
       [ 0.        ,  0.        , -1.38629436,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        , -1.38629436,
        -0.69314718],
       [ 0.        , -1.38629436,  0.        ,  0.        ,  0.        ,
         0.        ]])

# 3. Hashing Vector

In [7]:
!pip install sklearn

Collecting sklearn
  Downloading https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Building wheels for collected packages: sklearn
  Running setup.py bdist_wheel for sklearn: started
  Running setup.py bdist_wheel for sklearn: finished with status 'done'
  Stored in directory: C:\Users\User2\AppData\Local\pip\Cache\wheels\76\03\bb\589d421d27431bcd2c6da284d5f2286c8e3b2ea3cf1594c074
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


In [9]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import LabelEncoder
import numpy as np

# early explanation check bow-tfidf.py, here only explain what is hashing vectorizer

example = [['i hate you', 'neg'],
		  ['i love you', 'pos'],
		  ['i really hate you', 'neg'],
		  ['i like you', 'pos']]

example_matrix = np.array(example)
unique_labels, unique_count = np.unique(example_matrix[:, 1], return_counts = True)
label_int = LabelEncoder().fit_transform(example_matrix[:, 1])
texts = example_matrix[:, 0].copy()

hash_counts = HashingVectorizer().fit_transform(texts)
print (np.unique((' '.join(texts.flatten().tolist())).split()))
print (hash_counts.shape)
# (4, 1048576)
# default n_features = 1048576
# you can change into small number, like 5
# hash_counts = HashingVectorizer(n_features = 5).fit_transform(texts)
# it is good to use hashing if your dictionary totally a huge number, then you can set smaller number than dictionary size
# but smaller number == more collision of features

# classifier(train = bag_counts_tdidf, label = label_int)

['hate' 'i' 'like' 'love' 'really' 'you']
(4, 1048576)


In [10]:
example_matrix

array([['i hate you', 'neg'],
       ['i love you', 'pos'],
       ['i really hate you', 'neg'],
       ['i like you', 'pos']], dtype='<U17')

In [14]:
unique_count

array([2, 2], dtype=int64)

In [16]:
print(hash_counts)

  (0, 550131)	-0.7071067811865475
  (0, 832412)	0.7071067811865475
  (1, 672777)	-0.7071067811865475
  (1, 832412)	0.7071067811865475
  (2, 120741)	0.5773502691896258
  (2, 550131)	-0.5773502691896258
  (2, 832412)	0.5773502691896258
  (3, 832412)	0.7071067811865475
  (3, 975831)	-0.7071067811865475


# 4. Bayes-Classifier

In [19]:
# to get f1 score
from sklearn import metrics
import numpy as np
import sklearn.datasets
import re
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split



In [20]:
# clear string
def clearstring(string):
    string = re.sub('[^A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string

# because of sklean.datasets read a document as a single element
# so we want to split based on new line
def separate_dataset(trainset):
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        data_ = trainset.data[i].split('\n')
        # python3, if python2, just remove list()
        data_ = list(filter(None, data_))
        for n in range(len(data_)):
            data_[n] = clearstring(data_[n])
        datastring += data_
        for n in range(len(data_)):
            datatarget.append(trainset.target[i])
    return datastring, datatarget

In [25]:
# you can change any encoding type
trainset = sklearn.datasets.load_files(container_path = 'local', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))

['adidas', 'apple']
3846
3846


In [26]:
# bag-of-word
bow = CountVectorizer().fit_transform(trainset.data)

#tf-idf, must get from BOW first
tfidf = TfidfTransformer().fit_transform(bow)

#hashing, default n_features, probability cannot divide by negative
hashing = HashingVectorizer(non_negative = True).fit_transform(trainset.data)



In [27]:
train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)

bayes_multinomial = MultinomialNB().fit(train_X, train_Y)
predicted = bayes_multinomial.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.9376623376623376
             precision    recall  f1-score   support

     adidas       0.94      0.90      0.92       322
      apple       0.93      0.96      0.95       448

avg / total       0.94      0.94      0.94       770



In [28]:
train_X, test_X, train_Y, test_Y = train_test_split(tfidf, trainset.target, test_size = 0.2)

bayes_multinomial = MultinomialNB().fit(train_X, train_Y)
predicted = bayes_multinomial.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.8987012987012987
             precision    recall  f1-score   support

     adidas       0.99      0.76      0.86       307
      apple       0.86      0.99      0.92       463

avg / total       0.91      0.90      0.90       770



In [29]:
train_X, test_X, train_Y, test_Y = train_test_split(hashing, trainset.target, test_size = 0.2)

bayes_multinomial = MultinomialNB().fit(train_X, train_Y)
predicted = bayes_multinomial.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.9103896103896104
             precision    recall  f1-score   support

     adidas       1.00      0.78      0.88       318
      apple       0.87      1.00      0.93       452

avg / total       0.92      0.91      0.91       770

