In [None]:
import numpy as np
import re
import spacy

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
def clean_text(text:str)->str:
    words = re.sub('\W', ' ', text.lower()).split()
    return [word for word in words if not word in nlp.Defaults.stop_words]
  

In [4]:
clean_text('The end is nie')

['end', 'nie']

In [15]:
def tokenize(sentences:list)->list:
    words = []

    for sent in sentences:
        clean_words = clean_text(sent)
        words.extend(clean_words)

    return list(set(words))

In [6]:
allsentences = ["Joe waited for the train",
                "The train was late",
                "Mary and Samantha took the bus",
               "I looked for Mary and Samantha at the bus station",
                "Mary and Samantha arrived at the bus station early but waited until noon for the bus"]

tokenize(allsentences)

['early',
 'arrived',
 'station',
 'joe',
 'samantha',
 'looked',
 'train',
 'noon',
 'waited',
 'late',
 'mary',
 'took',
 'bus']

In [55]:
def gen_bow(sentences:list):
    vocab = tokenize(sentences=sentences)
    print(vocab,'\n\n')
    for sentence in sentences:
        words = clean_text(sentence)    
        bag_vec = np.zeros(len(vocab))
        for word in words:
            indx = vocab.index(word)
            bag_vec[indx] += 1

            # print(f'{word}\n {bag_vec}')
            # print('---------------------------------------------------')
            # bag_vec[indx] = 0
        print(f'{sentence}\n{bag_vec}')
        print('---------------------------------------------------')

In [56]:
text = "Mary and Samantha arrived at the bus station early but waited until noon for the bus"

gen_bow([text])

['early', 'arrived', 'station', 'samantha', 'noon', 'waited', 'mary', 'bus'] 


Mary and Samantha arrived at the bus station early but waited until noon for the bus
[1. 1. 1. 1. 1. 1. 1. 2.]
---------------------------------------------------


In [57]:
gen_bow(sentences=allsentences)

['early', 'arrived', 'station', 'joe', 'samantha', 'looked', 'train', 'noon', 'waited', 'late', 'mary', 'took', 'bus'] 


Joe waited for the train
[0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0.]
---------------------------------------------------
The train was late
[0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0.]
---------------------------------------------------
Mary and Samantha took the bus
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1.]
---------------------------------------------------
I looked for Mary and Samantha at the bus station
[0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1.]
---------------------------------------------------
Mary and Samantha arrived at the bus station early but waited until noon for the bus
[1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 2.]
---------------------------------------------------


In [58]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(allsentences)

for i in range(len(allsentences)) : 
    print(allsentences[i])
    print(list(X.toarray()[i]))
    print('---------------------------------------------------')

Joe waited for the train
[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0]
---------------------------------------------------
The train was late
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1]
---------------------------------------------------
Mary and Samantha took the bus
[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0]
---------------------------------------------------
I looked for Mary and Samantha at the bus station
[1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0]
---------------------------------------------------
Mary and Samantha arrived at the bus station early but waited until noon for the bus
[1, 1, 1, 2, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 2, 0, 0, 1, 1, 0]
---------------------------------------------------
