In [1]:
import re
import nltk
from nltk.tokenize import RegexpTokenizer
import spacy
from random import shuffle
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import math

# Load the spaCy "en_core_web_sm" pipeline once for the whole notebook.
# NOTE(review): `nlp` is not referenced by any cell visible in this excerpt —
# confirm it is used further down before keeping this (slow) load.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Read the raw text the sample sentences are drawn from.
with open("sample text.txt","r") as f:
    content = f.read()

# Split into sentences, shuffle in place five times, then keep the first ten.
# NOTE(review): no random seed is set, so the sample differs on every run.
corpus = nltk.sent_tokenize(content)
for shuffle_round in range(5):
    shuffle(corpus)
corpus = corpus[:10]

In [3]:
# clean sents and tokenize
def bag_of_words(s):
    """Split ``s`` into lower-cased word tokens.

    A token is a maximal run of ``\\w`` characters (letters, digits and
    underscore), so punctuation, dashes, apostrophes and whitespace act as
    separators and never appear inside a token ("Grace's" -> "grace", "s").

    Args:
        s: The sentence (or any string) to tokenize.

    Returns:
        A list of lower-cased tokens in their order of appearance; empty if
        ``s`` contains no word characters.
    """
    # The pattern must be a raw string: "\w" in a plain literal is an invalid
    # escape sequence and triggers a SyntaxWarning on recent Python versions.
    # re.findall with this pattern yields the same tokens as a non-gap
    # RegexpTokenizer built from it.
    return [
        token.lower()
        for token in re.findall(r"\w+", s)
        # The only \w+ token that is a substring of string.punctuation is a
        # lone "_", so this filter drops free-standing underscores.
        if token not in string.punctuation
    ]
    

In [4]:
def get_vocab(all_text):
    """Return the sorted list of distinct tokens found in ``all_text``.

    Every element of ``all_text`` is tokenized with ``bag_of_words``; the
    tokens are de-duplicated and returned in ascending string order.
    """
    distinct_tokens = set()
    for sentence in all_text:
        distinct_tokens.update(bag_of_words(sentence))
    return sorted(distinct_tokens)

In [5]:
def get_count_vect(s, vocab):
    """Count how often each vocabulary word occurs in ``s``.

    Args:
        s: The sentence to tokenize with ``bag_of_words``.
        vocab: Sequence of (hashable) vocabulary words; position ``i`` of the
            result corresponds to ``vocab[i]``.

    Returns:
        A list of ints with ``len(vocab)`` entries. Tokens that are not in
        ``vocab`` are ignored. If a word appears more than once in ``vocab``
        only its first position is incremented.
    """
    # Build a word -> first index lookup once instead of scanning the whole
    # vocabulary for every token; setdefault keeps the first occurrence.
    first_index = {}
    for position, word in enumerate(vocab):
        first_index.setdefault(word, position)

    counts = [0] * len(vocab)
    for token in bag_of_words(s):
        position = first_index.get(token)
        if position is not None:
            counts[position] += 1
    return counts

In [6]:
def get_tf(sent, unique_words):
    """Compute the term-frequency vector of ``sent`` over ``unique_words``.

    Each entry is the count of the corresponding vocabulary word in ``sent``
    divided by the total number of tokens in ``sent`` (tokens missing from
    ``unique_words`` still count towards that total).

    Args:
        sent: The sentence to score.
        unique_words: The vocabulary; one output entry per word.

    Returns:
        A list of floats with ``len(unique_words)`` entries. If ``sent``
        produces no tokens at all, every entry is ``0.0``.
    """
    counts = get_count_vect(sent, unique_words)
    n_tokens = len(bag_of_words(sent))
    # A sentence with no word tokens (e.g. only punctuation) would otherwise
    # divide by zero; its term frequencies are all zero.
    if n_tokens == 0:
        return [0.0] * len(counts)
    return [count / n_tokens for count in counts]
                

In [7]:
def get_word_freq(unique_words, all_text):
    """Count total occurrences of each vocabulary word across ``all_text``.

    Note that this counts every occurrence of a word, not the number of
    sentences that contain it.

    Args:
        unique_words: Iterable of vocabulary words; each gets an entry even
            if it never occurs.
        all_text: Iterable of sentences, each tokenized with ``bag_of_words``.

    Returns:
        A list of ``(word, count)`` tuples sorted by word. Tokens that are not
        in ``unique_words`` are ignored.
    """
    word_freq = dict.fromkeys(unique_words, 0)
    for sent in all_text:
        for token in bag_of_words(sent):
            # Skip out-of-vocabulary tokens instead of raising KeyError,
            # matching how get_count_vect treats them.
            if token in word_freq:
                word_freq[token] += 1
    return sorted(word_freq.items(), key=lambda item: item[0])

In [8]:
def get_idf(word_freq, sent, n_docs=None):
    """Compute a smoothed IDF-style weight for every entry of ``word_freq``.

    The weight of a word with count ``c`` is ``log((N + 1) / (c + 1)) + 1``.

    Args:
        word_freq: Sequence of ``(word, count)`` pairs, e.g. the output of
            ``get_word_freq``.
        sent: Sentence whose token count is used as ``N`` when ``n_docs`` is
            not given (the original behaviour).
        n_docs: Optional explicit value for ``N``. Conventional IDF uses the
            number of documents in the corpus here; pass it to get weights
            that do not depend on the length of ``sent``.

    Returns:
        A list of ``(word, idf_value)`` tuples in the same order as
        ``word_freq``.

    NOTE(review): the counts are used as given. Conventional IDF expects
    document frequencies rather than total occurrence counts — confirm what
    the caller passes in.
    """
    if n_docs is None:
        # Backward-compatible default: N is the number of tokens in ``sent``.
        n_docs = len(bag_of_words(sent))
    return [
        (entry[0], math.log((n_docs + 1) / (entry[1] + 1)) + 1)
        for entry in word_freq
    ]

In [9]:
# Term frequencies: build the shared vocabulary first, then one TF vector
# per sampled sentence, and show the sentences one per line.
vocabulary = get_vocab(corpus)
corpus_tf = [get_tf(sentence, vocabulary) for sentence in corpus]
print("\n".join(corpus))

"That is remarkable—most remarkable," said Holmes, whose interest in the case seemed to be rising.
When I saw that little empty quiver beside the small bird-bow, it was just what I expected to see.
There is a lull at present.
"Because it was in my mind to put that little test which answered so admirably.
You know that I have a quick eye for faces, Holmes.
His Grace's ma might overlook the age, but a big scandal would be a different matter, so it is imperative—Ah!
But the man lost his nerve.
I fancied that you must have heard me when I displaced the figure, but luck was on my side.
Amberley excelled at chess—one mark, Watson, of a scheming mind.
The idea of food did not occur to me at the moment."


In [10]:
# Show the TF vectors, one sentence per line.
print("\n".join(map(str, corpus_tf)))

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0625, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0625, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0625, 0.0, 0.0, 0.0, 0.0625, 0.0625, 0.0625, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0625, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.125, 0.0625, 0.0, 0.0625, 0.0, 0.0, 0.0, 0.0, 0.0625, 0.0, 0.0, 0.0, 0.0, 0.0625, 0.0625, 0.0, 0.0625, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0625, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05, 0.0, 0.05, 0.05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05, 0.0, 0.05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05, 0.05, 0.0, 0.05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05, 0.0, 0.0, 0.0, 0.0, 0.05, 0.0, 0.0, 0.05, 0.0, 0.0, 0.05, 0.0, 0.0, 0.05, 0.05, 0.0, 0.05, 0.05, 0.0, 0.05, 0.05, 0.0, 0.0, 0.0, 0.0]
[0.166

In [11]:
# Show the vocabulary; position i here lines up with position i of each TF vector.
print(vocabulary)

['a', 'admirably', 'age', 'ah', 'amberley', 'answered', 'at', 'be', 'because', 'beside', 'big', 'bird', 'bow', 'but', 'case', 'chess', 'did', 'different', 'displaced', 'empty', 'excelled', 'expected', 'eye', 'faces', 'fancied', 'figure', 'food', 'for', 'grace', 'have', 'heard', 'his', 'holmes', 'i', 'idea', 'imperative', 'in', 'interest', 'is', 'it', 'just', 'know', 'little', 'lost', 'luck', 'lull', 'ma', 'man', 'mark', 'matter', 'me', 'might', 'mind', 'moment', 'most', 'must', 'my', 'nerve', 'not', 'occur', 'of', 'on', 'one', 'overlook', 'present', 'put', 'quick', 'quiver', 'remarkable', 'rising', 's', 'said', 'saw', 'scandal', 'scheming', 'see', 'seemed', 'side', 'small', 'so', 'test', 'that', 'the', 'there', 'to', 'was', 'watson', 'what', 'when', 'which', 'whose', 'would', 'you']


In [12]:
# Total occurrences of every vocabulary word across the sampled sentences,
# as a list of (word, count) tuples sorted by word.
frequency_word = get_word_freq(vocabulary, corpus)
print(frequency_word)

[('a', 5), ('admirably', 1), ('age', 1), ('ah', 1), ('amberley', 1), ('answered', 1), ('at', 3), ('be', 2), ('because', 1), ('beside', 1), ('big', 1), ('bird', 1), ('bow', 1), ('but', 3), ('case', 1), ('chess', 1), ('did', 1), ('different', 1), ('displaced', 1), ('empty', 1), ('excelled', 1), ('expected', 1), ('eye', 1), ('faces', 1), ('fancied', 1), ('figure', 1), ('food', 1), ('for', 1), ('grace', 1), ('have', 2), ('heard', 1), ('his', 2), ('holmes', 2), ('i', 5), ('idea', 1), ('imperative', 1), ('in', 2), ('interest', 1), ('is', 3), ('it', 3), ('just', 1), ('know', 1), ('little', 2), ('lost', 1), ('luck', 1), ('lull', 1), ('ma', 1), ('man', 1), ('mark', 1), ('matter', 1), ('me', 2), ('might', 1), ('mind', 2), ('moment', 1), ('most', 1), ('must', 1), ('my', 2), ('nerve', 1), ('not', 1), ('occur', 1), ('of', 2), ('on', 1), ('one', 1), ('overlook', 1), ('present', 1), ('put', 1), ('quick', 1), ('quiver', 1), ('remarkable', 2), ('rising', 1), ('s', 1), ('said', 1), ('saw', 1), ('scandal

In [13]:
# One list of (word, idf) tuples per sentence: get_idf uses the token count of
# the sentence it is given in its formula, so the values differ per sentence.
idf_values = [get_idf(frequency_word, s) for s in corpus]
print(idf_values)

[[('a', 2.041453874828161), ('admirably', 3.1400661634962708), ('age', 3.1400661634962708), ('ah', 3.1400661634962708), ('amberley', 3.1400661634962708), ('answered', 3.1400661634962708), ('at', 2.4469189829363254), ('be', 2.734601055388106), ('because', 3.1400661634962708), ('beside', 3.1400661634962708), ('big', 3.1400661634962708), ('bird', 3.1400661634962708), ('bow', 3.1400661634962708), ('but', 2.4469189829363254), ('case', 3.1400661634962708), ('chess', 3.1400661634962708), ('did', 3.1400661634962708), ('different', 3.1400661634962708), ('displaced', 3.1400661634962708), ('empty', 3.1400661634962708), ('excelled', 3.1400661634962708), ('expected', 3.1400661634962708), ('eye', 3.1400661634962708), ('faces', 3.1400661634962708), ('fancied', 3.1400661634962708), ('figure', 3.1400661634962708), ('food', 3.1400661634962708), ('for', 3.1400661634962708), ('grace', 3.1400661634962708), ('have', 2.734601055388106), ('heard', 3.1400661634962708), ('his', 2.734601055388106), ('holmes', 2.