Skip to content

Commit

Permalink
Remove scikit-learn dependency from text_to_bow.py
Browse files: browse the repository at this point in the history
  • Loading branch information
Albert Aparicio authored and ofrei committed Mar 15, 2018
1 parent 10ac6ed commit 209d29b
Showing 1 changed file with 8 additions and 11 deletions.
19 changes: 8 additions & 11 deletions utils/text_to_bow.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,8 +6,10 @@

import argparse
import os
import re
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from six import iteritems


def main(args):
Expand All @@ -19,22 +21,17 @@ def main(args):
with open(args.fname, 'r')as file:
rawdata = file.read().replace('\n', '')

# instantiate the parser and feed it some HTML
vectorizer = CountVectorizer()

x = vectorizer.fit_transform([rawdata])

vocabulary = vectorizer.get_feature_names()
count_vector = x.toarray()
bagofwords = Counter(re.findall(r'\w+', rawdata.lower()))
vocabulary = sorted(bagofwords.keys())

# Initialize Bag-of-words data
bow = [
str(1), # D - Number of documents
str(len(vocabulary)), # W - Number of words in vocabulary
str(count_vector.sum()), # NNZ - Number of words in documents
str(len(bagofwords)), # W - Number of words in vocabulary
str(sum(bagofwords.values())), # NNZ - Number of words in documents
]

for word, count in zip(vocabulary, count_vector.squeeze()):
for word, count in iteritems(bagofwords):
bow.append('{} {} {}'.format(1, 1 + vocabulary.index(word), count))

# Save docfile and vocabulary
Expand Down

0 comments on commit 209d29b

Please sign in to comment.