In [1]:
import dask.bag as bag
import os

In [2]:
raw_text = bag.read_text("foods.txt",encoding='cp1252')

In [3]:
raw_text.count().compute()

5116093

In [4]:
from dask.delayed import delayed

In [5]:
def get_next_buffer_part(file,start_index,span_index=0,blocksize=1000):
    file.seek(start_index)
    buffer = file.read(blocksize + span_index).decode('cp1252')
    delimeter_position = buffer.find('\n\n')
    if delimeter_position == -1:
        return get_next_buffer_part(file,start_index,span_index+blocksize)
    else:
        file.seek(start_index)
        return start_index,delimeter_position

In [6]:
with open("foods.txt","rb") as file_handle:
    size = file_handle.seek(0,2) - 1       #Get the total size of the file in bytes
    more_data = True                     
    output = list()
    current_position = next_position = 0
    while more_data:
        if current_position >= size:
            more_data = False
        else:
            current_position,next_position = get_next_buffer_part(file_handle,current_position,0)
            output.append((current_position,next_position))
            current_position = current_position + next_position + 2

In [7]:
def get_dict_item(filename,start_index,delimeter_position,encoding='cp1252'):
    with open(filename,"rb") as file_handle:
        file_handle.seek(start_index)
        text = file_handle.read(delimeter_position).decode(encoding)
        elements = text.strip().split("\n")
        key_value_pairs = [(element.split(": ")[0], element.split(": ")[1])
                          if len(element.split(": ")) > 1
                          else ("unknown",element)
                          for element in elements]
        return dict(key_value_pairs)

In [8]:
reviews = bag.from_sequence(output).map(lambda x: get_dict_item("foods.txt",x[0],x[1]))

In [9]:
def fetch_scores(element):
    numeric_score = float(element['review/score'])
    return numeric_score

In [10]:
review_scores = reviews.map(fetch_scores)

In [11]:
def tag_reviews(element):
    if float(element['review/score']) > 3:
        element['review/score'] = 'pos'
    else:
        element['review/score'] = 'neg'
    return element

In [12]:
reviews = reviews.map(tag_reviews)

In [13]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import re
import nltk
from contractions import contractions_dict
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from itertools import filterfalse
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

In [14]:
def text_tokenization(x):
    x['review/text'] = word_tokenize(x['review/text'])
    return x

In [15]:
tokenized_reviews = reviews.map(text_tokenization)

In [16]:
def normalize_tokens(review):
    review['review/text'] =  [x.lower() for x in review['review/text']]
    return review

In [17]:
normalized_reviews = tokenized_reviews.map(normalize_tokens)

In [18]:
def contracted_word_expansion(token):
    if token in contractions_dict.keys():
        return contractions_dict[token]
    else:
        return token

In [19]:
def contractions_expansion(review):
    review['review/text'] = list(map(contracted_word_expansion,review['review/text']))
    return review

In [20]:
contracted_reviews = normalized_reviews.map(contractions_expansion)

In [21]:
regex = r'^@[a-zA-z0-9]|^#[a-zA-Z0-9]|\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*|\W+|\d+|<("[^"]*"|\'[^\']*\'|[^\'">])*>|_+|[^\u0000-\u007f]+'

In [22]:
def waste_word_or_not(token):
    return re.search(regex,token)

In [23]:
def filter_waste_words(review):
    review['review/text'] = list(filterfalse(waste_word_or_not,review['review/text']))
    return review

In [24]:
filtered_reviews = contracted_reviews.map(filter_waste_words)

In [25]:
def split(review):
    review['review/text'] = list(map(lambda x: re.split(regex,x)[0],review['review/text']))
    return review

In [26]:
filtered_reviews = filtered_reviews.map(split)

In [27]:
en_stop_words = list(set(stopwords.words('english')).union(set(STOP_WORDS)))

In [28]:
def is_stopword(token):
    return not(token in en_stop_words or re.search(r'\b\w\b|[^\u0000-\u007f]+|_+|\W+',token))

In [29]:
def stopwords_removal(review):
    review['review/text'] = list(filter(is_stopword,review['review/text']))
    return review

In [30]:
without_stopwords_reviews = filtered_reviews.map(stopwords_removal)

In [31]:
def get_wnet_pos_tag(treebank_tag):
    wn.ensure_loaded()
    if treebank_tag[1].startswith('J'):
        return (treebank_tag[0],wn.ADJ)
    elif treebank_tag[1].startswith('V'):
        return (treebank_tag[0],wn.VERB)
    elif treebank_tag[1].startswith('N'):
        return (treebank_tag[0],wn.NOUN)
    elif treebank_tag[1].startswith('R'):
        return (treebank_tag[0],wn.ADV)
    else:
        return (treebank_tag[0],wn.NOUN)

In [32]:
def get_pos_tag(review):
    wn.ensure_loaded()
    review['review/text'] = list(map(get_wnet_pos_tag,pos_tag(review['review/text'])))
    return review

In [33]:
tagged_reviews = without_stopwords_reviews.map(get_pos_tag)

In [34]:
lemmatizer = WordNetLemmatizer()
wn.ensure_loaded()

In [35]:
def token_lemmatization(token_pos_tuple):
    wn.ensure_loaded()
    if token_pos_tuple == None:
        return ""
    else:
        return lemmatizer.lemmatize(word=token_pos_tuple[0],pos=token_pos_tuple[1])

In [36]:
def lemmatization(review):
    wn.ensure_loaded()
    if len(review['review/text']) > 0:
        review['review/text'] = list(map(token_lemmatization,review['review/text']))
    else:
        review['review/text'] = [""]
    return review

In [37]:
lemmatized_reviews = tagged_reviews.map(lemmatization)

In [38]:
def extract_tokens(review):
    return review['review/text']

In [39]:
extracted_tokens = lemmatized_reviews.map(extract_tokens)

In [40]:
unique_tokens = extracted_tokens.flatten().distinct()

In [41]:
from dask.diagnostics import ProgressBar

In [42]:
with ProgressBar():
    number_of_tokens = unique_tokens.count().compute()

[########################################] | 100% Completed |  5min 47.3s


In [43]:
number_of_tokens

90435

In [44]:
unique_tokens

dask.bag<distinct-aggregate, npartitions=1>

In [45]:
with ProgressBar():
    tokens_index = list(unique_tokens)

[########################################] | 100% Completed |  5min 47.2s


In [46]:
from collections import Counter
from collections import OrderedDict

In [47]:
def compute_tf(review):
    D = dict(Counter(review))
    non_included = set(tokens_index).difference(set(D.keys()))
    D_prime = dict(zip(non_included,list(np.zeros(len(non_included)))))
    D_prime.update(D)
    full_D = dict(OrderedDict(sorted(D_prime.items())))
    print(full_D)
    return np.array(full_D.values())

In [48]:
tf_vectors = extracted_tokens.map(compute_tf)

In [49]:
from dask.distributed import Client

In [50]:
client = Client()

In [51]:
from dask import array as dask_array

In [52]:
def stacker(partition):
    return dask_array.concatenate([element for element in partition])

In [53]:
tf_vector_data = tf_vectors.map(lambda x: dask_array.from_array(x).reshape(1,-1)).reduction(
        perpartition=stacker,aggregate=stacker)
tf_vector_data = tf_vector_data.compute()
progress(tf_vector_data)

AttributeError: 'WordNetCorpusReader' object has no attribute '_LazyCorpusLoader__args'

