# Build a model 2.0



In [2]:
import gzip
import json
import spacy
import pandas as pd
import re

from collections import Counter
from joblib import Parallel, delayed
from sklearn.feature_extraction import text


## Read functions

In [3]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## Clean and preprocess functions

In [7]:
def cleaner(df):
    "Extract relevant text from DataFrame using a regex"
    # Regex pattern for only alphanumeric, hyphenated text with 3 or more chars
    pattern = re.compile(r"[A-Za-z0-9\-]{3,50}")
    df['clean'] = df['reviewText'].str.findall(pattern).str.join(' ')
    return df

def lemmatize_pipe(doc):
    lemma_list = [str(tok.lemma_).lower() for tok in doc
                  if tok.is_alpha and tok.text.lower() not in stopwords] 
    return lemma_list

def preprocess_pipe(texts):
    preproc_pipe = []
    for doc in nlp.pipe(texts, batch_size=20):
        preproc_pipe.append(lemmatize_pipe(doc))
    return preproc_pipe

def chunker(iterable, total_length, chunksize):
    return (iterable[pos: pos + chunksize] for pos in range(0, total_length, chunksize))

def flatten(list_of_lists):
    "Flatten a list of lists to a combined list"
    return [item for sublist in list_of_lists for item in sublist]

def process_chunk(texts):
    preproc_pipe = []
    for doc in nlp.pipe(texts, batch_size=20):
        preproc_pipe.append(lemmatize_pipe(doc))
    return preproc_pipe

def preprocess_parallel(texts, chunksize=100):
    executor = Parallel(n_jobs=7, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk)
    tasks = (do(chunk) for chunk in chunker(texts, len(df_clean), chunksize=chunksize))
    result = executor(tasks)
    return flatten(result)

## Corpus and one-hot

In [9]:
n_limit = 1000
set_chunksize = 1000

stopwords = text.ENGLISH_STOP_WORDS
nlp = spacy.load("en_core_web_sm")

df = getDF('Grocery_and_Gourmet_Food_5.json.gz')
df_limit = df.head(n_limit)
df_clean = cleaner(df_limit)

df_clean['text'] = preprocess_parallel(df_clean['clean'], chunksize=1000)

df_clean = df_clean[["overall", "text"]]

df_clean.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,overall,text
0,5.0,"[adverse, comment]"
1,5.0,"[gift, college, student]"
2,5.0,"[like, strong, tea, little, strong]"


In [11]:
all_words = [item for sublist in df_clean['text'] for item in sublist]
corpus = set(all_words)
word_to_ix = {word: i for i, word in enumerate(corpus)}

n_corpus = len(corpus)

Counter = Counter(all_words)
  
# most_common() produces k frequently encountered
# input values and their respective counts.
most_occur = Counter.most_common(10)
  
print(most_occur)

[('good', 301), ('flavor', 290), ('use', 289), ('like', 254), ('great', 244), ('taste', 226), ('love', 185), ('gum', 180), ('buy', 160), ('tea', 159)]


In [12]:
word_to_ix

{'foam': 0,
 'powdered': 1,
 'original': 2,
 'bars': 3,
 'antihistamine': 4,
 'tanginess': 5,
 'reach': 6,
 'find': 7,
 'rs': 8,
 'sight': 9,
 'aromatic': 10,
 'precious': 11,
 'spelt': 12,
 'hand': 13,
 'static': 14,
 'deepness': 15,
 'genisoy': 16,
 'reliably': 17,
 'fingertip': 18,
 'aid': 19,
 'sense': 20,
 'sprayed': 21,
 'med': 22,
 'ready': 23,
 'swiss': 24,
 'dull': 25,
 'sunday': 26,
 'improve': 27,
 'manufacturing': 28,
 'bakery': 29,
 'trouble': 30,
 'interest': 31,
 'accomplish': 32,
 'automobile': 33,
 'immunity': 34,
 'loyal': 35,
 'combo': 36,
 'session': 37,
 'drop': 38,
 'funky': 39,
 'forever': 40,
 'part': 41,
 'delivery': 42,
 'nesco': 43,
 'portable': 44,
 'pink': 45,
 'relatively': 46,
 'quick': 47,
 'istanbul': 48,
 'hispanics': 49,
 'massive': 50,
 'impress': 51,
 'sweeten': 52,
 'mac': 53,
 'chain': 54,
 'noodle': 55,
 'wakes': 56,
 'introduce': 57,
 'halloween': 58,
 'bag': 59,
 'dough': 60,
 'hungry': 61,
 'potassium': 62,
 'wished': 63,
 'do': 64,
 'bubblegu

In [14]:
def word_list_to_vec(word_list):
    'return one-hot encoding of word list'
    vec = [0] * n_corpus
    for word in word_list:
        vec[word_to_ix[word]] = 1
    return vec

In [15]:
# Bag-of-words is available in in the sklearn open source library
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
vectorizer=CountVectorizer(lowercase=True)


In [16]:
bag_of_words=vectorizer.fit_transform(df_limit['reviewText'])

In [17]:
feature_names=vectorizer.get_feature_names()
bow_df=pd.DataFrame(bag_of_words.toarray(), columns=feature_names)



In [18]:
bow_df.head(3)

Unnamed: 0,00,000,09,0g,0mg,10,100,101,10x,11,...,yucky,yuk,yum,yummy,zero,zinc,zing,zone,zucchini,zuccini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# cosine_similarity generates a 2D array representing the cosine similarity from 0-1 between each text
from sklearn.metrics.pairwise import cosine_similarity
similarity=cosine_similarity(bag_of_words)
# finding the max cosine similarity value in the resulting array, there are a number of built in numpy functions to get the max value but the trick is skipping the n, n index because they will always be 1
max_value=0
max_x=0
max_y=0
for each_row in range(len(similarity)): 
    for each_column in range(len(similarity[0])): 
        if similarity[each_row][each_column]>max_value and each_row!=each_column:
            max_value=similarity[each_row][each_column]
            max_y=each_row
            max_x=each_column

In [20]:
print(max_value, max_x, max_y)

1.0000000000000004 997 996


In [31]:
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Not sure about best way to use one-hot vector sv create sparese train and test matrix. Hmm.
x_train, x_test, y_train, y_test = train_test_split(df_clean['text'].tolist(), df_clean['overall'].tolist())


In [13]:
lr = LogisticRegression(C=100.0, random_state=1, solver='lbfgs', multi_class='ovr')

In [None]:
# Train model? Predictive vectors and then test?
# https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py

