# Build a model 2.0



In [None]:
import gzip
import json
import spacy
import string
import pandas as pd
import re

from collections import Counter
from joblib import Parallel, delayed

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn import metrics

### Word2Vec Model
from gensim.models import Word2Vec

## Read functions

In [None]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

## Clean and preprocess functions

In [3]:
def cleaner(df):
    "Extract relevant text from DataFrame using a regex"
    # Regex pattern for only alphanumeric, hyphenated text with 3 or more chars
    pattern = re.compile(r"[A-Za-z0-9\-]{3,50}")
    df['clean'] = df['reviewText'].str.findall(pattern).str.join(' ')
    return df

def lemmatize_pipe(doc):
    lemma_list = [str(tok.lemma_).lower() for tok in doc
                  if tok.is_alpha and tok.text.lower() not in stopwords] 
    return lemma_list

def preprocess_pipe(texts):
    preproc_pipe = []
    for doc in nlp.pipe(texts, batch_size=20):
        preproc_pipe.append(lemmatize_pipe(doc))
    return preproc_pipe

def chunker(iterable, total_length, chunksize):
    return (iterable[pos: pos + chunksize] for pos in range(0, total_length, chunksize))

def flatten(list_of_lists):
    "Flatten a list of lists to a combined list"
    return [item for sublist in list_of_lists for item in sublist]

def process_chunk(texts):
    preproc_pipe = []
    for doc in nlp.pipe(texts, batch_size=20):
        preproc_pipe.append(lemmatize_pipe(doc))
    return preproc_pipe

def preprocess_parallel(texts, chunksize=100):
    executor = Parallel(n_jobs=7, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk)
    tasks = (do(chunk) for chunk in chunker(texts, len(df_clean), chunksize=chunksize))
    result = executor(tasks)
    return flatten(result)

## Corpus and one-hot

In [4]:
n_limit = 100000
set_chunksize = 500

stopwords = text.ENGLISH_STOP_WORDS
nlp = spacy.load("en_core_web_sm")

raw_df = getDF('../../../../data/Grocery_and_Gourmet_Food_5.json.gz')
raw_df = raw_df.dropna(subset = ["reviewText"])

df_limit = raw_df.head(n_limit)
df_clean = cleaner(df_limit)

df_clean['text'] = preprocess_parallel(df_clean['clean'], chunksize=set_chunksize)

df_clean = df_clean[["overall", "text"]]

df_clean.head(3)
# loc[row_indexer,col_indexer]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,overall,text
0,5.0,"[adverse, comment]"
1,5.0,"[gift, college, student]"
2,5.0,"[like, strong, tea, little, strong]"


In [None]:
# df = getDF('../../../../data/Grocery_and_Gourmet_Food_5.json.gz')
# df.head(3)

In [None]:
# # Nora's code:
# df=df[["reviewText"]]

# df["split_reviewText"]=df["reviewText"].str.lower()
# df=df[["split_reviewText"]]

# rem=string.punctuation
# pattern = r"[{}]".format(rem)
# df["split_reviewText"]=df["split_reviewText"].str.replace(pattern, '')

# df["split_reviewText"]=df.split_reviewText.str.split(" ")

# # Nora's code:
# df_clean = df[["reviewText"]]
# df_clean = cleaner(df_clean)

# df_clean["split_reviewText"]=df_clean["reviewText"].str.lower()
# df_clean = df_clean[["split_reviewText"]]

# rem = string.punctuation
# pattern = r"[{}]".format(rem)
# df_clean["split_reviewText"]=df_clean["split_reviewText"].str.replace(pattern, '')

# df_clean["split_reviewText"]=df_clean.split_reviewText.str.split(" ")
# df_clean.head(3)

In [None]:
# df_clean['split_reviewText']
# df_clean.dtypes

In [5]:
# Create the list of list format of the custom corpus for gensim modeling 
# sent = [row for row in df_clean['split_reviewText']]
sent = list(df_clean['text'])

In [12]:
sent[0:5]

[['adverse', 'comment'],
 ['gift', 'college', 'student'],
 ['like', 'strong', 'tea', 'little', 'strong'],
 ['love',
  'tea',
  'flavor',
  'way',
  'well',
  'regular',
  'lipton',
  'black',
  'tea',
  'definetly',
  'worth',
  'money'],
 ['search',
  'browse',
  'amazon',
  'tea',
  'lipton',
  'sell',
  'grocery',
  'store',
  'shelf',
  'stuff',
  'purchase',
  'just',
  'awful',
  'near',
  'good',
  'remember']]

In [7]:
model = Word2Vec(sent, min_count=1,vector_size= 50,workers=3, window =5, sg = 1) 

In [20]:
# model("tea")
vector = model.wv['cookie']
print(vector)
sims = model.wv.most_similar('cookie', topn=10)
print(sims)
sims = model.wv.most_similar('cake', topn=10)
print(sims)
sims = model.wv.most_similar('chocolate', topn=10)
print(sims)

[-0.56914383  0.44229633 -0.08588842  0.00905657 -1.0971054   0.26600435
  0.6358879   0.11039567 -0.4812935  -0.2776121  -0.10902502 -0.6501509
 -0.08723716  0.68138844 -0.5863477  -0.16933496  0.21681567  0.32548955
 -0.12870976 -0.5939464  -0.28272468  0.44033     0.6807964  -0.75070465
  0.27004883 -0.32829216 -0.5868924   0.07159974  0.11104973  0.22327894
 -0.09788269 -0.01286444 -0.32635158 -0.36721665 -0.697452    1.0271446
 -0.0522331  -0.11789049  0.2700275  -0.49165687 -0.18336883  0.54660004
 -0.32295758  0.27195424  0.760365    0.14604013 -0.29991692 -0.40308306
  0.87348497  0.36599055]
[('cookies', 0.8529347777366638), ('shortbread', 0.8285030722618103), ('brownie', 0.8106359243392944), ('fudge', 0.7925254702568054), ('biscotti', 0.7817010283470154), ('pecan', 0.7776121497154236), ('murray', 0.774254322052002), ('walker', 0.7648388147354126), ('nutella', 0.7632539868354797), ('choc', 0.7627608180046082)]
[('cupcake', 0.8975176811218262), ('icing', 0.8860847353935242), ('

In [21]:
model.wv.most_similar('cake')[:5]

[('cupcake', 0.8975176811218262),
 ('icing', 0.8860847353935242),
 ('frosting', 0.860694408416748),
 ('buttercream', 0.8543232083320618),
 ('frost', 0.8488162159919739)]

In [22]:
model.wv.most_similar('cookie')[:5]

[('cookies', 0.8529347777366638),
 ('shortbread', 0.8285030722618103),
 ('brownie', 0.8106359243392944),
 ('fudge', 0.7925254702568054),
 ('biscotti', 0.7817010283470154)]

In [23]:
model.wv.similarity("cookie", "biscuit")

0.74429965

In [24]:
model.wv.similarity("cookie", "cake")

0.7183425

In [25]:
model.wv.similarity("biscuit", "bread")

0.64192384

In [28]:
vector = model.wv['cookie']
model.wv.most_similar(model.wv['meat'] + model.wv['bread'])

[('bread', 0.8836270570755005),
 ('meatloaf', 0.8595864176750183),
 ('meat', 0.8476506471633911),
 ('pizza', 0.831365168094635),
 ('loafs', 0.8200547099113464),
 ('loaf', 0.8178911209106445),
 ('meatball', 0.817259669303894),
 ('focaccia', 0.8139068484306335),
 ('breadcrumb', 0.812077522277832),
 ('seitan', 0.8052205443382263)]

In [29]:
model.wv.most_similar(model.wv['cake'] + model.wv['icing'])

[('cake', 0.9711697697639465),
 ('icing', 0.9710375666618347),
 ('cupcake', 0.9200991988182068),
 ('frosting', 0.9078121781349182),
 ('frost', 0.9023512601852417),
 ('buttercream', 0.8940775990486145),
 ('velvet', 0.8690030574798584),
 ('decoration', 0.8613044023513794),
 ('decorate', 0.8516001105308533),
 ('fondant', 0.8395641446113586)]

In [None]:
# all_words = [item for sublist in df_clean['text'] for item in sublist]
# corpus = set(all_words)
# # word_to_ix = {word: i for i, word in enumerate(corpus)}

# n_corpus = len(corpus)

# all_word_counter = Counter(all_words)
  
# # most_common() produces k frequently encountered
# # input values and their respective counts.
# most_occur = all_word_counter.most_common(10)
  
# print(most_occur)
# print(f"Size of corpus: {n_corpus}")


In [None]:
# def word_list_to_vec(word_list):
#     'return one-hot encoding of word list'
#     vec = [0] * n_corpus
#     for word in word_list:
#         vec[word_to_ix[word]] = 1
#     return vec

In [None]:
# Bag-of-words is available in in the sklearn open source library

vectorizer = CountVectorizer(lowercase=True)


In [None]:

df_clean['reviewText'] = df_clean.text.apply(lambda x: ' '.join(x))

df_clean.head(5)

In [None]:
bag_of_words = vectorizer.fit_transform(df_clean['reviewText'])
# bag_of_words = vectorizer.fit_transform(lambda x: ''.join(df_clean['text'])

In [None]:
# print(feature_names)
print(bag_of_words)

In [None]:
feature_names = vectorizer.get_feature_names()
print(len(feature_names)) 
bow_df = pd.DataFrame(bag_of_words.toarray(), columns=feature_names)

In [None]:
display(bow_df.head(3))
print(bow_df.dim)
print(bow_df.max().max())

In [None]:
# cosine_similarity generates a 2D array representing the cosine similarity from 0-1 between each text
similarity = cosine_similarity(bag_of_words)
# finding the max cosine similarity value in the resulting array, there are a number of built in numpy functions to get the max value but the trick is skipping the n, n index because they will always be 1
max_value = 0
max_x = 0
max_y = 0
for each_row in range(len(similarity)): 
    for each_column in range(len(similarity[0])): 
        if similarity[each_row][each_column] > max_value and each_row!=each_column:
            max_value = similarity[each_row][each_column]
            max_y = each_row
            max_x = each_column

In [None]:
print(max_value, max_x, max_y)
print(df.iloc[max_x])
print(df.iloc[max_y])

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer()


# Not sure about best way to use one-hot vector sv create sparese train and test matrix. Hmm.
x_train, x_test, y_train, y_test = train_test_split(df_clean['text'].tolist(), df_clean['overall'].tolist())


In [None]:
lr = LogisticRegression(C=100.0, random_state=1, solver='lbfgs', multi_class='ovr')

In [None]:
# Train model? Predictive vectors and then test?
# https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py

