In [1]:
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")
import re
import sklearn
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [10]:
def data_preprocessing(path, test=False):
    #create appropriate file path
    if test == False:
        pfilename = path + "/product_training.json"
        rfilename = path + "/review_training.json"
    else:
        pfilename = path + "/product_test.json"
        rfilename = path + "/review_test.json"
    
    #extract files as pandas dataframes
    product_df = pd.read_json(pfilename)
    review_df = pd.read_json(rfilename).drop_duplicates(subset=["reviewerID", "unixReviewTime"], keep="first")
    
    review_df.drop(columns=["reviewerID","vote", "unixReviewTime","reviewTime","style","reviewerName","image"], axis=1 ,inplace=True)
    
    review_df['reviewText'].fillna("", inplace=True)
    review_df['summary'].fillna("", inplace=True)
    
    group = review_df.groupby("asin")
    
    review_group_df = pd.DataFrame(columns = ['asin', 'numReviews', 'percentVerified','reviewText','summaryText'])
    
    for asin, data in group:
        verifiedCount = data['verified'].sum()
        reviewCount = data['asin'].count()
        percentVerified = verifiedCount / reviewCount
        new_row = {'asin': asin, 'numReviews': reviewCount, 'percentVerified': percentVerified, 'reviewText': ' '.join(data['reviewText']), 'summaryText': ' '.join(data['summary'])} 
        review_group_df = review_group_df.append(new_row, ignore_index = True)
        
    return (review_group_df, product_df)

PSEUDOCODE FOR WHAT WE NEED TO DO WITH THE TRAINING DATA:
 - Saahir/Amy starts by putting all of the reviews for a product in one string. This string is the DOCUMENT for the reviews of a product
 - I will then take the corpus and transform it to remove stopwords, punctuation, and lemmatize everything. This is the true "bag of words".
 - Then, I use the cleaned text data to train the Naive Bayes classifier
 - When we get test data, first clean in the same way, then use it in the classifier

In [3]:
def get_stopwords():
    file = open('en.txt')
    stopwords = []
    for line in file:
        stopwords.append(line.rstrip())
    return stopwords

In [4]:
def transform_document(doc, remove_stopwords = True):
    new_doc = ""
    stopwords = get_stopwords()
    parsed_text = nlp(doc)
    for token in parsed_text:
        lemma = token.lemma_.lower()
        if re.match("[a-z0-9]+", lemma) and (remove_stopwords == False or lemma not in stopwords):
            new_doc += lemma + " "
    return new_doc.rstrip()

transform_document("First, you need to preprocess the raw text data. This may involve tasks like tokenizing the text (i.e., splitting it into individual words), removing stopwords, stemming or lemmatizing the words, and converting the text into a numerical format that can be used as input for the model. Then, you need to split the data into training and testing sets. The training set will be used to train the model, while the testing set will be used to evaluate its performance.")

'preprocess raw text datum involve task tokenize text i.e. split individual word remove stopword stem lemmatize word convert text numerical format input model split datum training testing set training set train model testing set evaluate performance'

In [5]:
def bag_of_words(review_text, remove_stopwords = True):
    word_bag = {}
    stopwords = get_stopwords()
    parsed_text = nlp(review_text)
    for token in parsed_text:
        lemma = token.lemma_.lower()
        if re.match("[a-z0-9]+", lemma) and (remove_stopwords == False or lemma not in stopwords):
            if lemma in word_bag:
                word_bag[lemma] += 1
            else:
                word_bag[lemma] = 1
    return word_bag

In [6]:
def vocabulary_from_corpus(corpus, remove_stopwords = True):
    vocab_set = set()
    for document in corpus:
        word_bag = bag_of_words(document, remove_stopwords)
        for word in word_bag.keys():
            vocab_set.add(word)
    return list(vocab_set)

def transform_document(doc, remove_stopwords = True):
    new_doc = ""
    stopwords = get_stopwords()
    parsed_text = nlp(doc)
    for token in parsed_text:
        lemma = token.lemma_.lower()
        if re.match("[a-z0-9]+", lemma) and (remove_stopwords == False or lemma not in stopwords):
            new_doc += lemma + " "
    return new_doc.rstrip()

vocabulary_from_corpus(['this is the first document', 'this document is the second document', 'and this is the third one', 'is this the first document'], False)

['first', 'this', 'third', 'one', 'the', 'and', 'document', 'second', 'be']

In [7]:
corpus = ["I am being handed a list of documents", "Each of these documents has several unique words", "The words will represent the class of each review", "I am also removing stopwords in order to make this make more sense"]
cleaned_corpus = [transform_document(doc) for doc in corpus]
vocabulary = vocabulary_from_corpus(cleaned_corpus, True)
pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)), ('tfid', TfidfTransformer())]).fit(cleaned_corpus)

In [11]:
(review_group_df, product_df) = data_preprocessing("../devided_dataset_v2/CDs_and_Vinyl/train")

In [13]:
review_group_df['reviewText'][0]

'Even tho I love this album, I am having problems playing it on my stereo system. This is the 4th CD that i have ordered as thinking there was something wrong with the CD\'s itself. It plays perfectly on my portable CD player.......but it can\'t play on my stereo system. I read that this recording was electronically enhanced for PC use. Don\'t know what\'s going on, as my stereo system is only 2 & 1/2 months old. It\'s a Jenson system. Sigh.......maybe I\'ll finally get lucky and find a *playable copy of this CD some day. Otherwise.....it\'s an fantastic recording with beautiful songs by Ms. Nancy Wilson. Nancy Wilson is still one of the most distinctive and unique singers I have ever known! I love everything about her; Her voice, her classiness (and by the way, she \'ALWAYS HAVE MAINTAINED HER SINCE OF CLASSINESS & STYLE SINCE I introduced myself to her music! I\'m a avid jazz lover and she has \'ALWAYS\' been apart of mostly every album she has made! I purchased 2 CD\'s at the time o