In [6]:
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")
import re
import sklearn
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import time
import numpy as np
from sklearn.model_selection import KFold
import statistics

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from scipy.sparse import csr_matrix
from sklearn import preprocessing
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.metrics import f1_score
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import cross_val_score

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/adenweiser/nltk_data...


In [14]:
def data_preprocessing(path, test=False):
    #start_time = time.time()
    
    #create appropriate file path
    if test == False:
        pfilename = path + "/product_training.json"
        rfilename = path + "/review_training.json"
    else:
        pfilename = path + "/product_test.json"
        rfilename = path + "/review_test.json"
    
    #extract files as pandas dataframes
    product_df = pd.read_json(pfilename)
    
    review_df = pd.read_json(rfilename).drop_duplicates(subset=["reviewerID", "unixReviewTime"], keep="first")
    ## 11.66 seconds to get to here
    
    review_df.drop(columns=["reviewerID","vote", "unixReviewTime","reviewTime","style","reviewerName","image"], axis=1 ,inplace=True)
    
    review_df['reviewText'].fillna("", inplace=True)
    review_df['summary'].fillna("", inplace=True)
    
    review_df.sort_values('asin', inplace = True)
    product_df.sort_values('asin', inplace = True)
    
    group = review_df.groupby("asin")
    
    #review_group_df = pd.DataFrame(columns = ['asin', 'numReviews', 'percentVerified','reviewText','summaryText', 'awesomeness'])
    
    # about the same amount of time to get to here
    start_time = time.time()
    datalist = []
    count = 0
    #awesome_pos = 0
    for asin, data in group:
        verifiedCount = data['verified'].sum()
        reviewCount = data['asin'].count()
        percentVerified = verifiedCount / reviewCount
        if count == 0:
            print(type(data['reviewText']))
        reviewText = ' '.join(data['reviewText'])
        #reviewText = ' '.join(transform_document(x) for x in data['reviewText'])
        #summaryText = ""
        summaryText = ' '.join(data['summary'])
        #summaryText = ' '.join(transform_document(x) for x in data['summary'])
        #reviewText = transform_document(' '.join(data['reviewText']))
        #summaryText = transform_document(' '.join(data['summary']))
        #awesomeness = 0
        
        #SENTIMENT ANALYSIS CHUNK
        (rev_mean, rev_stdev) = sentiment_analysis(data['reviewText'])
        (sum_mean, sum_stdev) = sentiment_analysis(data['summary'])
        while (product_df['asin'][count] != asin):
               count = count + 1
        
        awesomeness = product_df['awesomeness'][count]
        #awesome_pos = awesome_pos + reviewCount
        #awesomeness = product_df.loc[product_df['asin'] == asin, 'awesomeness'].values[0] #might be slow
        datalist.append([asin,  reviewCount, percentVerified, reviewText, summaryText, rev_mean, rev_stdev, sum_mean, sum_stdev, awesomeness])
        
        count = count + 1
        #if count > 100:
        #    break
        
        '''new_row = {'asin': asin, 
                   'numReviews': reviewCount, 
                   'percentVerified': percentVerified, 
                   'reviewText': transform_document(' '.join(data['reviewText'])), 
                   'summaryText': transform_document(' '.join(data['summary'])), 
                   'awesomeness': product_df.loc[product_df['asin'] == asin, 'awesomeness'].values[0]} 
        review_group_df = review_group_df.append(new_row, ignore_index = True)
         '''
    review_group_df = pd.DataFrame(datalist,columns =['asin', 'numReviews', 'percentVerified','reviewText','summaryText', \
                                                      'reviewMean', 'reviewStDev', 'summaryMean', 'summaryStDev', 'awesomeness'])    
    
    review_group_df.to_json(r'../devided_dataset_v2/CDs_and_Vinyl/train/cleaned_data.json')
    end_time = time.time()
    print(end_time - start_time)
    
    return review_group_df
    

In [15]:
# Preprocessed Data Generated (Reviews and Summaries aggregated, no NLP processing)
review_group_df = data_preprocessing("../devided_dataset_v2/CDs_and_Vinyl/train")
#review_group_df.to_json("preprocessed.json")
#review_group_df.head()
#review_group_df

<class 'pandas.core.series.Series'>


StatisticsError: variance requires at least two data points

In [7]:
def sentiment_analysis(docs):
    sentiments = []
    sid = SentimentIntensityAnalyzer()
    for doc in docs:
        polarities = sid.polarity_scores(doc)
        sentiments.append(polarities['compound'])
    if len(sentiments) == 1:
        return (sentiments[0], 0)
    else:
        return (statistics.mean(sentiments), statistics.stdev(sentiments))

In [None]:
string_transformer = Pipeline(
    steps = [('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer())]
)
wordbagger = ColumnTransformer(
    transformers=[("rev", string_transformer, 'reviewText'), 
                  ("sum", string_transformer, 'summaryText')]
    , remainder='passthrough'
)

clf = Pipeline(steps = [("wordbag", wordbagger), 
                        ("scale", MaxAbsScaler()), 
                        ('classifier', RandomForestClassifier(max_depth = 5, n_jobs  = -1))])
#x = review_group_df.filter(['numReviews', 'percentVerified', 'reviewText', 'summaryText'])
#clf = LogisticRegression(tol = 0.001, max_iter = 150)
#clf.fit(x,y)
review_features = review_group_df.filter(['numReviews', 'percentVerified', 'reviewText', 'summaryText'])
y = review_group_df.filter(['awesomeness'])

In [120]:
start = time.time()
# this runs the k-fold cross-validation automatically?
cv10_results = cross_val_score(clf, review_features, np.ravel(y), cv=10, scoring = 'f1_macro')
end = time.time()
print((end - start)/60)

6.791166607538859


In [121]:
cv10_results

array([0.57534303, 0.56416805, 0.57010494, 0.56669105, 0.57545749,
       0.57953195, 0.56105469, 0.57145317, 0.57041176, 0.56486839])

All the older stuff is below here:

In [108]:
kf = KFold(n_splits = 10, shuffle = True)
for i, (train_index, test_index) in enumerate(kf.split(review_features)):
    start = time.time()
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")
    x_train = review_features.loc[train_index, :]
    x_test = review_features.loc[test_index, :]
    y_train = np.ravel(y.loc[train_index, :])
    y_test = np.ravel(y.loc[test_index,:])
    
    #x_train_features = wordbagger.fit_transform(x_train)
    #x_test_features = wordbagger.fit_transform(x_test)
    #scaler = preprocessing.MaxAbsScaler().fit(x_train_features)
    #x_train_scaled = scaler.transform(x_train_features)
    #print(type(y_train))
    #print(x_train.shape)
    #print(y_train.shape)
    #clf = Pipeline(steps = [("preprocess", preprocessor), ('classifier', LogisticRegression()) ])
    clf.fit(x_train, y_train)
    #print(x_train)
    #print(y_train.shape)
    #X_trans = preprocessor.fit_transform(x_train)
    #print(x_train.shape)
    print("model score: %.3f" % clf.score(x_test, y_test))
    end = time.time()
    print(end - start)

Fold 0:
  Train: index=[    0     1     2 ... 71540 71541 71542]
  Test:  index=[    7    10    21 ... 71512 71515 71520]
model score: 0.569
46.16732168197632
Fold 1:
  Train: index=[    0     1     2 ... 71540 71541 71542]
  Test:  index=[    3     4    11 ... 71526 71530 71538]
model score: 0.579
43.28184628486633
Fold 2:
  Train: index=[    0     1     2 ... 71540 71541 71542]
  Test:  index=[    8    16    30 ... 71504 71525 71527]
model score: 0.578
48.57866644859314
Fold 3:
  Train: index=[    0     1     2 ... 71539 71540 71542]
  Test:  index=[   25    27    44 ... 71487 71505 71541]
model score: 0.579
43.54520773887634
Fold 4:
  Train: index=[    1     2     3 ... 71540 71541 71542]
  Test:  index=[    0    12    15 ... 71536 71537 71539]
model score: 0.568
48.14018964767456
Fold 5:
  Train: index=[    0     1     2 ... 71539 71540 71541]
  Test:  index=[   17    19    22 ... 71531 71535 71542]
model score: 0.586
50.04946684837341
Fold 6:
  Train: index=[    0     1     2 ... 

NLP Functions

In [7]:
corpus = ["I am being handed a list of documents", "Each of these documents has several unique words", "The words will represent the class of each review", "I am also removing stopwords in order to make this make more sense"]
cleaned_corpus = [transform_document(doc) for doc in corpus]
vocabulary = vocabulary_from_corpus(cleaned_corpus, True)
pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)), 
                 ('tfid', TfidfTransformer())]).fit(cleaned_corpus)),

In [12]:
def get_stopwords():
    file = open('en.txt')
    stopwords = []
    for line in file:
        stopwords.append(line.rstrip())
    return stopwords

In [23]:
def transform_corpus(review_group):
    stopwords = get_stopwords()
    reviewTextSet = review_group['reviewText']
    for index in review_group.index:
        curr_parsed = nlp(reviewTextSet[index].lower())
        doclist = []
        for token in curr_parsed:
            lemma = token.lemma_
            if not(re.match("[a-z0-9]+", lemma)):
                continue
            if lemma not in stopwords:
                doclist.append(lemma)
        reviewTextSet[index] = " ".join(doclist)

In [197]:
def transform_document(doc, remove_stopwords = True):
    #new_doc = ""
    stopwords = get_stopwords() # is this slow?
    parsed_text = nlp(doc) # is this slow
    doclist = []
    for token in parsed_text:
        lemma = token.lemma_.lower()
        if re.match("[a-z0-9]+", lemma) and (remove_stopwords == False or lemma not in stopwords):
            doclist.append(lemma) # this is less slow?
    return " ".join(doclist)

In [50]:
def bag_of_words(review_text, remove_stopwords = True):
    word_bag = {}
    stopwords = get_stopwords()
    parsed_text = nlp(review_text)
    for token in parsed_text:
        lemma = token.lemma_.lower()
        if re.match("[a-z0-9]+", lemma) and (remove_stopwords == False or lemma not in stopwords):
            if lemma in word_bag:
                word_bag[lemma] += 1
            else:
                word_bag[lemma] = 1
    return word_bag

In [51]:
def vocabulary_from_corpus(corpus, remove_stopwords = True):
    vocab_set = set()
    for document in corpus:
        word_bag = bag_of_words(document, remove_stopwords)
        for word in word_bag.keys():
            vocab_set.add(word)
    return list(vocab_set)
vocabulary_from_corpus(['this is the first document', 'this document is the second document', 'and this is the third one', 'is this the first document'], False)

['first', 'the', 'second', 'third', 'be', 'and', 'this', 'one', 'document']

In [88]:
product_df = pd.read_json('../devided_dataset_v2/CDs_and_Vinyl/train/product_training.json')
review_df = pd.read_json('../devided_dataset_v2/CDs_and_Vinyl/train/review_training.json')
#len(product_df.index)

In [6]:
teststring = "First, you need to preprocess the raw text data. This may involve tasks like tokenizing the text (i.e., splitting it into individual words), removing stopwords, stemming or lemmatizing the words, and converting the text into a numerical format that can be used as input for the model. Then, you need to split the data into training and testing sets. The training set will be used to train the model, while the testing set will be used to evaluate its performance."
#teststring.lower()
parsed = nlp(teststring.lower())