In [1]:
import numpy as np
from numpy.random import seed
import tensorflow as tf

from tensorflow import set_random_seed
# Seed NumPy's global RNG and TensorFlow with the same fixed value; this is done
# before the Keras imports below.
# NOTE(review): a top-level `tensorflow.set_random_seed` looks like the
# TensorFlow 1.x API — confirm against the pinned TensorFlow version.
seed(42)
set_random_seed(42)

from keras.layers import Input, Dense, TimeDistributed, Embedding
from keras.layers import Concatenate, Reshape, Lambda, Multiply, multiply, concatenate
from keras.models import Model
from keras import backend as K

import matplotlib.pyplot as plt
from matplotlib import style

# Project-local helpers. The star import hides which names come from
# `dataset_load`; presumably `update_corpus_contraction` (called in a later cell)
# is one of them — TODO confirm and switch to explicit imports.
from dataset_load import *
from lr_baseline import LRBaseline

# Global plot style for every figure in the notebook.
# NOTE(review): newer Matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-whitegrid') — confirm this name exists in the installed version.
style.use('seaborn-whitegrid')

def open_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and is closed again before the
    function returns.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    from a trusted source.
    """
    import pickle

    with open(path, 'rb') as handle:
        return pickle.load(handle)

def load_unigrams(path, X, y):
    """Read a unigram list and give each unigram a majority-class connotation.

    Parameters
    ----------
    path : str
        UTF-8 text file with one unigram per line. Surrounding whitespace is
        stripped and blank lines are skipped (an empty "word" would otherwise
        match every document).
    X : sequence of str
        Documents. Each document is lower-cased once before matching; the
        words themselves are used exactly as written in the file.
    y : sequence
        Labels aligned with ``X`` and indexed by position (``y[i]``). A label
        equal to 1 counts as positive, any other value as negative.

    Returns
    -------
    word_list : list of str
        The unigrams in file order (duplicates are kept).
    connotation : dict
        Maps each word to 1 if it occurs in strictly more positive than
        negative documents, otherwise 0. Ties and words that never occur
        therefore map to 0.

    Notes
    -----
    Occurrence is a substring test (``word in doc``), not a token match, so
    e.g. ``'fit'`` also matches ``'fitted'``.
    """
    with open(path, 'r', encoding='utf8') as f:
        stripped_lines = [line.strip() for line in f]
    word_list = [word for word in stripped_lines if word]

    # Lower-case every document once instead of once per (word, document) pair.
    lowered_docs = [doc.lower() for doc in X]

    connotation = {}
    for word in word_list:
        pos_count = 0
        neg_count = 0
        for i, doc in enumerate(lowered_docs):
            if word in doc:
                if y[i] == 1:
                    pos_count += 1
                else:
                    neg_count += 1

        connotation[word] = 1 if pos_count > neg_count else 0

    return word_list, connotation

def _appearance_matrix(corpus, word_list, connotation):
    """Build one row per document: +1/-1 for each word present, 0 for each word absent."""
    rows = []
    for doc in corpus:
        rows.append([
            # The connotation is only looked up for words that actually occur.
            (1 if connotation[word] == 1 else -1) if word in doc else 0
            for word in word_list
        ])
    return rows


def generate_appearance(X_train_corpus, X_test_corpus, word_list, connotation):
    """Encode which unigrams appear in each document, signed by connotation.

    For every document and every word in ``word_list`` the entry is:

    * ``1``  if the word occurs in the document and ``connotation[word] == 1``
    * ``-1`` if the word occurs in the document and its connotation is anything else
    * ``0``  if the word does not occur in the document

    Occurrence is a case-sensitive substring test (``word in doc``); the
    documents are not lower-cased here.

    Parameters
    ----------
    X_train_corpus, X_test_corpus : iterable of str
        Training and test documents.
    word_list : list of str
        Unigrams; defines the column order of the result.
    connotation : dict
        Maps each word to its connotation. A ``KeyError`` is raised if a word
        that occurs in some document is missing from this mapping.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The train and test matrices, one row per document and one column per
        word, built with ``np.array`` from the per-document lists.
    """
    y_train_agreement = _appearance_matrix(X_train_corpus, word_list, connotation)
    y_test_agreement = _appearance_matrix(X_test_corpus, word_list, connotation)
    return np.array(y_train_agreement), np.array(y_test_agreement)

# NOTE(review): leftover reference to 'imdb-unigrams.txt'; this notebook reads
# './ecom-unigrams.txt' in a later cell.
import pandas as pd

# Location of the reviews CSV, relative to the notebook's working directory.
path = r'../../data/womens-ecommerce-clothing-reviews/Womens_Clothing_E-Commerce_Reviews.csv'

df = pd.read_csv(path)

# X: review texts, y: ratings, both as plain Python lists in row order.
# NOTE(review): rows with a missing 'Review Text' would show up here as NaN
# rather than a string — confirm whether the CSV has any.
X = list(df['Review Text'])
y = list(df['Rating'])
# Array copy of the ratings; later cells turn it into the binary label vector.
y_label = np.asarray(y)


Using TensorFlow backend.


In [2]:
# Binarise the ratings: below 3 -> 0 (negative), above 3 -> 1 (positive);
# rows rated exactly 3 are treated as neutral and removed.
#
# The labels are rebuilt from the untouched `y` list on every run so that
# re-executing this cell is idempotent. Thresholding `y_label` in place would,
# on a second run, send the already-binarised 1s to 0 (since 1 < 3) and leave
# `neutral_indices` empty.
ratings = np.asarray(y)

y_binary = ratings.copy()
y_binary[ratings < 3] = 0
y_binary[ratings > 3] = 1

# Row positions of the neutral reviews in the original CSV order; the review
# texts are filtered with these same positions.
neutral_indices = np.where(ratings == 3)[0]
y_label = np.delete(y_binary, neutral_indices)


In [3]:
# Remove the neutral reviews at the same row positions that were removed from
# the labels. The texts are taken from the DataFrame column rather than from `X`
# itself, so re-running this cell does not delete additional rows.
# np.delete returns a NumPy array, so `X` is an array (not a list) from here on.
X = np.delete(list(df['Review Text']), neutral_indices)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, ShuffleSplit

# Hold out 33% of the reviews for testing; the fixed random_state keeps the
# split the same between runs.
X_train_split, X_test_split, y_train, y_test = train_test_split(
    X,
    y_label,
    test_size=0.33,
    random_state=42,
)

# Apply the same text preprocessing to both splits, train first.
# NOTE(review): `update_corpus_contraction` is presumably provided by the
# `dataset_load` star import — confirm.
X_train_corpus_update = update_corpus_contraction(X_train_split)
X_test_corpus_update = update_corpus_contraction(X_test_split)

# A binary CountVectorizer bag-of-words featurisation was tried here earlier and
# is disabled; the unigram list read below is used instead.

# Read the unigram vocabulary and derive each word's connotation from the
# training split only (documents and labels of the test split are not used).
word_list, connotation = load_unigrams(
    './ecom-unigrams.txt',
    X_train_corpus_update,
    y_train,
)

# generate_appearance(...) can turn these into per-document agreement matrices;
# it is not invoked in this cell.


(75, 2)
corpus update start
corpus update end

(75, 2)
corpus update start
corpus update end



In [5]:
# Baseline wrapper over the preprocessed train/test corpora and their labels;
# the unigram list is passed as the `human_terms` vocabulary.
ecom_baseline = LRBaseline(
    X_train_corpus_update,
    X_test_corpus_update,
    y_train,
    y_test,
    human_terms=word_list,
)

In [6]:
# Run and print each baseline in turn: `baseline` and `human_terms_baseline`,
# each first with its default penalty and then with penalty='l1'.
# NOTE(review): each call prints a tuple of floats (3 values for `baseline`,
# 4 for `human_terms_baseline` in the recorded output); their meaning is
# defined in `lr_baseline` — TODO confirm and label them.
for fit, options in (
    (ecom_baseline.baseline, {}),
    (ecom_baseline.baseline, {'penalty': 'l1'}),
    (ecom_baseline.human_terms_baseline, {}),
    (ecom_baseline.human_terms_baseline, {'penalty': 'l1'}),
):
    print(fit(**options))

(0.9477266145380828, 0.927237983242687, 0.1933773987053664)
(0.9466406023747466, 0.9269439952961929, 0.18966799841674473)
(0.9055893426006372, 0.9169994790762285, 0.22732426617153725, 0.15346170806996914)
(0.9053721401679699, 0.9171731203333913, 0.22767620722993168, 0.15346170806996914)
