In [1]:
import pandas as pd
import numpy as np
import pickle
import joblib

from collections import Counter

import nltk
from nltk.corpus import stopwords


In [2]:
# English stopwords from NLTK plus the '<num>' placeholder token, de-duplicated.
unique_stopwords = set(stopwords.words('english') + ['<num>'])
stopword_list = list(unique_stopwords)

In [5]:
# Stopword list from NLTK, extended with the '<num>' placeholder token.
# It is built here but NOT applied below: the stopword filter is disabled.
stopword_list = stopwords.words('english')
stopword_list.append('<num>')
stopword_list = list(set(stopword_list))

# Total count of every token, summed over all categories and review files
word_counter = Counter()

categories = ['books', 'dvd', 'electronics', 'kitchen']
classifications = ['positive.review', 'negative.review', 'unlabeled.review']

for cat in categories:
    for cl in classifications:
        path = 'processed_acl/{}/{}'.format(cat, cl)
        # NOTE(review): the file is parsed as CSV but only column 0 is used;
        # this assumes each review sits on one line with no commas/quotes that
        # would split it into several columns -- confirm against the data.
        df = pd.read_csv(path, header=None)
        for line in df[0]:
            # Each whitespace-separated token has the form "<word>:<count>"
            for token in line.split():
                # Split on the LAST colon so a word that itself contains ':'
                # keeps its full text; tokens without any colon are skipped
                # instead of raising IndexError.
                word, sep, count = token.rpartition(':')
                # Non-numeric "counts" (e.g. a trailing label field) are ignored.
                # To drop stopwords too, also require: word not in stopword_list
                if sep and count.isdigit():
                    word_counter[word] += int(count)

# 5000 most common words across all dataset
common_words = [word for word, _ in word_counter.most_common(5000)]
# Dictionary mapping words to index in numpy array
common_dict = {word: i for i, word in enumerate(common_words)}

In [7]:
# Make feature arrays for all labeled categories
#
# For every category a (2000, 5000) count matrix is built: rows 0-999 come from
# classifications[0], rows 1000-1999 from classifications[1]; column j holds
# the count of the word whose index in common_dict is j.

X_dict = {}

for cat in categories:
    X = np.zeros((2000, 5000))
    for cl_num, cl in enumerate(classifications[0:2]):
        path = 'processed_acl/{}/{}'.format(cat, cl)
        df = pd.read_csv(path, header=None)
        # Each class owns a fixed block of 1000 rows; more reviews than that
        # would silently spill into the other class's rows.
        if len(df) > 1000:
            raise ValueError(
                '{} has {} reviews; at most 1000 per class are supported'.format(path, len(df)))
        for r_num, line in enumerate(df[0]):
            # Progress indicator
            if r_num % 500 == 0:
                print(r_num)
            for token in line.split():
                # Tokens look like "<word>:<count>"; split on the last colon so
                # words containing ':' stay intact and colon-less tokens are skipped.
                word, sep, count = token.rpartition(':')
                # Dict membership is O(1); scanning the common_words list per
                # token was O(5000).
                if sep and count.isdigit() and word in common_dict:
                    X[cl_num*1000 + r_num, common_dict[word]] = int(count)

    # X is freshly allocated for every category, so no defensive copy is needed
    X_dict[cat] = X



0
500
0
500
0
500
0
500
0
500
0
500
0
500
0
500


In [8]:
# Make feature arrays for all unlabeled categories
#
# For every category a (number of reviews, 5000) count matrix is built; column j
# holds the count of the word whose index in common_dict is j.

X_unlabeled_dict = {}

for cat in categories:
    cl = classifications[2]

    path = 'processed_acl/{}/{}'.format(cat, cl)
    df = pd.read_csv(path, header=None)
    X = np.zeros((len(df), 5000))
    for r_num, line in enumerate(df[0]):
        # Progress indicator
        if r_num % 1000 == 0:
            print(r_num)
        for token in line.split():
            # Tokens look like "<word>:<count>"; split on the last colon so
            # words containing ':' stay intact and colon-less tokens are skipped.
            word, sep, count = token.rpartition(':')
            # Dict membership is O(1); scanning the common_words list per token
            # was O(5000).
            if sep and count.isdigit() and word in common_dict:
                X[r_num, common_dict[word]] = int(count)

    # X is freshly allocated for every category, so no defensive copy is needed
    X_unlabeled_dict[cat] = X

0
1000
2000
3000
4000
0
1000
2000
3000
0
1000
2000
3000
4000
5000
0
1000
2000
3000
4000
5000


In [9]:
# Label vector: the first 1000 entries are 1, the remaining 1000 are 0.
Y = np.concatenate((np.ones(1000), np.zeros(1000)))

In [10]:
# Label vector matching the row layout of the X_dict arrays:
# rows 0-999 -> 1, rows 1000-1999 -> 0.
Y = np.ones(2000)
Y[1000:] = 0
categories = ['books', 'dvd', 'electronics', 'kitchen']
cat_initial = ['B', 'D', 'E', 'K']

# Write one "<first>-<second>.pkl" file for every ordered pair of distinct
# categories. Each file holds the list
#   [X of first, Y, X of second, Y, unlabeled X of second].
for c1, i1 in zip(categories, cat_initial):
    for c2, i2 in zip(categories, cat_initial):

        if c1 != c2:
            d = [X_dict[c1], Y.copy(), X_dict[c2], Y.copy(), X_unlabeled_dict[c2]]
            # The with-block guarantees the file is flushed and closed; the bare
            # open() passed to pickle.dump was never closed explicitly.
            with open("{}-{}.pkl".format(i1, i2), "wb") as pkl_file:
                pickle.dump(d, pkl_file)

In [22]:
# NOTE(review): `x` is not assigned in any cell visible here, so this cell
# depends on leftover kernel state -- presumably one raw review line; confirm
# or remove this cell.
x

"avid:1 your:1 horrible_book:1 wasted:1 use_it:1 the_entire:1 money.i:1 i_lit:1 i_read:1 lit:1 i_would:1 relationship:1 read:1 a_<num>:1 reader_and:1 reader:1 suffering:1 fire_one:1 i_had:1 year_old:2 gotten:1 horrible:3 lit_this:1 world...don't:1 my:2 one_star:1 headache_the:1 this_book:5 mom:1 was_horrible:1 friend:1 book_horrible:1 star_i:1 back:1 avid_reader:1 than_one:1 life:1 copy:1 rate_it:1 rate:1 my_mom:1 man:1 book_was:1 half:1 on_fire:1 and_then:1 reading_this:1 so:1 lower:1 i_could:1 <num>_year:2 than:1 time:2 half_of:1 time_spent:1 then:1 book:6 and_picked:1 possible:1 spent:1 old_man:1 up_after:1 one:2 horrible_if:1 one_less:1 part:1 was:2 entire:1 less_copy:1 to_rate:1 my_life:1 about_the:1 your_money.i:1 an_avid:1 if:1 the_relationship:1 use:1 a_headache:1 fire:1 lower_than:1 reading:1 a_friend:1 picked:1 purposes:1 then_got:1 waste_your:1 after_my:1 friend_i:1 old:2 man_and:1 and_i:1 world...don't_waste:1 book_on:1 part_about:1 copy_in:1 book_back:1 book_wasted:1 have_

In [209]:
# Load the "B-D.pkl" bundle. Files of this name pattern are written in this
# notebook with pickle.dump, so it is read back with pickle.load (not joblib)
# and the file handle is closed by the with-block.
# Only unpickle files you created yourself: unpickling can execute arbitrary code.
with open("B-D.pkl", "rb") as pkl_file:
    [Xs, Ys, X_test, Y_test, Xt] = pickle.load(pkl_file)