In [1]:
import pandas as pd
import numpy as np
import pickle
import joblib

from collections import Counter

import nltk
from nltk.corpus import stopwords


In [2]:
# NLTK English stopwords plus the literal '<num>' token, with duplicates removed
stopword_list = list(set(stopwords.words('english') + ['<num>']))

In [3]:
# Stopword list from NLTK plus the literal '<num>' token, de-duplicated.
# NOTE: the list is built here but NOT applied below -- stopwords are still
# counted and can therefore end up in the vocabulary.
stopword_list = list(set(stopwords.words('english') + ['<num>']))

# Word counter for all words in dataset
word_counter = Counter()

categories = ['books', 'dvd', 'electronics', 'kitchen']
classifications = ['positive.review', 'negative.review', 'unlabeled.review']

for cat in categories:
    for cl in classifications:
        path = 'processed_acl/{}/{}'.format(cat, cl)
        df = pd.read_csv(path, header=None)
        # The first column of every row holds one review encoded as
        # whitespace-separated "feature:count" tokens.
        for line in df[0]:
            for token in line.split():
                # Split on the LAST colon so a feature that itself contains ':'
                # keeps its full name and the trailing field is always the count.
                word, sep, count = token.rpartition(':')
                # Skip tokens without a separator or with a non-numeric count
                # (e.g. the trailing '#label#:positive' marker).
                if sep and count.isdigit():
                    word_counter[word] += int(count)

# 5000 most common words across all dataset
common_words = [word for word, _ in word_counter.most_common(5000)]
# Dictionary mapping words to index in numpy array
common_dict = {word: i for i, word in enumerate(common_words)}

In [4]:
# Make feature arrays for all labeled categories

# Number of rows reserved per class and number of vocabulary columns
REVIEWS_PER_CLASS = 1000
N_FEATURES = 5000

X_dict = {}

for cat in categories:
    # Rows [0, 1000) are filled from classifications[0], rows [1000, 2000)
    # from classifications[1]; columns follow the indices in common_dict.
    X = np.zeros((2 * REVIEWS_PER_CLASS, N_FEATURES))
    for cl_num, cl in enumerate(classifications[0:2]):
        path = 'processed_acl/{}/{}'.format(cat, cl)
        df = pd.read_csv(path, header=None)
        # More rows than reserved would spill into the next class's block
        # (or past the end of X), so fail loudly instead of corrupting data.
        if len(df) > REVIEWS_PER_CLASS:
            raise ValueError(
                '{} has {} reviews, expected at most {}'.format(
                    path, len(df), REVIEWS_PER_CLASS))
        row_offset = cl_num * REVIEWS_PER_CLASS
        for r_num, line in enumerate(df[0]):
            # Progress indicator
            if r_num % 500 == 0:
                print(r_num)
            for token in line.split():
                # Split on the last colon: the trailing field is the count
                word, sep, count = token.rpartition(':')
                # O(1) dict lookup instead of scanning the common_words list
                col = common_dict.get(word)
                if sep and count.isdigit() and col is not None:
                    X[row_offset + r_num, col] = int(count)

    # X is rebound to a fresh array on the next iteration, so no copy is needed
    X_dict[cat] = X



0
500
0
500
0
500
0
500
0
500
0
500
0
500
0
500


In [5]:
# Make feature arrays for all unlabeled categories

# Number of vocabulary columns in every feature matrix
N_FEATURES = 5000

X_unlabeled_dict = {}

for cat in categories:
    cl = classifications[2]

    path = 'processed_acl/{}/{}'.format(cat, cl)
    df = pd.read_csv(path, header=None)
    # One row per review in the file; columns follow the indices in common_dict
    X = np.zeros((len(df), N_FEATURES))
    for r_num, line in enumerate(df[0]):
        # Progress indicator
        if r_num % 1000 == 0:
            print(r_num)
        for token in line.split():
            # Split on the last colon: the trailing field is the count
            word, sep, count = token.rpartition(':')
            # O(1) dict lookup instead of scanning the common_words list
            col = common_dict.get(word)
            if sep and count.isdigit() and col is not None:
                X[r_num, col] = int(count)

    # X is rebound to a fresh array on the next iteration, so no copy is needed
    X_unlabeled_dict[cat] = X

0
1000
2000
3000
4000
0
1000
2000
3000
0
1000
2000
3000
4000
5000
0
1000
2000
3000
4000
5000


In [6]:
# Label vector of length 2000: 1 for the first 1000 entries, 0 for the last 1000
Y = np.concatenate((np.ones(1000), np.zeros(1000)))

In [7]:
# Label vector matching the row layout of the labeled feature matrices:
# 1 for the first 1000 rows, 0 for the last 1000 rows
Y = np.ones(2000)
Y[1000:] = 0
categories = ['books', 'dvd', 'electronics', 'kitchen']
cat_initial = ['B', 'D', 'E', 'K']

# Write one pickle per ordered pair (c1, c2) of distinct categories,
# named after the two category initials (e.g. "B-D.pkl").
for c1, i1 in zip(categories, cat_initial):
    for c2, i2 in zip(categories, cat_initial):
        if c1 == c2:
            continue
        # Payload: [labeled features of c1, labels, labeled features of c2,
        #           labels, unlabeled features of c2]
        d = [X_dict[c1], Y.copy(), X_dict[c2], Y.copy(), X_unlabeled_dict[c2]]
        # Context manager guarantees the file is flushed and closed
        with open("{}-{}.pkl".format(i1, i2), "wb") as pkl_file:
            pickle.dump(d, pkl_file)

In [8]:
# Show one raw record: row 2, column 0 of the books / positive.review file
cat, cl = 'books', 'positive.review'

path = 'processed_acl/{}/{}'.format(cat, cl)
df = pd.read_csv(path, header=None)

df.loc[2, 0]

'woman_the:1 contains_the:1 fan_i:1 alex_ross(superman:1 justice:1 read:1 comics_fan:1 again:1 league_etc:1 fans:1 recieved:1 hanna-barbera!)_a:1 book_fans:1 wonder:1 gift:1 gorgeous_artwork:1 gift_and:1 contains:1 i_recieved:1 artwork:2 christmas:1 read_it:1 wonder_woman:1 justice_league:1 a_comics:1 again_and:1 even:1 i_read:1 the_most:2 gorgeous:1 of_alex:1 i:2 extraordinary:1 most_gorgeous:1 most:2 it_again:1 comic_books:1 and_i:1 ross(superman_batman:1 etc_even:1 etc:1 the_justice:1 fan:1 beautiful:1 again.a:1 even_hanna-barbera!):1 comics:1 batman_wonder:1 for_comic:1 in_comic:1 artwork_in:1 books_contains:1 woman:1 a_christmas:1 extraordinary_artwork:1 books:1 christmas_gift:1 ross(superman:1 league:1 artwork_of:1 most_extraordinary:1 comic_book:1 book:1 recieved_this:1 batman:1 must-have_for:1 hanna-barbera!):1 must-have:1 again.a_must-have:1 alex:1 and_again.a:1 comic:2 #label#:positive'