In [1]:
import pandas as pd
import pickle
import numpy as np
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split

In [3]:
def open_pickle(path):
    import pickle
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

X_train_original = open_pickle('../../data/imdb/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('../../data/imdb/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('../../data/imdb/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('../../data/imdb/imdb_original_preprocessed_ytest.pickle')

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

token = r"(?u)\b[\w\'/]+\b"
cv = CountVectorizer(min_df = 100, token_pattern=token, lowercase=True, binary=True)
X_train = cv.fit_transform(X_train_original)
X_test = cv.transform(X_test_original)

### Extract list

In [5]:
from sklearn.linear_model import LogisticRegression

In [6]:
clf = LogisticRegression(random_state=42, penalty='l1')
clf.fit(X_train, y_train_original)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [7]:
coef = clf.coef_[0]
indices = np.argsort(np.absolute(coef))[::-1]
word = cv.get_feature_names()

In [8]:
word_list = [word[i] for i in indices[:100]]

In [48]:
len(word_list)

100

### Extract 'k window' phrase

In [10]:
X_train_original[0]

'silent night, deadly night 5 is the very last of the series, and like part 4, it is unrelated to the first three except by title and the fact that it is a christmas-themed horror flick.except to the oblivious, there is some obvious thing going on here...mickey rooney plays a toymaker named joe petto and his creepy son\'s name is pino. ring a bell, anyone? now, a little boy named derek heard a knock at the door one evening, and opened it to find a present on the doorstep for him. even though it said "do not open till christmas", he begins to open it anyway but is stopped by his dad, who scolds him and sends him to bed, and opens the gift himself. inside is a little red ball that sprouts santa arm and a head, and proceed to kill dad. oop, maybe he should have left well-enough alone. of course derek is then traumatized by the incident since he watched it from the stair, but he does not grow up to be some killer santa, he just stops talking.there is a mysterious stranger lurking around, w

In [12]:
import nltk, re, pprint
from nltk import word_tokenize

In [46]:
len(word_tokenize(X_train_original[0]))

534

In [14]:
vocab = cv.vocabulary_

In [15]:
vocab['affect']

119

In [29]:
X_tr_phrase = []
token_list = []
k = 2

for doc in X_train_original:
    token_list = []
    token = word_tokenize(doc)
    for i,tok in enumerate(token):
        for j, word in enumerate(word_list):
            if tok==word:
                # Check if it reach end of document
                if i>=len(token)-1-k:
                    break
                
                join = ' '.join(token[i-k:i+k+1])
                token_list.append(join)
    
    X_tr_phrase.append(token_list)

In [30]:
# nltk.download('punkt')
len(X_tr_phrase)

25000

In [31]:
len_track = []
for doc in X_tr_phrase:
    len_track.append(len(doc))
len_track = np.asarray(len_track)

In [39]:
len(len_track)

25000

In [34]:
from sklearn.model_selection import train_test_split

'''
Assuming the label is already binarized
'''


class ExtractWindowTerms():
    def __init__(self, X_tr, y_tr, train_test_split=False, random_state=42, test_size=(1./3)):
        self.X_tr = X_tr
        self.y_tr = y_tr
        self.random_state = random_state
        self.test_size = test_size
        self.token = r"(?u)\b[\w\'/]+\b"
        
        if train_test_split:
            self.X_tr, self.X_te, self.y_tr, self.y_te = train_test_split(
                self.X_tr, 
                self.y_tr, 
                test_size=self.test_size, 
                random_state=self.random_state)
        
    def extract_word_list(self, min_df=100, penalty='l1', n_word = 100):
        cv = CountVectorizer(min_df = min_df, token_pattern=self.token, lowercase=True, binary=True)
        self.X_train = cv.fit_transform(self.X_tr)
        
        clf = LogisticRegression(random_state=self.random_state, penalty=penalty)
        clf.fit(self.X_train, self.y_tr)
        
        coef = clf.coef_[0]
        indices = np.argsort(np.absolute(coef))[::-1]
        word = cv.get_feature_names()
        self.word_list = [word[i] for i in indices[:n_word]]
        
        return self.word_list
    
    def k_window_slicing(self, X, k_window=2):
        
        X_phrase = []
        token_list = []

        for doc in X:
            token_list = []
            token = word_tokenize(doc)
            
            for i,tok in enumerate(token):
                for j, word in enumerate(word_list):
                    if tok==word:
                        # Check if it reach end of document
                        if i>=len(token)-1-k_window:
                            break

                        join = ' '.join(token[i-k_window:i+k_window+1])
                        token_list.append(join)

            X_phrase.append(token_list)
            
        return X_phrase

In [35]:
ext = ExtractWindowTerms(X_train_original, y_train_original)

In [36]:
_ = ext.extract_word_list()

In [37]:
X_phrase = ext.k_window_slicing(X_train_original, k_window=3)

In [42]:
len([i for i in X_phrase if len(i)>0])

14949

In [41]:
len([i for i in X_phrase if len(i)==0])

10051

In [47]:
len(_)

100

In [49]:
_

['7/10',
 '3/10',
 '1/10',
 '4/10',
 '2/10',
 '8/10',
 'unwatchable',
 'refreshing',
 'incoherent',
 'unfunny',
 'stinker',
 'waste',
 '10/10',
 'disappointment',
 'poorly',
 'behave',
 'superbly',
 'worst',
 'miscast',
 'flawless',
 'uninspired',
 '9/10',
 'appalling',
 'pointless',
 'lousy',
 'cardboard',
 'boredom',
 'forgettable',
 'mildly',
 'obnoxious',
 'hooked',
 'awful',
 'mst3k',
 'haunting',
 'fails',
 'fest',
 'wooden',
 'furthermore',
 'laughable',
 'alright',
 'dull',
 'dimensional',
 'captures',
 'appreciated',
 'noir',
 'troubled',
 'uninteresting',
 'wasting',
 'rare',
 'incredible',
 'dreadful',
 'junk',
 'lacks',
 'credibility',
 'existed',
 'avoid',
 'disappointing',
 'wonderfully',
 'batman',
 'ripped',
 'dramas',
 'underrated',
 'monkey',
 'outer',
 'remotely',
 'tight',
 'ideal',
 'smooth',
 'mess',
 'gem',
 'redeeming',
 'funniest',
 'excellent',
 'hype',
 'trite',
 'prince',
 'driven',
 'generous',
 'boring',
 'delightful',
 'pretentious',
 'executed',
 'steals