In [3]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
import math


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import *
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, r2_score

from utils import load_raw_file, remove_polarity

Using Theano backend.


## Define the text preprocessing functions

In [4]:
def tokenize(text):
    """Split ``text`` into tokens and keep only the alphanumeric ones.

    The string is tokenized with NLTK's ``WordPunctTokenizer``; every token
    for which ``str.isalnum()`` is false (e.g. punctuation-only tokens) is
    then discarded.

    Args:
        text: the string to tokenize.

    Returns:
        list of the surviving tokens, in their original order.
    """
    raw_tokens = WordPunctTokenizer().tokenize(text)
    return list(filter(str.isalnum, raw_tokens))


def preprocess(df):
    """Lower-case and tokenize the ``text`` and ``term`` columns of ``df``.

    After the call ``df['text']`` holds a list of tokens per row, while
    ``df['term']`` holds the term's tokens re-joined with single spaces.

    Note:
        The columns are overwritten in place; the frame that is returned is
        the very same object that was passed in.

    Args:
        df: frame with string columns ``text`` and ``term``.

    Returns:
        ``df`` itself, with both columns rewritten.
    """
    df['text'] = df['text'].str.lower().apply(tokenize)

    # Terms stay strings (they are used as dictionary keys later on), so the
    # tokens are glued back together after tokenization.
    term_tokens = df['term'].str.lower().apply(tokenize)
    df['term'] = term_tokens.apply(lambda words: " ".join(words))
    return df

# Integer class label used for each value of the ``polarity`` column.
polarity_label = dict(positive=1, negative=-1, neutral=0)


def create_vocab(df):
    """Build the token and term lookup tables for ``df``.

    Ids are handed out in order of first appearance. Token ids start at 1
    because id 0 is reserved for the ``'<pad>'`` entry; term ids start at 0.

    Args:
        df: mapping/frame whose ``'text'`` entries are token lists and whose
            ``'term'`` entries are (hashable) term strings.

    Returns:
        tuple ``(text_w2id, term_w2id)`` of ``dict``s mapping token -> id and
        term -> id respectively.
    """
    text_w2id = {'<pad>': 0}
    for sentence in df['text']:
        for token in sentence:
            # The default is evaluated before insertion, so a new token gets
            # the current table size as its id; known tokens are untouched.
            text_w2id.setdefault(token, len(text_w2id))

    term_w2id = {}
    for term in df['term']:
        term_w2id.setdefault(term, len(term_w2id))

    return text_w2id, term_w2id

### Function to get the relative position of each token w.r.t. the aspect term

In [5]:
def _find_aspect_span(tokens, aspect_tokens):
    """Return ``(start, end)`` (inclusive) of the first contiguous match of
    ``aspect_tokens`` inside ``tokens``, or ``None`` when there is none."""
    width = len(aspect_tokens)
    if width == 0:
        return None
    for start in range(len(tokens) - width + 1):
        if list(tokens[start:start + width]) == aspect_tokens:
            return start, start + width - 1
    return None


def get_relative_token_pos(row):
    """Score every token of ``row['text']`` by its closeness to the aspect term.

    With ``s_len = len(tokens) - 1``, tokens inside the aspect span score
    ``s_len`` and every other token scores ``s_len`` minus its distance (in
    tokens) to the nearest edge of the span. All scores are therefore
    non-negative integers.

    The span is the first contiguous occurrence of the aspect's words in the
    sentence. The previous implementation tried to do this but crashed
    internally (``list.extend`` was handed an int) and silently swallowed the
    error, so multi-word aspects were anchored on the first occurrence of
    their *last* word only. That anchor is kept as a fallback for aspects
    whose words do not appear contiguously.

    Side effect:
        The resulting list is also appended to the module-level
        ``text_pos_data`` accumulator.

    Args:
        row: mapping with ``'text'`` (list of tokens) and ``'term'`` (aspect
            words separated by single spaces).

    Returns:
        list of ints, one per token; all zeros when the aspect cannot be
        located in the sentence.
    """
    tokens = row['text']
    aspect_tokens = row['term'].split(' ')
    s_len = len(tokens) - 1

    span = _find_aspect_span(tokens, aspect_tokens)
    if span is None and aspect_tokens[-1] in tokens:
        # Legacy behaviour: anchor on the first occurrence of the last word.
        anchor = tokens.index(aspect_tokens[-1])
        span = (anchor, anchor)

    if span is None:
        p = [0 for _ in tokens]
    else:
        start, end = span
        # max(...) is 0 inside the span and the distance to the nearest span
        # edge on either side of it.
        p = [s_len - max(start - i, 0, i - end) for i in range(len(tokens))]

    text_pos_data.append(p)
    return p

## Load the data

In [6]:
def prepare_data(row):
    """Encode one row and append the pieces to the module-level accumulators.

    For the given row this
      * maps every token of ``row['text']`` to its id via ``text_w2id`` and
        appends the id list to ``text_data``;
      * raises ``max_tokens`` to the row's token count if that is larger;
      * appends the one-element list ``[term_w2id[row['term']]]`` to
        ``term_data``;
      * appends ``polarity_label[row['polarity']]`` to ``polarity``;
      * calls ``get_relative_token_pos(row)``;
      * appends ``[text_ids, pos_ids, term_ids]`` to ``prepared_rows``.

    Args:
        row: mapping with ``'text'`` (token list), ``'term'`` and
            ``'polarity'`` entries.

    Returns:
        None -- the function works purely through side effects.

    Raises:
        KeyError: if a token, term or polarity is missing from its table.
    """
    # Only ``max_tokens`` is rebound here; every other module-level name is
    # merely read or mutated in place and needs no ``global`` declaration.
    global max_tokens

    text_ids = [text_w2id[token] for token in row['text']]
    text_data.append(text_ids)

    max_tokens = max(max_tokens, len(text_ids))

    term_ids = [term_w2id[row['term']]]
    term_data.append(term_ids)

    polarity.append(polarity_label[row['polarity']])
    pos_ids = get_relative_token_pos(row)

    prepared_rows.append([text_ids, pos_ids, term_ids])

In [7]:
def generate_data(df, text2id, term2id, reset=True):
    """Encode every row of ``df`` via ``prepare_data`` and return the results.

    The vocabularies are published as the module-level ``text_w2id`` /
    ``term_w2id`` (which ``prepare_data`` reads), then ``prepare_data`` is
    applied to each row.

    Previously the module-level accumulators were never cleared, so running
    the cell a second time silently appended a second copy of every row.
    With ``reset=True`` (the default) fresh accumulators are bound before
    encoding, which makes the call idempotent; lists returned by earlier
    calls are left untouched.

    Args:
        df: frame providing ``apply(func, axis=1)``; rows need whatever
            ``prepare_data`` reads from them.
        text2id: token -> id table.
        term2id: term -> id table.
        reset: start from empty accumulators and ``max_tokens = 0``. Pass
            ``False`` to keep appending to the existing module-level state.

    Returns:
        tuple ``(text_data, text_pos_data, term_data, polarity, max_tokens)``.
    """
    global text_w2id, term_w2id
    global text_data, text_pos_data, term_data, polarity, prepared_rows, max_tokens

    text_w2id = text2id
    term_w2id = term2id

    if reset:
        # Rebind (rather than clear in place) so results handed out by a
        # previous call are not emptied behind the caller's back.
        text_data, text_pos_data, term_data, polarity = [], [], [], []
        prepared_rows = []
        max_tokens = 0

    df.apply(prepare_data, axis=1)
    return text_data, text_pos_data, term_data, polarity, max_tokens

## Define some global variables 

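These module-level vocabularies and accumulators are populated while the rows are encoded (`generate_data` → `prepare_data`) and are then used to featurize the data.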

In [8]:
# Vocabularies: token -> id and term -> id.
text_w2id, term_w2id = {}, {}

# Per-row accumulators (one entry appended per encoded row).
text_data, text_pos_data, term_data, polarity = [], [], [], []
prepared_rows = []

# Token count of the longest sentence encoded so far.
max_tokens = 0

In [9]:

# Load both splits of the 'restaurants' data and pass each through
# remove_polarity('conflict', ...).
# NOTE(review): presumably this drops the rows labelled 'conflict' (there is
# no 'conflict' entry in polarity_label) -- confirm against utils.
df_train = remove_polarity('conflict', load_raw_file('restaurants', 'train'))
df_test = remove_polarity('conflict', load_raw_file('restaurants', 'test'))

# Train rows come first in the stacked frame, so rows [0, test_start_id) are
# the training split and everything after that is the test split.
test_start_id = df_train.shape[0]
df = pd.concat([df_train, df_test])

# preprocess() rewrites the columns in place, so df_prep and df are the same
# object. The vocabularies are built over train and test together.
df_prep = preprocess(df)
text_w2id, term_w2id = create_vocab(df_prep)

In [10]:
def get_train_test_data(X, y):
    """Split ``X`` and ``y`` positionally at the module-level ``test_start_id``.

    Args:
        X: sliceable sequence/array of samples, training rows first.
        y: sliceable sequence/array of labels aligned with ``X``.

    Returns:
        tuple ``(X_train, y_train, X_test, y_test)`` where the training parts
        are the first ``test_start_id`` entries and the test parts the rest.
    """
    cut = test_start_id
    return X[:cut], y[:cut], X[cut:], y[cut:]

In [11]:
# Encode every row of the combined (train + test) frame. Rows are processed in
# frame order, so the first `test_start_id` entries of each returned list
# belong to the training split.
text_data, text_pos_data, term_data, polarity, max_tokens = generate_data(df_prep, text_w2id, term_w2id)

In [12]:
def featurize_data(text_data, text_pos_data, term_data, polarity, max_tokens):
    """Turn the per-row id lists into a dense feature matrix and label vector.

    Each row contributes three channels, laid out one after the other:
      1. its token ids, right-padded with 0 up to ``max_tokens``;
      2. its position scores, right-padded with 0 up to ``max_tokens``;
      3. its term id list, extended with copies of its first element.

    The input lists are not modified.

    Args:
        text_data: list of token-id lists.
        text_pos_data: list of position-score lists, aligned with
            ``text_data``.
        term_data: list of term-id lists, aligned with ``text_data``.
        polarity: list of labels.
        max_tokens: width every channel is padded to.

    Returns:
        tuple ``(X, y)``: ``X`` is the stacked rows reshaped to
        ``(n_rows, n_channels * width)`` and ``y`` is ``np.array(polarity)``.
    """
    def pad_to_width(values):
        # Work on a copy; a non-positive deficit adds nothing.
        padded = list(values)
        padded += [0] * (max_tokens - len(padded))
        return padded

    rows = []
    for row_no, token_ids in enumerate(text_data):
        x1 = pad_to_width(token_ids)
        x2 = pad_to_width(text_pos_data[row_no])

        x3 = list(term_data[row_no])
        # NOTE(review): x1 is already padded at this point, so a one-element
        # term list is repeated across the whole padded width and the zero
        # padding below never adds anything -- confirm this is intended.
        if len(x1) > 1:
            x3 += [x3[0]] * (len(x1) - 1)
        x3 += [0] * (max_tokens - len(x3))

        rows.append([x1, x2, x3])

    stacked = np.array(rows)
    n_rows, n_channels, width = stacked.shape
    X = stacked.reshape((n_rows, n_channels * width))
    y = np.array(polarity)

    return X, y

In [15]:
# Build one dense matrix for all rows (train followed by test), then split it
# back into the two parts by position using test_start_id.
X, y = featurize_data(text_data, text_pos_data, term_data, polarity, max_tokens)
X_train, y_train, X_test, y_test = get_train_test_data(X, y)

In [16]:
from imblearn.over_sampling import SMOTE, ADASYN

# Oversample the training split only; the test split is left untouched.
# * `fit_resample` is the current imbalanced-learn name for this call; the old
#   `fit_sample` alias no longer exists in recent releases.
# * A fixed random_state makes the synthetic samples -- and everything trained
#   on them -- reproducible across Restart & Run All.
# NOTE(review): the feature columns are integer ids, so the samples SMOTE
# synthesises between neighbours will hold non-integer "ids" -- confirm that
# this is acceptable for the downstream model.
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [20]:
# Fit a multinomial naive Bayes classifier (alpha=0.5) on the oversampled
# training data.
# NOTE(review): the features are token/term ids and position scores rather
# than counts -- confirm MultinomialNB is the intended model for them.
clf = MultinomialNB(alpha=0.5)
clf.fit(X_resampled, y_resampled)

MultinomialNB(alpha=0.5)