# Fitting various ML models

Basic preprocessing
    - Remove stopwords from the text (custom wordlist?)
    - Lemmatization?
    
Experiment with various feature engineering techniques
    - Remove the aspect from the text
    - CountVectorizer, TfidfVectorizer
    - n-gram ranges (e.g. unigrams only vs. unigrams + bigrams)
    


In [121]:
from spacy.tokenizer import Tokenizer
from spacy import load
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
import re
from collections import Counter
import pandas as pd

# Load the spaCy English pipeline once at module level.
# NOTE(review): the "en" shortcut name is presumably only resolvable on older
# spaCy releases; newer ones expect a full package name such as
# "en_core_web_sm" -- confirm against the installed spaCy version.
nlp = load("en")
# Stand-alone tokenizer built from the pipeline's vocabulary.
tokenizer = Tokenizer(nlp.vocab)


# Remove Stopwords
# Custom stopword list?
def remove_stopwords(sentence):
    """Return `sentence` reduced to its alphabetic, non-stopword tokens.

    Commas and periods are stripped and the text is lower-cased before it is
    passed to the module-level `tokenizer`. Tokens flagged as stop words,
    punctuation or digits, and tokens that are not purely alphabetic, are
    dropped. The surviving tokens are joined with single spaces.
    """
    cleaned = sentence.replace(',', '').replace(".", "").lower()

    kept = []
    for tok in tokenizer(cleaned):
        # Skip anything that is not a plain content word.
        if tok.is_stop or tok.is_punct or tok.is_digit or not tok.is_alpha:
            continue
        kept.append(str(tok))

    return " ".join(kept)


def lemmatize(sentence):
    """Return `sentence` with each token replaced by its lemma.

    The sentence is run through the module-level `nlp` pipeline and the
    `lemma_` of every resulting token is joined with single spaces.
    """
    # The original referenced the misspelled name `sentencence`, which raised
    # NameError on every call.
    return " ".join(token.lemma_ for token in nlp(sentence))

def remove_aspect(text, aspect):
    """Replace every occurrence of `aspect` in `text` with a single space.

    Whitespace directly surrounding each occurrence is consumed as part of
    the match, so the aspect and its adjacent whitespace collapse to one
    space.

    Args:
        text: The sentence to clean.
        aspect: The literal aspect term to remove. It is matched verbatim;
            regex metacharacters in it have no special meaning.

    Returns:
        `text` with all occurrences of `aspect` replaced by ' '.
    """
    # re.escape neutralises every regex metacharacter (not just parentheses),
    # so aspects such as "c++" or "$5.00" are matched literally.
    pattern = r'\s*' + re.escape(aspect) + r'\s*'
    return re.sub(pattern, ' ', text)


def split_text(text, on, method):
    """Return the piece of `text` on one side of the separator `on`.

    Args:
        text: The string to split.
        on: The separator string (e.g. the aspect term).
        method: 'before' returns everything preceding the first occurrence
            of `on` (the whole text if `on` is absent). 'after' returns the
            segment that follows the first occurrence of `on`, up to the
            next occurrence if there is one, or ' ' when `on` is absent.

    Returns:
        The selected piece of `text`.

    Raises:
        ValueError: If `method` is neither 'before' nor 'after'. Previously
            this case failed with an UnboundLocalError.
    """
    if method not in ('before', 'after'):
        raise ValueError(
            "method must be 'before' or 'after', got {!r}".format(method))

    parts = text.split(on)

    if method == 'before':
        return parts[0]

    # 'after': fall back to a single space when the separator never occurred.
    return parts[1] if len(parts) > 1 else ' '

# Split and return the left-hand side of the sentence.
def split_left(text_splitpoint):
    """Return the part of the sentence preceding the first split point.

    `text_splitpoint` is a `(sentence, split_point)` pair. When the split
    point does not occur, the whole sentence is returned.
    """
    text, separator = text_splitpoint
    pieces = text.split(separator)
    return pieces[0]


# Split and return the right-hand side of the sentence.
def split_right(text_splitpoint):
    """Return the segment that follows the first split point.

    `text_splitpoint` is a `(sentence, split_point)` pair. The result is the
    second piece produced by splitting on the split point, or a single space
    when the split point does not occur in the sentence.
    """
    text, separator = text_splitpoint
    pieces = text.split(separator)
    if len(pieces) > 1:
        return pieces[1]
    return " "


def remove_polarity(polarity, df):
    """Return the rows of `df` whose `polarity` column differs from `polarity`.

    All columns are kept and the original index labels are preserved.
    """
    keep_mask = df.polarity != polarity
    return df.loc[keep_mask, :]
    


In [122]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import numpy as np

In [6]:
# Load the raw training data via the project helper; the arguments select
# 'restaurants', 'train' and '2014'.
from utils import load_raw_file
df = load_raw_file('restaurants', 'train', '2014')

In [7]:
# Preview the first rows of the raw training frame.
df.head()

Unnamed: 0,id,text,term,from,to,polarity,category,category_polarity
0,3121,But the staff was so horrible to us.,staff,8,13,negative,service,negative
1,2777,"To be completely fair, the only redeeming fact...",food,57,61,positive,"food, anecdotes/miscellaneous","positive, negative"
2,1634,"The food is uniformly exceptional, with a very...",food,4,8,positive,food,positive
3,1634,"The food is uniformly exceptional, with a very...",kitchen,55,62,positive,food,positive
4,1634,"The food is uniformly exceptional, with a very...",menu,141,145,neutral,food,positive


In [14]:
def preprocess(df):
    """Build the text feature columns used for polarity classification.

    Works on a copy of `df`, so the caller's frame is not modified.

    Steps:
        1. Drop rows whose polarity is 'conflict' and keep only the
           'text', 'term' and 'polarity' columns.
        2. Replace 'text' with its stopword-free version.
        3. Add 'before_aspect' and 'after_aspect' (the text on either side
           of the aspect term) and 'without_aspect' (the text with the
           aspect term removed).

    Returns:
        The new DataFrame with the three original columns plus the three
        derived ones.
    """
    kept_columns = ['text', 'term', 'polarity']
    frame = remove_polarity('conflict', df.copy()).loc[:, kept_columns]

    # Strip stopwords from the review text.
    frame.loc[:, 'text'] = frame.text.apply(remove_stopwords)

    # NOTE: a lemmatization pass over the text (via nlp.pipe, joining each
    # token's lemma_) was previously sketched here and is currently disabled.

    # 'text' and 'term' are all the row-wise helpers below need.
    pairs = frame.loc[:, ['text', 'term']]

    def side_of_aspect(which):
        # Text before/after the aspect term, computed row by row.
        return pairs.apply(
            lambda row: split_text(row['text'], row['term'], which), axis=1)

    frame['before_aspect'] = side_of_aspect('before')
    frame['after_aspect'] = side_of_aspect('after')

    # Text with the aspect term itself removed.
    frame['without_aspect'] = pairs.apply(
        lambda row: remove_aspect(row['text'], row['term']), axis=1)

    return frame
    
    

In [17]:
# Rebind df to the preprocessed training frame (the raw frame is no longer
# referenced by this name afterwards).
df = preprocess(df)

In [18]:
# Preview the preprocessed frame, including the derived aspect columns.
df.head()

Unnamed: 0,text,term,polarity,before_aspect,after_aspect,without_aspect
0,staff horrible,staff,negative,,horrible,horrible
1,completely fair redeeming factor food average ...,food,positive,completely fair redeeming factor,average deficiencies teodora,completely fair redeeming factor average defic...
2,food uniformly exceptional capable kitchen pro...,food,positive,,uniformly exceptional capable kitchen proudly...,uniformly exceptional capable kitchen proudly...
3,food uniformly exceptional capable kitchen pro...,kitchen,positive,food uniformly exceptional capable,proudly whip feel like eating menu,food uniformly exceptional capable proudly whi...
4,food uniformly exceptional capable kitchen pro...,menu,neutral,food uniformly exceptional capable kitchen pro...,,food uniformly exceptional capable kitchen pro...


In [19]:
# Class distribution of the polarity labels after preprocessing.
df.polarity.value_counts()

positive    2164
negative     805
neutral      633
Name: polarity, dtype: int64

In [38]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier


from sklearn.metrics import classification_report


In [39]:
# Encoder that maps the polarity strings to integer class labels.
le = LabelEncoder()

In [106]:
# Training features: the review text with the aspect term removed.
X_train = df.without_aspect
# Fit the label encoder on the training polarities and encode them.
y_train = le.fit_transform(df.polarity)

In [107]:
# Learn the bag-of-words vocabulary from the aspect-free training text.
cvec = CountVectorizer()
cvec.fit(X_train)

# Dense count features for the text before and after the aspect, placed side
# by side (before-block first, after-block second).
# `.toarray()` is used instead of the sparse-matrix `.A` shorthand, which is
# deprecated/removed in recent SciPy releases; both give the same dense array.
X_train_lr = np.hstack([
    cvec.transform(df.before_aspect).toarray(),
    cvec.transform(df.after_aspect).toarray(),
])

In [108]:
# Load the raw test split and run it through the same preprocessing.
# NOTE(review): the training load passes '2014' as a third argument, this call
# does not -- confirm load_raw_file's default selects the matching dataset.
df_test = load_raw_file('restaurants', 'test')
df_test = preprocess(df_test)

In [114]:
# Test features: the review text with the aspect term removed.
X_test = df_test.without_aspect
# transform (not fit_transform): reuse the label encoding already fitted.
y_test = le.transform(df_test.polarity)

In [115]:
# Dense before/after-aspect count features for the test split, built with the
# already-fitted vectorizer. `.toarray()` replaces the sparse-matrix `.A`
# shorthand, which is deprecated/removed in recent SciPy releases.
X_test_lr = np.hstack([
    cvec.transform(df_test.before_aspect).toarray(),
    cvec.transform(df_test.after_aspect).toarray(),
])

In [116]:
# TF-IDF features (unigrams and bigrams, capped at 6000 features) fed into an
# SVC classifier with C=1.
pipe = Pipeline([
    ('vectorize', TfidfVectorizer(max_features=6000, ngram_range=(1, 2),)),
    ('clf', SVC(C=1))
])

# Fit on the aspect-free training text and the encoded polarity labels.
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorize',
                 TfidfVectorizer(max_features=6000, ngram_range=(1, 2))),
                ('clf', SVC(C=1))])