In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize,wordpunct_tokenize,sent_tokenize 
from nltk.corpus import stopwords 
import string
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC,SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv('Restaurant_review.csv')

In [3]:
data.head()

Unnamed: 0,example_id,text,aspect_term,term_location,class
0,3121_0,But the staff was so horrible to us.,staff,8--13,-1
1,2777_0,To be completely fair[comma] the only redeemin...,food,57--61,1
2,1634_0,The food is uniformly exceptional[comma] with ...,food,4--8,1
3,1634_1,The food is uniformly exceptional[comma] with ...,kitchen,55--62,1
4,1634_2,The food is uniformly exceptional[comma] with ...,menu,141--145,0


In [4]:
data_copy = data.copy()

In [5]:
data_copy.columns = ['example_id','text','aspect_term','term_location','class']

In [6]:
countVectorizer = CountVectorizer()
document_matrix = countVectorizer.fit_transform(data_copy['text'])

In [7]:
vocab = list(countVectorizer.vocabulary_.keys())
len(vocab)

3716

In [8]:
def pre_process(message):
    message = message.replace("[comma]",",")
    message = " ".join(wordpunct_tokenize(message))
    nopunc = [char for char in message if char not in string.punctuation]
    message = "".join(nopunc)
    message = [text for text in message.strip().split() if text not in set(stopwords.words('english'))]    
    message = " ".join(message)
    return word_tokenize(message)

In [9]:
data_copy['text'] = data_copy['text'].apply(pre_process)

In [10]:
data_copy['text'].head()

0                           [But, staff, horrible, us]
1    [To, completely, fair, redeeming, factor, food...
2    [The, food, uniformly, exceptional, capable, k...
3    [The, food, uniformly, exceptional, capable, k...
4    [The, food, uniformly, exceptional, capable, k...
Name: text, dtype: object

In [11]:
data_copy['mod_aspect_term'] = data_copy['aspect_term'].apply(pre_process)

In [12]:
def calculate_weights(x):
    text = x[0]
    aspect = x[1]
    if set(aspect) <= set(text):
        left_weights = right_weights = []
        sentence = " ".join(text)
        aspect_join = " ".join(aspect)
        start_list= [i for i, x in enumerate(text) if x == aspect[0]]
        for q in (start_list):
            if  text[(q + len(aspect) - 1)] == aspect[-1]:
                start_index = q
                end_index = q + len(aspect) - 1
                break
        if (end_index - start_index) == len(aspect) - 1:
            left_text = text[:start_index]
            right_text = text[end_index+1:]
            left_weights = [1/i for i in range(len(left_text),0,-1) if len(left_text) != 0]
            right_weights = [1/i for i in range(1,len(right_text)+1) if len(right_text) != 0]
        tot_weights = left_weights + [2]*len(aspect) + right_weights
        return dict(zip(text,tot_weights))
    else: 
        return np.nan

In [13]:
data_copy['weights_score'] = data_copy[['text','mod_aspect_term']].apply(calculate_weights, axis = 1)
data_copy = data_copy.dropna()
df_new = pd.DataFrame(np.zeros((len(data_copy),len(vocab))),columns=vocab)

In [14]:
data_copy['weights_score'].head()

0    {'But': 1.0, 'staff': 2, 'horrible': 1.0, 'us'...
1    {'To': 0.2, 'completely': 0.25, 'fair': 0.3333...
2    {'The': 1.0, 'food': 2, 'uniformly': 1.0, 'exc...
3    {'The': 0.2, 'food': 0.25, 'uniformly': 0.3333...
4    {'The': 0.07692307692307693, 'food': 0.0833333...
Name: weights_score, dtype: object

In [15]:
for row in range(len(data_copy)):
    for key,value in data_copy.iloc[row]['weights_score'].items():
        df_new.iloc[row][key] = value

tfidf= TfidfTransformer().fit_transform(df_new)

### Cross Val on RBF SVM Kernel

In [16]:
svc = SVC(C=1.2,random_state=0)
pred_weight_svc = cross_val_predict(svc,tfidf,data_copy['class'],cv = 10)
np.mean(pred_weight_svc == data_copy['class'])

0.6008333333333333

### Cross Val on Linear SVM kernel

In [17]:
svm = LinearSVC(C=1.2,random_state=0)
pred_weight_svm = cross_val_predict(svm,tfidf,data_copy['class'],cv = 10)
np.mean(pred_weight_svm == data_copy['class'])

0.6683333333333333

### Cross Val on MultinomialNB

In [18]:
mnb = MultinomialNB()
pred_weight_mnb = cross_val_predict(mnb,tfidf,data_copy['class'],cv = 10)
np.mean(pred_weight_mnb == data_copy['class'])

0.6305555555555555

### Cross Val on Random Forest Classifier

In [19]:
rfc = RandomForestClassifier(n_estimators=50,max_depth=1500,random_state=0)
pred_weight_rfc = cross_val_predict(rfc,tfidf,data_copy['class'],cv = 10)
np.mean(pred_weight_rfc == data_copy['class'])

0.6525

In [20]:
print("\n Random Forrest Metrics\n ", classification_report(pred_weight_rfc,data_copy['class']))


 Random Forrest Metrics
               precision    recall  f1-score   support

         -1       0.32      0.60      0.42       430
          0       0.18      0.38      0.25       305
          1       0.91      0.69      0.79      2865

avg / total       0.78      0.65      0.70      3600



# Holdout Method

In [21]:
x_train,x_test,y_train,y_test = train_test_split(tfidf,data_copy['class'],test_size = 0.3,random_state = 0)

#### Linear SVM 

In [22]:
model = LinearSVC(C=1.2,random_state=0).fit(X=x_train,y=y_train)
model.score(x_test,y_test)

0.7111111111111111

#### Random Forrest

In [23]:
model = RandomForestClassifier(n_estimators=50,max_depth=1500,random_state=0).fit(X=x_train,y=y_train)
model.score(x_test,y_test)

0.712037037037037

# Final Model using Random Forrest

In [24]:
model = RandomForestClassifier(n_estimators=50,max_depth=1500,random_state=0).fit(tfidf,y=data_copy['class'])

# Run final dataset

In [25]:
test_data = pd.read_csv('Data-2_test.csv')


In [26]:
test_data.columns = ['example_id','text','aspect_term','term_location']

In [27]:
test_data.head()

Unnamed: 0,example_id,text,aspect_term,term_location
0,32933228#1700177#1_2,I reccomend the fried pork dumplings[comma] th...,fried rice,71--81
1,35820984#608922#3_0,The staff is very sharp and they look good too.,staff,4--9
2,35170181#0#5_1,The best dessert[comma] a chocolate and peanut...,chocolate and peanut butter tart,20--52
3,33067279#1612676#1_1,The food was very good and I was pleasantly su...,vegan options,69--82
4,32882616#562969#3_0,I never had an orange donut before so I gave i...,orange donut,15--27


In [28]:
test_data['text'] = test_data['text'].apply(pre_process)

In [29]:
test_data['mod_aspect_term'] = test_data['aspect_term'].apply(pre_process)

In [30]:
test_data['weights_score'] = test_data[['text','mod_aspect_term']].apply(calculate_weights, axis = 1)
test_data = test_data.dropna()
df_test = pd.DataFrame(np.zeros((len(test_data),len(vocab))),columns=vocab)

In [31]:
for row in range(len(test_data)):
    for key,value in test_data.iloc[row]['weights_score'].items():
        df_test.iloc[row][key] = value

tfidf_test= TfidfTransformer().fit_transform(df_test)

In [32]:
test_data['class']=(model.predict(tfidf_test))

In [33]:
data_copy['class'].head()

0   -1
1    1
2    1
3    1
4    0
Name: class, dtype: int64