In [1]:
import numpy as np
import pandas as pd
import string
import matplotlib.pyplot as plt
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [2]:
# importing training data
df = pd.read_csv('restaurant_train.csv')
df.columns = ['example_id','text','aspect_term','term_location','class']
df.head()

Unnamed: 0,example_id,text,aspect_term,term_location,class
0,3121_0,But the staff was so horrible to us.,staff,8--13,-1
1,2777_0,To be completely fair[comma] the only redeemin...,food,57--61,1
2,1634_0,The food is uniformly exceptional[comma] with ...,food,4--8,1
3,1634_1,The food is uniformly exceptional[comma] with ...,kitchen,55--62,1
4,1634_2,The food is uniformly exceptional[comma] with ...,menu,141--145,0


In [3]:
# importing testing data
df_test2 = pd.read_csv('restaurant_test.csv')
df_test2.columns = ['example_id','text','aspect_term','term_location']
df_test2.head()

Unnamed: 0,example_id,text,aspect_term,term_location
0,32933228#1700177#1_2,I reccomend the fried pork dumplings[comma] th...,fried rice,71--81
1,35820984#608922#3_0,The staff is very sharp and they look good too.,staff,4--9
2,35170181#0#5_1,The best dessert[comma] a chocolate and peanut...,chocolate and peanut butter tart,20--52
3,33067279#1612676#1_1,The food was very good and I was pleasantly su...,vegan options,69--82
4,32882616#562969#3_0,I never had an orange donut before so I gave i...,orange donut,15--27


In [4]:
def text_prep(text):
    text = text.replace("[comma]", "")
    no_punct = [char for char in text if char not in string.punctuation]
    text = "".join(no_punct)
    # Using lemmatization instead of stemming
    text = [WordNetLemmatizer().lemmatize(text, pos='v') for text in text.lower().split() if text not in set(stopwords.words('english'))] 
    text = " ".join(text)
    return word_tokenize(text)

In [5]:
def get_weights(x):
    text = x[0]
    aspect = x[1]
    if set(aspect) <= set(text):
        left_weights = right_weights = []
        start_list= [i for i, x in enumerate(text) if x == aspect[0]]
        for q in (start_list):
            if  text[(q + len(aspect) - 1)] == aspect[-1]:
                start_index = q
                end_index = q + len(aspect) - 1
                break
        if (end_index - start_index) == len(aspect) - 1:
            left_text = text[:start_index]
            right_text = text[end_index+1:]
            left_weights = [1/i for i in range(len(left_text),0,-1) if len(left_text) != 0]
            right_weights = [1/i for i in range(1,len(right_text)+1) if len(right_text) != 0]
        total_weights = left_weights + [2]*len(aspect) + right_weights
        return dict(zip(text,total_weights))
    else: 
        return np.nan

## Data Preprocessing 

In [6]:
# preprocess text and aspect_term columns
df['prep_text'] = df['text'].apply(text_prep)
df['prep_aspect_term'] = df['aspect_term'].apply(text_prep)
df[['prep_text', 'prep_aspect_term']].head()

Unnamed: 0,prep_text,prep_aspect_term
0,"[staff, horrible, us]",[staff]
1,"[completely, fair, redeem, factor, food, avera...",[food]
2,"[food, uniformly, exceptional, capable, kitche...",[food]
3,"[food, uniformly, exceptional, capable, kitche...",[kitchen]
4,"[food, uniformly, exceptional, capable, kitche...",[menu]


In [7]:
# preprocessing test data
df_test2['prep_test2_text'] = df_test2['text'].apply(text_prep)
df_test2['prep_test2_aspect_term'] = df_test2['aspect_term'].apply(text_prep)
df_test2[['prep_test2_text', 'prep_test2_aspect_term']].head()

Unnamed: 0,prep_test2_text,prep_test2_aspect_term
0,"[reccomend, fry, pork, dumplings, orange, chic...","[fry, rice]"
1,"[staff, sharp, look, good]",[staff]
2,"[best, dessert, chocolate, peanut, butter, tar...","[chocolate, peanut, butter, tart]"
3,"[food, good, pleasantly, surprise, see, many, ...","[vegan, options]"
4,"[never, orange, donut, give, shoot]","[orange, donut]"


## Assigning weights

In [8]:
# weighing training data
df['weights_score'] = df[['prep_text','prep_aspect_term']].apply(get_weights, axis = 1)
df = df.dropna()
df.iloc[1]['weights_score']

{u'average': 1,
 u'completely': 0,
 u'couldnt': 0,
 u'deficiencies': 0,
 u'factor': 1,
 u'fair': 0,
 u'food': 2,
 u'make': 0,
 u'redeem': 0,
 u'teodora': 0}

In [9]:
# weighing testest data
df_test2['test2_weights_score'] = df_test2[['prep_test2_text','prep_test2_aspect_term']].apply(get_weights, axis = 1)
df_test2 = df_test2.dropna()
df_test2.iloc[0]['test2_weights_score']

{u'chickenbeef': 1,
 u'dumplings': 0,
 u'fry': 2,
 u'orange': 0,
 u'pork': 0,
 u'reccomend': 0,
 u'rice': 2}

## Vectorizing data using CountVectorizer and Tf-idf Transformer 

In [10]:
# training data vectorized
vec = CountVectorizer()
doc_matrix = vec.fit_transform(df['text'])
vocab = list(vec.vocabulary_.keys())
df_weights = pd.DataFrame(np.zeros((len(df),len(vocab))),columns=vocab)
for row in range(len(df)):
    for key,value in df.iloc[row]['weights_score'].items():
        df_weights.iloc[row][key] = value
        
tfidf_vec = TfidfTransformer()
tfidf = tfidf_vec.fit_transform(df_weights)

In [11]:
# testing data vectorized
test2_doc_matrix = vec.transform(df_test2['text'])
test2_vocab = list(vec.vocabulary_.keys())
df_test2_weights = pd.DataFrame(np.zeros((len(df_test2),len(test2_vocab))),columns=test2_vocab)
for row in range(len(df_test2)):
    for key,value in df_test2.iloc[row]['test2_weights_score'].items():
        df_test2_weights.iloc[row][key] = value

test2_tfidf = tfidf_vec.transform(df_test2_weights)

## Random Forest as final model 

In [12]:
# Choosing Random Forest as our final model
rfc = RandomForestClassifier(n_estimators=50,max_depth=2000)
crv_rfc = cross_val_predict(rfc,tfidf,df['class'],cv = 10)
np.mean(crv_rfc == df['class'])

0.6270571827057183

In [13]:
Y = df['class']
print("\n Classification Report \n ", classification_report(crv_rfc,Y))
rfc = RandomForestClassifier(n_estimators=50,max_depth=2000).fit(tfidf, df['class'])
test_rfc = rfc.predict(test2_tfidf)

('\n Classification Report \n ', u'              precision    recall  f1-score   support\n\n          -1       0.37      0.46      0.41       640\n           0       0.27      0.45      0.34       384\n           1       0.83      0.70      0.76      2561\n\n   micro avg       0.63      0.63      0.63      3585\n   macro avg       0.49      0.53      0.50      3585\nweighted avg       0.69      0.63      0.65      3585\n')


## Output file

In [14]:
# Please remember to delete the txt file created before otherwise it will append to the previous results

with open('Result.txt', 'a') as out:
    for i,j in zip(df_test2['example_id'],test_rfc):
        out.write(str(i) + ";;" + str(j) + "\n")

## Multinomial Naive Bayes

In [15]:
mnb = MultinomialNB()
crv_mnb = cross_val_predict(mnb,tfidf,df['class'],cv = 10)
np.mean(crv_mnb == df['class'])


0.6167364016736402

In [16]:
Y = df['class']
print("\n Classification Report \n ", classification_report(crv_mnb,Y))

('\n Classification Report \n ', u'              precision    recall  f1-score   support\n\n          -1       0.07      0.67      0.13        87\n           0       0.03      0.52      0.07        42\n           1       0.99      0.62      0.76      3456\n\n   micro avg       0.62      0.62      0.62      3585\n   macro avg       0.37      0.60      0.32      3585\nweighted avg       0.96      0.62      0.74      3585\n')


## Linear Support Vector Machine

In [17]:
svm = LinearSVC()
crv_svm = cross_val_predict(svm,tfidf,df['class'],cv = 10)
np.mean(crv_svm == df['class'])


0.6440725244072525

In [18]:
Y = df['class']
print("\n Classification Report \n ", classification_report(crv_svm,Y))

('\n Classification Report \n ', u'              precision    recall  f1-score   support\n\n          -1       0.36      0.52      0.42       557\n           0       0.30      0.44      0.35       433\n           1       0.85      0.71      0.77      2595\n\n   micro avg       0.64      0.64      0.64      3585\n   macro avg       0.50      0.55      0.52      3585\nweighted avg       0.71      0.64      0.67      3585\n')
