In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize,wordpunct_tokenize,sent_tokenize 
from nltk.corpus import stopwords 
import string
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC,SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [41]:
data = pd.read_csv('Computer_review.csv')

In [42]:
data.head()

Unnamed: 0,example_id,text,aspect_term,term_location,class
0,2333_1,Obviously one of the most important features o...,human interface,69--84,0
1,1805_1,Good for every day computing and web browsing.,every day computing,9--28,1
2,2782_2,while the keyboard itself is alright[comma] th...,mouse command buttons,115--136,-1
3,1385_0,Again[comma] the same problem[comma] the right...,right speaker,29--42,-1
4,1423_0,My problem was with DELL Customer Service.,DELL Customer Service,20--41,-1


In [43]:
data_copy = data.copy()

In [44]:
data_copy.columns = ['example_id','text','aspect_term','term_location','class']

In [45]:
countVectorizer = CountVectorizer()
document_matrix = countVectorizer.fit_transform(data_copy['text'])

In [46]:
vocab = list(countVectorizer.vocabulary_.keys())
len(vocab)

3246

In [47]:
def pre_process(message):
    message = message.replace("[comma]",",")
    message = " ".join(wordpunct_tokenize(message))
    nopunc = [char for char in message if char not in string.punctuation]
    message = "".join(nopunc)
    message = [text for text in message.strip().split() if text not in set(stopwords.words('english'))]    
    message = " ".join(message)
    return word_tokenize(message)

In [48]:
data_copy['text'] = data_copy['text'].apply(pre_process)

In [49]:
data_copy['text'].head()

0    [Obviously, one, important, features, computer...
1         [Good, every, day, computing, web, browsing]
2    [keyboard, alright, plate, around, cheap, plas...
3               [Again, problem, right, speaker, work]
4               [My, problem, DELL, Customer, Service]
Name: text, dtype: object

In [50]:
data_copy['mod_aspect_term'] = data_copy['aspect_term'].apply(pre_process)

In [51]:
def calculate_weights(x):
    text = x[0]
    aspect = x[1]
    if set(aspect) <= set(text):
        left_weights = right_weights = []
        sentence = " ".join(text)
        aspect_join = " ".join(aspect)
        start_list= [i for i, x in enumerate(text) if x == aspect[0]]
        for q in (start_list):
            if  text[(q + len(aspect) - 1)] == aspect[-1]:
                start_index = q
                end_index = q + len(aspect) - 1
                break
        if (end_index - start_index) == len(aspect) - 1:
            left_text = text[:start_index]
            right_text = text[end_index+1:]
            left_weights = [1/i for i in range(len(left_text),0,-1) if len(left_text) != 0]
            right_weights = [1/i for i in range(1,len(right_text)+1) if len(right_text) != 0]
        tot_weights = left_weights + [2]*len(aspect) + right_weights
        return dict(zip(text,tot_weights))
    else: 
        return np.nan

In [52]:
data_copy['weights_score'] = data_copy[['text','mod_aspect_term']].apply(calculate_weights, axis = 1)
data_copy = data_copy.dropna()
df_new = pd.DataFrame(np.zeros((len(data_copy),len(vocab))),columns=vocab)

In [59]:
data_copy['weights_score'].head()

0    {'Obviously': 0.2, 'one': 0.25, 'important': 0...
1    {'Good': 1.0, 'every': 2, 'day': 2, 'computing...
2    {'keyboard': 0.1, 'alright': 0.111111111111111...
3    {'Again': 0.5, 'problem': 1.0, 'right': 2, 'sp...
4    {'My': 0.5, 'problem': 1.0, 'DELL': 2, 'Custom...
Name: weights_score, dtype: object

In [60]:
for row in range(len(data_copy)):
    for key,value in data_copy.iloc[row]['weights_score'].items():
        df_new.iloc[row][key] = value

tfidf= TfidfTransformer().fit_transform(df_new)

### Cross Val on RBF SVM Kernel

In [61]:
svc = SVC(C=1,random_state=0)
pred_weight_svc = cross_val_predict(svc,tfidf,data_copy['class'],cv = 10)
np.mean(pred_weight_svc == data_copy['class'])

0.42616992276238075

### Cross Val on Linear SVM kernel

In [62]:
svm = LinearSVC(C=1.2,random_state=0)
pred_weight_svm = cross_val_predict(svm,tfidf,data_copy['class'],cv = 10)
np.mean(pred_weight_svm == data_copy['class'])

0.7087687414811449

### Cross Val on MultinomialNB

In [63]:
mnb = MultinomialNB()
pred_weight_mnb = cross_val_predict(mnb,tfidf,data_copy['class'],cv = 10)
np.mean(pred_weight_mnb == data_copy['class'])

0.651976374375284

### Cross Val on Random Forest Classifier

In [64]:
rfc = RandomForestClassifier(n_estimators=50,max_depth=1500,random_state=0)
pred_weight_rfc = cross_val_predict(rfc,tfidf,data_copy['class'],cv = 10)
np.mean(pred_weight_rfc == data_copy['class'])

0.7233075874602454

In [65]:
print("\n Random Forrest Metrics\n ", classification_report(pred_weight_rfc,data_copy['class']))


 Random Forrest Metrics
               precision    recall  f1-score   support

         -1       0.83      0.68      0.75      1008
          0       0.39      0.68      0.50       254
          1       0.78      0.78      0.78       939

avg / total       0.76      0.72      0.73      2201



# Holdout Method

In [66]:
x_train,x_test,y_train,y_test = train_test_split(tfidf,data_copy['class'],test_size = 0.3,random_state = 0)

#### Linear SVM 

In [67]:
model = LinearSVC(C=1.2,random_state=0).fit(X=x_train,y=y_train)
model.score(x_test,y_test)

0.6959152798789713

#### Random Forrest

In [68]:
model = RandomForestClassifier(n_estimators=50,max_depth=1500,random_state=0).fit(X=x_train,y=y_train)
model.score(x_test,y_test)

0.6853252647503782

# Final Model using Random Forrest

In [69]:
model = RandomForestClassifier(n_estimators=50,max_depth=1500,random_state=0).fit(tfidf,y=data_copy['class'])

# Run final dataset

In [24]:
test_data = pd.read_csv('Data-1_test.csv')


In [25]:
test_data.columns = ['example_id','text','aspect_term','term_location']

In [28]:
test_data.head()

Unnamed: 0,example_id,text,aspect_term,term_location
0,494:1_0,Buy the separate RAM memory and you will have ...,RAM memory,17--27
1,311:18_1,But with this laptop[comma] the bass is very w...,sound,52--57
2,256:1_0,This hardware seems to be better than the iMac...,hardware,5--13
3,748:1_0,It is very easy to navigate even for a novice.,navigate,19--27
4,128:1_0,Amazing Performance for anything I throw at it.,Performance,8--19


In [29]:
test_data['text'] = test_data['text'].apply(pre_process)

In [30]:
test_data['mod_aspect_term'] = test_data['aspect_term'].apply(pre_process)

In [31]:
test_data['weights_score'] = test_data[['text','mod_aspect_term']].apply(calculate_weights, axis = 1)
test_data = test_data.dropna()
df_test = pd.DataFrame(np.zeros((len(test_data),len(vocab))),columns=vocab)

In [32]:
for row in range(len(test_data)):
    for key,value in test_data.iloc[row]['weights_score'].items():
        df_test.iloc[row][key] = value

tfidf_test= TfidfTransformer().fit_transform(df_test)

In [33]:
test_data['class']=(model.predict(tfidf_test))

In [34]:
data_copy['class'].head()

0    0
1    1
2   -1
3   -1
4   -1
Name: class, dtype: int64