In [35]:
import pandas as pd
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from scipy.sparse import hstack
import warnings

from preprocessing.xml_2_dataframe import Xml2DataFrame
from preprocessing.pos_tagger import POSTagger


In [36]:
path_train = './data/Laptops_Train_v2.xml'
path_test = r'./data/Laptops_Test_Gold.xml'
new_test_path = r'./data/test.xml'


In [37]:
# xml parser
def get_xml_data(path):
    xml2df = Xml2DataFrame()
    xml_dataframe = xml2df.process_data(path)
    return xml_dataframe


# df = get_xml_data(path_train)
# df.head()

In [38]:
# Making list to train
train_dataframe = get_xml_data(path_train)
# print(train_dataframe.head())
train_text_list = train_dataframe['text']
train_aspects_list = list(train_dataframe['aspect_info'])
print(train_text_list.head())
print("=============================")
# print(train_aspects_list[:5])

0    i charge it at night and skip taking the cord with me because of the good battery life.                                                                                            
1    i bought a hp pavilion dv4-1222nr laptop and have had so many problems with the computer.                                                                                          
2    the tech guy then said the service center does not do 1-to-1 exchange and i have to direct my concern to the "sales" team, which is the retail shop which i bought my netbook from.
3    i investigated netbooks and saw the toshiba nb305-n410bl.                                                                                                                          
4    the other day i had a presentation to do for a seminar at a large conference in town- lots of people, little time to prep and have to set up a computer to a projector, etc.       
Name: text, dtype: object


In [39]:
# POSTag function wrapper
def pos_tag(review):
    tagged_text_list = []
    pos_tagger = POSTagger()
    for text in review:
        tagged_text_list.append(pos_tagger.pos_tagger(text=text))
    return tagged_text_list


# POSTag filtering of noun,adjective,verb,adverb
def filter_tag(tagged_reviews):
    filtered_list = []
    pos_tagger = POSTagger()
    for tagged_review in tagged_reviews:
        filtered_list.append(pos_tagger.filter_pos_tag(tagged_review))
    return filtered_list

In [40]:
# tag POS
tagged_text_list_train = pos_tag(train_text_list)
joblib.dump(tagged_text_list_train, 'tagged_text_list_train.pkl')
tagged_text_list_train = joblib.load('tagged_text_list_train.pkl')
# print(tagged_text_list_train[:5])

In [41]:
# train list after filtering
final_train_text_list = filter_tag(tagged_text_list_train)

print(final_train_text_list[:5])

['charge night skip taking cord good battery life', 'bought hp pavilion dv4-1222nr laptop many problems computer', 'tech guy said service center 1-to-1 exchange direct concern sales team retail shop bought netbook', 'investigated netbooks saw toshiba nb305-n410bl', 'day presentation seminar large conference town- lots people little time prep set computer projector etc']


In [42]:
#Selecting only 20 most common aspect.
def get_most_common_aspect(aspect_list):
    import nltk
    aspect_terms = []

    aspect_list = list(aspect_list.aspect_info)

    for inner_list in aspect_list:
        if inner_list is not None:
            for _dict in inner_list:
                # for key in _dict:
                aspect_terms.append(_dict.get('term'))

    most_common_aspect = [k for k, v in nltk.FreqDist(aspect_terms).most_common(50)]
    return most_common_aspect

In [43]:
# generate data frame
def get_data_frame(text_list, train_aspects_list, most_common_aspect):
    data = {'Text': text_list}
    df = pd.DataFrame(data)
    for inner_list in train_aspects_list:
        if inner_list is not None:
            for _dict in inner_list:
                # for key in _dict:
                if _dict.get('term') in most_common_aspect:
                    df.loc[train_aspects_list.index(inner_list), _dict.get('term')] = _dict.get('polarity')
    return df

In [44]:
# generate data frame for aspect extraction task
def get_aspect_data_frame(df, most_common_aspect):
    for common_aspect in most_common_aspect:
        df[common_aspect] = df[common_aspect].replace(['positive','negative','neutral','conflict'],[1,1,1,1])
    df = df.fillna(0)
    return df

In [45]:
most_common_aspect = get_most_common_aspect(train_dataframe)
print(most_common_aspect)

['screen', 'price', 'use', 'battery life', 'battery', 'keyboard', 'programs', 'software', 'features', 'warranty', 'hard drive', 'quality', 'size', 'performance', 'speed', 'Windows', 'memory', 'graphics', 'applications', 'motherboard', 'Vista', 'runs', 'charge', 'works', 'program', 'system', 'gaming', 'design', 'display', 'power supply', 'mouse', 'warrenty', 'Windows 7', 'speakers', 'value', 'operating system', 'service', 'keys', 'windows', 'OS', 'look', 'games', 'extended warranty', 'work', 'cost', 'shipping', 'processor', 'weight', 'carry', 'power']


In [46]:
#get data frame
df_train = get_data_frame(final_train_text_list,train_aspects_list, most_common_aspect)
df_train.head()

Unnamed: 0,Text,battery life,quality,applications,use,features,screen,battery,gaming,speed,...,power,OS,programs,size,design,Windows,weight,speakers,carry,charge
0,charge night skip taking cord good battery life,positive,,,,,,,,,...,,,,,,,,,,
1,bought hp pavilion dv4-1222nr laptop many problems computer,,,,,,,,,,...,,,,,,,,,,
2,tech guy said service center 1-to-1 exchange direct concern sales team retail shop bought netbook,,,,,,,,,,...,,,,,,,,,,
3,investigated netbooks saw toshiba nb305-n410bl,,,,,,,,,,...,,,,,,,,,,
4,day presentation seminar large conference town- lots people little time prep set computer projector etc,,,,,,,,,,...,,,,,,,,,,


In [47]:
# get expect term df
df_train_aspect = get_aspect_data_frame(df_train, most_common_aspect)
df_train_aspect.head()

Unnamed: 0,Text,battery life,quality,applications,use,features,screen,battery,gaming,speed,...,power,OS,programs,size,design,Windows,weight,speakers,carry,charge
0,charge night skip taking cord good battery life,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,bought hp pavilion dv4-1222nr laptop many problems computer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tech guy said service center 1-to-1 exchange direct concern sales team retail shop bought netbook,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,investigated netbooks saw toshiba nb305-n410bl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,day presentation seminar large conference town- lots people little time prep set computer projector etc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
df_train_aspect = df_train_aspect.reindex(sorted(df_train_aspect.columns), axis=1)
df_train_aspect.head()

Unnamed: 0,OS,Text,Vista,Windows,Windows 7,applications,battery,battery life,carry,charge,...,speed,system,use,value,warranty,warrenty,weight,windows,work,works
0,0.0,charge night skip taking cord good battery life,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,bought hp pavilion dv4-1222nr laptop many problems computer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,tech guy said service center 1-to-1 exchange direct concern sales team retail shop bought netbook,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,investigated netbooks saw toshiba nb305-n410bl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,day presentation seminar large conference town- lots people little time prep set computer projector etc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Similar for test list
test_dataframe = get_xml_data(path_test)
test_text_list = train_dataframe['text']
test_aspects_list = list(train_dataframe['aspect_info'])

In [50]:
tagged_text_list_test = pos_tag(test_text_list)
joblib.dump(tagged_text_list_test, 'tagged_text_list_test.pkl')
tagged_text_list_test=joblib.load('tagged_text_list_test.pkl')

In [51]:
final_test_text_list = filter_tag(tagged_text_list_test)

In [52]:
df_test = get_data_frame(final_test_text_list,test_aspects_list, most_common_aspect)
df_test_aspect = get_aspect_data_frame(df_test, most_common_aspect)
df_test_aspect = df_test_aspect.reindex(sorted(df_test_aspect.columns), axis=1)

In [53]:
# Sort the data frame according to aspect's name and separate data(X) and target(y)
# df_train_aspect = df_train_aspect.sample(frac=1).reset_index(drop=True) # For randomization
X_train= df_train_aspect.Text
y_train = df_train_aspect.drop('Text', 1)
print(y_train[:5])


# df_test_aspect = df_test_aspect.sample(frac=1).reset_index(drop=True) # For randomization
X_test = df_test_aspect.Text
y_test = df_test_aspect.drop('Text', 1)
final_most_common_aspect = list(y_train)
list(y_train)

    OS  Vista  Windows  Windows 7  applications  battery  battery life  carry  \
0  0.0  0.0    0.0      0.0        0.0           0.0      1.0           0.0     
1  0.0  0.0    0.0      0.0        0.0           0.0      0.0           0.0     
2  0.0  0.0    0.0      0.0        0.0           0.0      0.0           0.0     
3  0.0  0.0    0.0      0.0        0.0           0.0      0.0           0.0     
4  0.0  0.0    0.0      0.0        0.0           0.0      0.0           0.0     

   charge  cost  ...    speed  system  use  value  warranty  warrenty  weight  \
0  0.0     0.0   ...    0.0    0.0     0.0  0.0    0.0       0.0       0.0      
1  0.0     0.0   ...    0.0    0.0     0.0  0.0    0.0       0.0       0.0      
2  0.0     0.0   ...    0.0    0.0     0.0  0.0    0.0       0.0       0.0      
3  0.0     0.0   ...    0.0    0.0     0.0  0.0    0.0       0.0       0.0      
4  0.0     0.0   ...    0.0    0.0     0.0  0.0    0.0       0.0       0.0      

   windows  work  works  


['OS',
 'Vista',
 'Windows',
 'Windows 7',
 'applications',
 'battery',
 'battery life',
 'carry',
 'charge',
 'cost',
 'design',
 'display',
 'extended warranty',
 'features',
 'games',
 'gaming',
 'graphics',
 'hard drive',
 'keyboard',
 'keys',
 'look',
 'memory',
 'motherboard',
 'mouse',
 'operating system',
 'performance',
 'power',
 'power supply',
 'price',
 'processor',
 'program',
 'programs',
 'quality',
 'runs',
 'screen',
 'service',
 'shipping',
 'size',
 'software',
 'speakers',
 'speed',
 'system',
 'use',
 'value',
 'warranty',
 'warrenty',
 'weight',
 'windows',
 'work',
 'works']

In [54]:
# Change y_train to numpy array
import numpy as np
y_train = np.asarray(y_train, dtype=np.int64)
y_test = np.asarray(y_test, dtype=np.int64)
print(y_train[:5])

[[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [55]:
# Generate word vecotors using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# from nltk import word_tokenize          
# from nltk.stem import WordNetLemmatizer 
vect = CountVectorizer(max_df=1.0, stop_words='english')  
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [56]:
# Create various models. These are multi-label models.
nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train_dtm, y_train)
C = 1.0 
# SVregularization parameter
svc = OneVsRestClassifier(svm.SVC(kernel='linear', C=C)).fit(X_train_dtm, y_train)
lin_svc = OneVsRestClassifier(svm.LinearSVC(C=C)).fit(X_train_dtm, y_train)
sgd = OneVsRestClassifier(SGDClassifier(max_iter=1000)).fit(X_train_dtm,y_train)

In [57]:
# Predict the test data using classifiers
y_pred_class = nb_classif.predict(X_test_dtm)
y_pred_class_svc = svc.predict(X_test_dtm)
y_pred_class_lin_svc = lin_svc.predict(X_test_dtm)
y_pred_class_sgd = sgd.predict(X_test_dtm)

In [58]:
# Following code to test metrics of all aspect extraction classifiers
from sklearn import metrics

In [59]:
print(metrics.accuracy_score(y_test,y_pred_class))
print(metrics.accuracy_score(y_test,y_pred_class_svc))
print(metrics.accuracy_score(y_test,y_pred_class_lin_svc))
print(metrics.accuracy_score(y_test,y_pred_class_sgd))

0.81871921182266
0.9970443349753695
0.9973727422003285
0.9977011494252873


In [60]:
print(metrics.precision_score(y_test,y_pred_class,average='micro'))
print(metrics.precision_score(y_test,y_pred_class_svc,average='micro'))
print(metrics.precision_score(y_test,y_pred_class_lin_svc,average='micro'))
print(metrics.precision_score(y_test,y_pred_class_sgd,average='micro'))

0.8853046594982079
0.9921436588103255
0.992152466367713
0.9932659932659933


In [61]:
print(metrics.recall_score(y_test,y_pred_class,average='micro'))
print(metrics.recall_score(y_test,y_pred_class_svc,average='micro'))
print(metrics.recall_score(y_test,y_pred_class_lin_svc,average='micro'))
print(metrics.recall_score(y_test,y_pred_class_sgd,average='micro'))

0.27815315315315314
0.9954954954954955
0.9966216216216216
0.9966216216216216


In [62]:
print(metrics.f1_score(y_test,y_pred_class,average='micro'))
print(metrics.f1_score(y_test,y_pred_class_svc,average='micro'))
print(metrics.f1_score(y_test,y_pred_class_lin_svc,average='micro'))
print(metrics.f1_score(y_test,y_pred_class_sgd,average='micro'))

0.42330762639245934
0.9938167509836987
0.99438202247191
0.9949409780775715


In [63]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(metrics.classification_report(y_test, y_pred_class))
    print(metrics.classification_report(y_test, y_pred_class_svc))
    print(metrics.classification_report(y_test, y_pred_class_lin_svc))
    print(metrics.classification_report(y_test, y_pred_class_sgd))

             precision    recall  f1-score   support

          0       1.00      0.33      0.50         9
          1       1.00      0.27      0.43        11
          2       1.00      0.11      0.19        19
          3       1.00      0.10      0.18        10
          4       1.00      0.19      0.32        16
          5       0.82      0.23      0.36        39
          6       0.83      0.43      0.57        44
          7       1.00      0.12      0.22         8
          8       1.00      0.50      0.67        10
          9       0.00      0.00      0.00         9
         10       0.75      0.25      0.38        12
         11       1.00      0.40      0.57        10
         12       1.00      0.22      0.36         9
         13       0.90      0.28      0.43        32
         14       0.00      0.00      0.00         8
         15       1.00      0.15      0.27        13
         16       1.00      0.14      0.25        14
         17       0.93      0.45      0.60   

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         9
          1       1.00      1.00      1.00        11
          2       1.00      0.95      0.97        19
          3       1.00      1.00      1.00        10
          4       1.00      1.00      1.00        16
          5       0.97      1.00      0.99        39
          6       0.98      1.00      0.99        44
          7       1.00      1.00      1.00         8
          8       1.00      1.00      1.00        10
          9       1.00      1.00      1.00         9
         10       1.00      1.00      1.00        12
         11       1.00      1.00      1.00        10
         12       1.00      1.00      1.00         9
         13       1.00      1.00      1.00        32
         14       1.00      1.00      1.00         8
         15       1.00      1.00      1.00        13
         16       1.00      1.00      1.00        14
         17       0.97      1.00      0.98   

In [64]:
def get_dict_aspect(y,most_common_aspect):
    position=[]
    for innerlist in y:
        position.append([i for i, j in enumerate(innerlist) if j == 1])
    sorted_common=sorted(most_common_aspect)
    dict_aspect=[]
    for innerlist in position:
        inner_dict={}
        for word in sorted_common:
            if sorted_common.index(word) in innerlist:
                inner_dict[word]= 5
            else:
                inner_dict[word]=0
        dict_aspect.append(inner_dict)
    return dict_aspect

In [65]:
# Generating extra feature that indicates which aspect category is present in the review
train_dict_aspect=get_dict_aspect(y_train, most_common_aspect)
d_train=DictVectorizer() 
X_train_aspect_dtm = d_train.fit_transform(train_dict_aspect)

# y_test is used to generated extra feature in order to test the performance of 2nd classifer.
#Use y_pred_class_svc(Highest performer for aspect classification) as input for extra feature to test the overall performace.
test_dict_aspect=get_dict_aspect(y_test, most_common_aspect)
d_test=DictVectorizer() 

In [66]:
from src.BIO_format import BIO

def BIO_format(text, predicted_output, common_words):
    bio_obj = BIO()
    df = bio_obj.convert_into_bio(text, predicted_output, common_words_list=common_words)
    return df

In [67]:
# Aspect term extractor of user's input.
# user_input=input("Enter a laptop review:\n\n")
user_input = "The battery life is really good and its size is reasonable"
# Preprocessing and vectorizing
tagged_user_input = pos_tag([user_input])
print(tagged_user_input)
filter_tagged_user_input = filter_tag(tagged_user_input)
print(filter_tagged_user_input)

user_input_series = pd.Series(filter_tagged_user_input)
print(user_input_series)
user_input_series_dtm = vect.transform(user_input_series)
print(user_input_series_dtm)
# print(user_input_series[:5])

predict_aspect= sgd.predict(user_input_series_dtm)
print(predict_aspect)
# predict_aspect_data = predict_aspect[0]
extra_feature=get_dict_aspect(predict_aspect, most_common_aspect)
extra_feature_dtm=DictVectorizer().fit_transform(extra_feature)
predict_aspect

[[('The', 'DT'), ('battery', 'NN'), ('life', 'NN'), ('really', 'RB'), ('good', 'JJ'), ('size', 'NN'), ('reasonable', 'JJ')]]
['battery life really good size reasonable']
0    battery life really good size reasonable
dtype: object
  (0, 259)	1
  (0, 1339)	1
  (0, 1810)	1
  (0, 2573)	1
  (0, 2577)	1
  (0, 2944)	1
[[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 1 0 0 0 0 0 0 0 0 0 0 0 0]]


array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]])

In [68]:
df = BIO_format(user_input, predict_aspect[0], final_most_common_aspect)
df

Unnamed: 0,BIO,text
0,O,The
1,B,battery
2,I,life
3,O,is
4,O,really
5,O,good
6,O,and
7,O,its
8,B,size
9,O,is
