In [110]:
import pandas as pd
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from scipy.sparse import hstack
import warnings

from preprocessing.xml_2_dataframe import Xml2DataFrame
from preprocessing.pos_tagger import POSTagger


In [111]:
path_train = './data/Laptops_Train_v2.xml'
path_test = r'./data/Laptops_Test_Gold.xml'
new_test_path = r'./data/test.xml'


In [112]:
# xml parser
def get_xml_data(path):
    xml2df = Xml2DataFrame()
    xml_dataframe = xml2df.process_data(path)
    return xml_dataframe


# df = get_xml_data(path_train)
# df.head()

In [113]:
# Making list to train
train_dataframe = get_xml_data(path_train)
# print(train_dataframe.head())
train_text_list = train_dataframe['text']
train_aspects_list = list(train_dataframe['aspect_info'])
print(train_text_list.head())
print("=============================")
# print(train_aspects_list[:5])

0    i charge it at night and skip taking the cord with me because of the good battery life.                                                                                            
1    i bought a hp pavilion dv4-1222nr laptop and have had so many problems with the computer.                                                                                          
2    the tech guy then said the service center does not do 1-to-1 exchange and i have to direct my concern to the "sales" team, which is the retail shop which i bought my netbook from.
3    i investigated netbooks and saw the toshiba nb305-n410bl.                                                                                                                          
4    the other day i had a presentation to do for a seminar at a large conference in town- lots of people, little time to prep and have to set up a computer to a projector, etc.       
Name: text, dtype: object


In [114]:
# POSTag function wrapper
def pos_tag(review):
    tagged_text_list = []
    pos_tagger = POSTagger()
    for text in review:
        tagged_text_list.append(pos_tagger.pos_tagger(text=text))
    return tagged_text_list


# POSTag filtering of noun,adjective,verb,adverb
def filter_tag(tagged_reviews):
    filtered_list = []
    pos_tagger = POSTagger()
    for tagged_review in tagged_reviews:
        filtered_list.append(pos_tagger.filter_pos_tag(tagged_review))
    return filtered_list

In [115]:
# tag POS
tagged_text_list_train = pos_tag(train_text_list)
joblib.dump(tagged_text_list_train, 'tagged_text_list_train.pkl')
tagged_text_list_train = joblib.load('tagged_text_list_train.pkl')
# print(tagged_text_list_train[:5])

In [116]:
# train list after filtering
final_train_text_list = filter_tag(tagged_text_list_train)

print(final_train_text_list[:5])

['charge night skip taking cord good battery life', 'bought hp pavilion dv4-1222nr laptop many problems computer', 'tech guy said service center 1-to-1 exchange direct concern sales team retail shop bought netbook', 'investigated netbooks saw toshiba nb305-n410bl', 'day presentation seminar large conference town- lots people little time prep set computer projector etc']


In [117]:
#Selecting only 20 most common aspect.
def get_most_common_aspect(aspect_list):
    import nltk
    aspect_terms = []

    aspect_list = list(aspect_list.aspect_info)

    for inner_list in aspect_list:
        if inner_list is not None:
            for _dict in inner_list:
                # for key in _dict:
                aspect_terms.append(_dict.get('term'))

    most_common_aspect = [k for k, v in nltk.FreqDist(aspect_terms).most_common(1000)]
    return most_common_aspect

In [118]:
# generate data frame
def get_data_frame(text_list, train_aspects_list, most_common_aspect):
    data = {'Text': text_list}
    df = pd.DataFrame(data)
    for inner_list in train_aspects_list:
        if inner_list is not None:
            for _dict in inner_list:
                # for key in _dict:
                if _dict.get('term') in most_common_aspect:
                    df.loc[train_aspects_list.index(inner_list), _dict.get('term')] = _dict.get('polarity')
    return df

In [119]:
# generate data frame for aspect extraction task
def get_aspect_data_frame(df, most_common_aspect):
    for common_aspect in most_common_aspect:
        df[common_aspect] = df[common_aspect].replace(['positive','negative','neutral','conflict'],[1,1,1,1])
    df = df.fillna(0)
    return df

In [120]:
most_common_aspect = get_most_common_aspect(train_dataframe)
print(most_common_aspect)

['screen', 'price', 'use', 'battery life', 'battery', 'keyboard', 'programs', 'software', 'features', 'warranty', 'hard drive', 'quality', 'size', 'performance', 'speed', 'Windows', 'memory', 'graphics', 'applications', 'motherboard', 'Vista', 'runs', 'charge', 'works', 'program', 'system', 'gaming', 'design', 'display', 'power supply', 'mouse', 'warrenty', 'Windows 7', 'speakers', 'value', 'operating system', 'service', 'keys', 'windows', 'OS', 'look', 'games', 'extended warranty', 'work', 'cost', 'shipping', 'processor', 'weight', 'carry', 'power', 'hardware', 'fan', 'feature', 'Keyboard', 'touchpad', 'trackpad', 'boot up', 'touch pad', 'tech support', 'sound', 'iWork', 'Screen', 'screen size', 'Windows 7 Starter', 'webcam', 'internet', 'edges', 'cd drive', 'RAM', 'iTunes', 'portability', 'buttons', 'mousepad', 'navigate', 'hinge', 'drivers', 'iPhoto', 'space', 'set up', 'monitor', 'MS Office', 'USB ports', 'Pages', 'functions', 'repair depot', 'HD', 'web browsing', 'DVD burner', 'cu

In [121]:
#get data frame
df_train = get_data_frame(final_train_text_list,train_aspects_list, most_common_aspect)
df_train.head()

Unnamed: 0,Text,cord,battery life,service center,"""sales"" team",tech guy,quality,GUI,applications,use,...,apple associates,mousepad sensitivity,mac osx,HDD bay,leather carrying case,commodity hardware,Paralles,Windows XP Professional,Windows Server 2008 Enterprise,repair
0,charge night skip taking cord good battery life,neutral,positive,,,,,,,,...,,,,,,,,,,
1,bought hp pavilion dv4-1222nr laptop many problems computer,,,,,,,,,,...,,,,,,,,,,
2,tech guy said service center 1-to-1 exchange direct concern sales team retail shop bought netbook,,,negative,negative,neutral,,,,,...,,,,,,,,,,
3,investigated netbooks saw toshiba nb305-n410bl,,,,,,,,,,...,,,,,,,,,,
4,day presentation seminar large conference town- lots people little time prep set computer projector etc,,,,,,,,,,...,,,,,,,,,,


In [122]:
# get expect term df
df_train_aspect = get_aspect_data_frame(df_train, most_common_aspect)
df_train_aspect.head()

Unnamed: 0,Text,cord,battery life,service center,"""sales"" team",tech guy,quality,GUI,applications,use,...,apple associates,mousepad sensitivity,mac osx,HDD bay,leather carrying case,commodity hardware,Paralles,Windows XP Professional,Windows Server 2008 Enterprise,repair
0,charge night skip taking cord good battery life,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,bought hp pavilion dv4-1222nr laptop many problems computer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tech guy said service center 1-to-1 exchange direct concern sales team retail shop bought netbook,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,investigated netbooks saw toshiba nb305-n410bl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,day presentation seminar large conference town- lots people little time prep set computer projector etc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
df_train_aspect = df_train_aspect.reindex(sorted(df_train_aspect.columns), axis=1)
df_train_aspect.head()

Unnamed: 0,"""sales"" team",1 GB ram,1-year-warranty,10-key,12 cell battery,15 inch,"15""",16GB RAM support,17 ince screen,17 inch screen,...,word editing,word processer,word processing,word processing program,word processor,work,working,works,wt,zooming
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [124]:
# Similar for test list
test_dataframe = get_xml_data(path_test)
test_text_list = train_dataframe['text']
test_aspects_list = list(train_dataframe['aspect_info'])

In [125]:
tagged_text_list_test = pos_tag(test_text_list)
joblib.dump(tagged_text_list_test, 'tagged_text_list_test.pkl')
tagged_text_list_test=joblib.load('tagged_text_list_test.pkl')

In [126]:
final_test_text_list = filter_tag(tagged_text_list_test)

In [127]:
df_test = get_data_frame(final_test_text_list,test_aspects_list, most_common_aspect)
df_test_aspect = get_aspect_data_frame(df_test, most_common_aspect)
df_test_aspect = df_test_aspect.reindex(sorted(df_test_aspect.columns), axis=1)

In [128]:
# Sort the data frame according to aspect's name and separate data(X) and target(y)
# df_train_aspect = df_train_aspect.sample(frac=1).reset_index(drop=True) # For randomization
X_train= df_train_aspect.Text
y_train = df_train_aspect.drop('Text', 1)
print(y_train[:5])


# df_test_aspect = df_test_aspect.sample(frac=1).reset_index(drop=True) # For randomization
X_test = df_test_aspect.Text
y_test = df_test_aspect.drop('Text', 1)
final_most_common_aspect = list(y_train)
list(y_train)

   "sales" team  1 GB ram  1-year-warranty  10-key  12 cell battery  15 inch  \
0  0.0           0.0       0.0              0.0     0.0              0.0       
1  0.0           0.0       0.0              0.0     0.0              0.0       
2  1.0           0.0       0.0              0.0     0.0              0.0       
3  0.0           0.0       0.0              0.0     0.0              0.0       
4  0.0           0.0       0.0              0.0     0.0              0.0       

   15"  16GB RAM support  17 ince screen  17 inch screen   ...     \
0  0.0  0.0               0.0             0.0              ...      
1  0.0  0.0               0.0             0.0              ...      
2  0.0  0.0               0.0             0.0              ...      
3  0.0  0.0               0.0             0.0              ...      
4  0.0  0.0               0.0             0.0              ...      

   word editing  word processer  word processing  word processing program  \
0  0.0           0.0       

['"sales" team',
 '1 GB ram',
 '1-year-warranty',
 '10-key',
 '12 cell battery',
 '15 inch',
 '15"',
 '16GB RAM support',
 '17 ince screen',
 '17 inch screen',
 '17"',
 '17" inch screen',
 '17-inch screen',
 '18-inch',
 '18.4" screen',
 '1GB of RAM',
 '2 GB of RAM',
 '22" Monitor',
 '2GB RAM stick',
 '2GB stick of memory',
 '3 year warranty',
 '30" HD Monitor',
 '3G network',
 '4GB of RAM',
 '4GB stick of RAM',
 '500gb external hard drive',
 '500gb hard drive',
 '8GB RAM',
 '8GB of RAM',
 'AC plug',
 'AC power port',
 'AMD Turin processor',
 'ASUS TECH SUPPORT',
 'ATI graphics card',
 'Acer screen',
 'Adobe Creative Suite',
 'Adobe Creative apps',
 'Adobe Flash player',
 'AfterEffects programs',
 'Apple Care plan',
 'Apple applications',
 'Apple keyboard',
 'Apple navigation',
 'Apple support',
 'Apple team',
 'Applecare',
 'Applecare tech support',
 'Applecare warranty plan',
 'Appleworks',
 'Applications',
 'BATTERY',
 'BIOS',
 'BIOS update',
 'BOOT MGR',
 'BOOTING UP',
 'Battery',
 

In [129]:
# Change y_train to numpy array
import numpy as np
y_train = np.asarray(y_train, dtype=np.int64)
y_test = np.asarray(y_test, dtype=np.int64)
print(y_train[:5])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [130]:
# Generate word vecotors using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# from nltk import word_tokenize          
# from nltk.stem import WordNetLemmatizer 
vect = CountVectorizer(max_df=1.0, stop_words='english')  
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [131]:
# Create various models. These are multi-label models.
nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train_dtm, y_train)
C = 1.0 
# SVregularization parameter
svc = OneVsRestClassifier(svm.SVC(kernel='linear', C=C)).fit(X_train_dtm, y_train)
lin_svc = OneVsRestClassifier(svm.LinearSVC(C=C)).fit(X_train_dtm, y_train)
sgd = OneVsRestClassifier(SGDClassifier(max_iter=1000)).fit(X_train_dtm,y_train)

In [95]:
# Predict the test data using classifiers
y_pred_class = nb_classif.predict(X_test_dtm)
y_pred_class_svc = svc.predict(X_test_dtm)
y_pred_class_lin_svc = lin_svc.predict(X_test_dtm)
y_pred_class_sgd = sgd.predict(X_test_dtm)

In [96]:
# Following code to test metrics of all aspect extraction classifiers
from sklearn import metrics

In [97]:
print(metrics.accuracy_score(y_test,y_pred_class))
print(metrics.accuracy_score(y_test,y_pred_class_svc))
print(metrics.accuracy_score(y_test,y_pred_class_lin_svc))
print(metrics.accuracy_score(y_test,y_pred_class_sgd))

0.7688013136288998
0.9957307060755337
0.9967159277504105
0.9970443349753695


In [98]:
print(metrics.precision_score(y_test,y_pred_class,average='micro'))
print(metrics.precision_score(y_test,y_pred_class_svc,average='micro'))
print(metrics.precision_score(y_test,y_pred_class_lin_svc,average='micro'))
print(metrics.precision_score(y_test,y_pred_class_sgd,average='micro'))

0.8955696202531646
0.9937833037300178
0.9937998228520815
0.9946808510638298


In [99]:
print(metrics.recall_score(y_test,y_pred_class,average='micro'))
print(metrics.recall_score(y_test,y_pred_class_svc,average='micro'))
print(metrics.recall_score(y_test,y_pred_class_lin_svc,average='micro'))
print(metrics.recall_score(y_test,y_pred_class_sgd,average='micro'))

0.2511091393078971
0.9929015084294588
0.9955634427684117
0.9955634427684117


In [107]:
print(metrics.f1_score(y_test,y_pred_class,average='micro'))
print(metrics.f1_score(y_test,y_pred_class_svc,average='micro'))
print(metrics.f1_score(y_test,y_pred_class_lin_svc,average='micro'))
print(metrics.f1_score(y_test,y_pred_class_sgd,average='micro'))

0.3922383922383922
0.9933422103861519
0.9946808510638298
0.9951219512195122


In [101]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(metrics.classification_report(y_test, y_pred_class))
    print(metrics.classification_report(y_test, y_pred_class_svc))
    print(metrics.classification_report(y_test, y_pred_class_lin_svc))
    print(metrics.classification_report(y_test, y_pred_class_sgd))

             precision    recall  f1-score   support

          0       1.00      0.25      0.40         4
          1       1.00      0.25      0.40         4
          2       1.00      0.25      0.40         4
          3       1.00      0.67      0.80         3
          4       0.00      0.00      0.00         5
          5       0.00      0.00      0.00         3
          6       1.00      0.33      0.50         9
          7       1.00      0.50      0.67         4
          8       0.00      0.00      0.00         5
          9       0.00      0.00      0.00         5
         10       1.00      0.25      0.40         4
         11       1.00      0.27      0.43        11
         12       1.00      0.11      0.19        19
         13       1.00      0.10      0.18        10
         14       0.00      0.00      0.00         6
         15       1.00      0.19      0.32        16
         16       0.82      0.23      0.36        39
         17       0.83      0.43      0.57   

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         4
          1       1.00      1.00      1.00         4
          2       1.00      1.00      1.00         4
          3       1.00      1.00      1.00         3
          4       1.00      1.00      1.00         5
          5       1.00      1.00      1.00         3
          6       1.00      1.00      1.00         9
          7       1.00      1.00      1.00         4
          8       1.00      1.00      1.00         5
          9       1.00      1.00      1.00         5
         10       1.00      1.00      1.00         4
         11       1.00      1.00      1.00        11
         12       1.00      0.95      0.97        19
         13       1.00      1.00      1.00        10
         14       1.00      1.00      1.00         6
         15       1.00      1.00      1.00        16
         16       0.97      1.00      0.99        39
         17       0.96      1.00      0.98   

In [109]:
# stored trained model
joblib.dump(sgd, 'trained_sgd.pkl')

['trained_sgd.pkl']

In [102]:
def get_dict_aspect(y,most_common_aspect):
    position=[]
    for innerlist in y:
        position.append([i for i, j in enumerate(innerlist) if j == 1])
    sorted_common=sorted(most_common_aspect)
    dict_aspect=[]
    for innerlist in position:
        inner_dict={}
        for word in sorted_common:
            if sorted_common.index(word) in innerlist:
                inner_dict[word]= 5
            else:
                inner_dict[word]=0
        dict_aspect.append(inner_dict)
    return dict_aspect

In [103]:
# Generating extra feature that indicates which aspect category is present in the review
train_dict_aspect=get_dict_aspect(y_train, most_common_aspect)
d_train=DictVectorizer() 
X_train_aspect_dtm = d_train.fit_transform(train_dict_aspect)

# y_test is used to generated extra feature in order to test the performance of 2nd classifer.
#Use y_pred_class_svc(Highest performer for aspect classification) as input for extra feature to test the overall performace.
test_dict_aspect=get_dict_aspect(y_test, most_common_aspect)
d_test=DictVectorizer() 

In [104]:
from src.BIO_format import BIO

def BIO_format(text, predicted_output, common_words):
    bio_obj = BIO()
    df = bio_obj.convert_into_bio(text, predicted_output, common_words_list=common_words)
    return df

In [105]:
# Aspect term extractor of user's input.
# user_input=input("Enter a laptop review:\n\n")
# user_input = "The battery life is really good and its size is reasonable"
user_input = "it is of high quality, has a killer GUI, is extremely stable, is highly expandable, is bundled with lots of very good applications, is easy to use, and is absolutely gorgeous."
# Preprocessing and vectorizing
tagged_user_input = pos_tag([user_input])
print(tagged_user_input)
filter_tagged_user_input = filter_tag(tagged_user_input)
print(filter_tagged_user_input)

user_input_series = pd.Series(filter_tagged_user_input)
print(user_input_series)
user_input_series_dtm = vect.transform(user_input_series)
print(user_input_series_dtm)
# print(user_input_series[:5])

predict_aspect= sgd.predict(user_input_series_dtm)
print(predict_aspect)
# predict_aspect_data = predict_aspect[0]
extra_feature=get_dict_aspect(predict_aspect, most_common_aspect)
extra_feature_dtm=DictVectorizer().fit_transform(extra_feature)
predict_aspect

[[('high', 'JJ'), ('quality', 'NN'), ('killer', 'NN'), ('GUI', 'NNP'), ('extremely', 'RB'), ('stable', 'JJ'), ('highly', 'RB'), ('expandable', 'JJ'), ('bundled', 'VBN'), ('lots', 'NNS'), ('good', 'JJ'), ('applications', 'NNS'), ('easy', 'JJ'), ('use', 'NN'), ('absolutely', 'RB'), ('gorgeous', 'JJ')]]
['high quality killer GUI extremely stable highly expandable bundled lots good applications easy use absolutely gorgeous']
0    high quality killer GUI extremely stable highly expandable bundled lots good applications easy use absolutely gorgeous
dtype: object
  (0, 23)	1
  (0, 160)	1
  (0, 387)	1
  (0, 969)	1
  (0, 1079)	1
  (0, 1115)	1
  (0, 1339)	1
  (0, 1343)	1
  (0, 1382)	1
  (0, 1466)	1
  (0, 1469)	1
  (0, 1722)	1
  (0, 1878)	1
  (0, 2518)	1
  (0, 3061)	1
  (0, 3445)	1
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]]


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [106]:
df = BIO_format(user_input, predict_aspect[0], final_most_common_aspect)
df

Unnamed: 0,BIO,text
0,O,it
1,O,is
2,O,of
3,O,high
4,B,"quality,"
5,O,has
6,O,a
7,O,killer
8,O,"GUI,"
9,O,is
