In [1]:
import pandas as pd
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import xml.etree.ElementTree as ET
from lxml import etree
from scipy.sparse import hstack
import numpy as np
import warnings

from preprocessing.xml_2_dataframe import Xml2DataFrame
from preprocessing.pos_tagger import POSTagger


In [2]:
# Locations of the input XML files, relative to the notebook's working directory.
path_train = './data/Laptops_Train_v2.xml'  # parsed into the training frame
path_test = r'./data/Laptops_Test_Gold.xml'  # parsed into the test frame
new_test_path = r'./data/test.xml'  # not referenced by any cell visible in this notebook section


In [3]:
# XML -> DataFrame helper
def get_xml_data(path):
    """Parse the XML file at ``path`` with the project's ``Xml2DataFrame``.

    Args:
        path: Location of the XML file to read.

    Returns:
        Whatever ``Xml2DataFrame.process_data`` returns; the rest of this
        notebook reads its ``'text'`` and ``'aspect_info'`` columns.
    """
    return Xml2DataFrame().process_data(path)


# df = get_xml_data(path_train)
# df.head()

In [4]:
# Parse the training XML and pull out the two columns used by later cells.
train_dataframe = get_xml_data(path_train)
# print(train_dataframe.head())
# Review sentences; only consumed by the (commented-out) POS-tagging call further down.
train_text_list = train_dataframe['text']
# Per-review aspect annotations; later code treats each entry as None or a list of dicts.
train_aspects_list = list(train_dataframe['aspect_info'])
print(train_text_list.head())
print("=============================")
# print(train_aspects_list[:5])

0    i charge it at night and skip taking the cord with me because of the good battery life.                                                                                            
1    i bought a hp pavilion dv4-1222nr laptop and have had so many problems with the computer.                                                                                          
2    the tech guy then said the service center does not do 1-to-1 exchange and i have to direct my concern to the "sales" team, which is the retail shop which i bought my netbook from.
3    i investigated netbooks and saw the toshiba nb305-n410bl.                                                                                                                          
4    the other day i had a presentation to do for a seminar at a large conference in town- lots of people, little time to prep and have to set up a computer to a projector, etc.       
Name: text, dtype: object


In [5]:
# Thin wrapper around the project POS tagger
def pos_tag(review):
    """Run ``POSTagger.pos_tagger`` on every text in ``review``.

    Args:
        review: Iterable of texts; each one is passed as the ``text`` keyword.

    Returns:
        list: One tagger result per input text, in input order.
    """
    tagger = POSTagger()
    return [tagger.pos_tagger(text=sentence) for sentence in review]


# Keep only the noun / adjective / verb / adverb tokens of each tagged review
def filter_tag(tagged_reviews):
    """Apply ``POSTagger.filter_pos_tag`` to every tagged review.

    Args:
        tagged_reviews: Iterable of tagged reviews (as produced by ``pos_tag``).

    Returns:
        list: One filtered result per tagged review, in input order.
    """
    tagger = POSTagger()
    return [tagger.filter_pos_tag(tagged) for tagged in tagged_reviews]

In [6]:
# POS-tagged training sentences.
# The tagged output is reloaded from a cache file; uncomment the two lines below
# to regenerate the cache from train_text_list (required if the .pkl is missing).
# tagged_text_list_train = pos_tag(train_text_list)
# joblib.dump(tagged_text_list_train, 'tagged_text_list_train.pkl')
# NOTE(review): joblib.load unpickles the file, so only load caches produced locally.
tagged_text_list_train = joblib.load('tagged_text_list_train.pkl')
# print(tagged_text_list_train[:5])

In [7]:
# Training texts after POS filtering; these are the strings used as model input later.
final_train_text_list = filter_tag(tagged_text_list_train)

# Preview the first five filtered texts.
print(final_train_text_list[:5])

['charge night skip taking cord good battery life', 'bought hp pavilion dv4-1222nr laptop many problems computer', 'tech guy said service center 1-to-1 exchange direct concern sales team retail shop bought netbook', 'investigated netbooks saw toshiba nb305-n410bl', 'day presentation seminar large conference town- lots people little time prep set computer projector etc']


In [8]:
# Select the most frequent aspect terms (top 50 unless told otherwise).
def get_most_common_aspect(aspect_list, top_n=50):
    """Return the ``top_n`` most frequent aspect terms, most frequent first.

    Args:
        aspect_list: Object exposing an ``aspect_info`` attribute (in this
            notebook, the parsed review DataFrame). Each entry is either
            ``None`` or an iterable of dicts that carry a ``'term'`` key.
        top_n: Number of distinct terms to keep. Defaults to 50, the value
            that used to be hard-coded in the body.

    Returns:
        list: Aspect terms ordered by descending frequency; terms with equal
        counts keep their first-seen order.
    """
    # Counter.most_common gives the same ranking nltk.FreqDist (a Counter
    # subclass) produced, without needing a third-party import here.
    from collections import Counter

    term_counts = Counter(
        aspect.get('term')
        for aspects in aspect_list.aspect_info
        if aspects is not None
        for aspect in aspects
    )
    return [term for term, _count in term_counts.most_common(top_n)]

In [9]:
# Build the text + per-aspect polarity frame
def get_data_frame(text_list, train_aspects_list, most_common_aspect):
    """Build a frame of texts with one polarity column per retained aspect.

    Args:
        text_list: Review texts; becomes the ``'Text'`` column.
        train_aspects_list: One entry per review, aligned with ``text_list``
            by position. Each entry is ``None`` or an iterable of dicts with
            ``'term'`` and ``'polarity'`` keys.
        most_common_aspect: Aspect terms to keep; annotations for any other
            term are ignored.

    Returns:
        pandas.DataFrame: ``'Text'`` plus one column for every retained term
        that occurs at least once. A cell holds the polarity recorded for
        that review and is missing (NaN) where the review has no such aspect.
    """
    df = pd.DataFrame({'Text': text_list})
    # enumerate supplies each review's own position. Looking the position up
    # by value (list.index) returns the first equal entry, which sends every
    # later review with identical annotations to the wrong row.
    for row_idx, aspects in enumerate(train_aspects_list):
        if aspects is None:
            continue
        for aspect in aspects:
            term = aspect.get('term')
            if term in most_common_aspect:
                df.loc[row_idx, term] = aspect.get('polarity')
    return df

In [27]:
# Build the 0/1 aspect-presence frame for the aspect extraction task
def get_aspect_data_frame(df, most_common_aspect):
    """Turn per-aspect polarity columns into 0/1 presence indicators.

    Args:
        df: Frame with a text column and polarity columns (missing where the
            aspect is absent). It is not modified; a copy is returned.
        most_common_aspect: Aspect column names to convert. A name with no
            column in ``df`` yields an all-zero column, so frames built from
            different data splits end up with the same set of columns.

    Returns:
        pandas.DataFrame: Copy of ``df`` in which every listed aspect column
        holds 1 where any label was recorded and 0 otherwise; remaining
        missing values elsewhere are filled with 0.
    """
    result = df.copy()
    for common_aspect in most_common_aspect:
        if common_aspect not in result.columns:
            result[common_aspect] = 0
            continue
        column = result[common_aspect]
        # Presence is what matters here, so every recorded label counts as 1,
        # not just 'positive' / 'negative' / 'neutral'. The ne(0) guard keeps
        # an already-converted frame unchanged when it is passed in again.
        result[common_aspect] = (column.notna() & column.ne(0)).astype(int)
    return result.fillna(0)

In [28]:
# Rank the aspect terms of the training frame by frequency; the same list is
# reused below for both the training and the test frames.
most_common_aspect = get_most_common_aspect(train_dataframe)
print(most_common_aspect)

['screen', 'price', 'use', 'battery life', 'battery', 'keyboard', 'programs', 'software', 'features', 'warranty', 'hard drive', 'quality', 'size', 'performance', 'speed', 'Windows', 'memory', 'graphics', 'applications', 'motherboard', 'Vista', 'runs', 'program', 'charge', 'works', 'gaming', 'system', 'design', 'power supply', 'display', 'Windows 7', 'windows', 'keys', 'warrenty', 'mouse', 'service', 'OS', 'operating system', 'value', 'speakers', 'extended warranty', 'games', 'shipping', 'processor', 'cost', 'work', 'look', 'carry', 'hardware', 'power']


In [30]:
# Training frame: filtered text plus one polarity column per retained aspect.
df_train = get_data_frame(final_train_text_list,train_aspects_list, most_common_aspect)
df_train.head()

Unnamed: 0,Text,battery life,quality,applications,use,features,screen,battery,gaming,speed,...,graphics,power,OS,programs,size,design,Windows,speakers,carry,charge
0,charge night skip taking cord good battery life,positive,,,,,,,,,...,,,,,,,,,,
1,bought hp pavilion dv4-1222nr laptop many problems computer,,,,,,,,,,...,,,,,,,,,,
2,tech guy said service center 1-to-1 exchange direct concern sales team retail shop bought netbook,,,,,,,,,,...,,,,,,,,,,
3,investigated netbooks saw toshiba nb305-n410bl,,,,,,,,,,...,,,,,,,,,,
4,day presentation seminar large conference town- lots people little time prep set computer projector etc,,,,,,,,,,...,,,,,,,,,,


In [31]:
# Aspect-term frame for training: get_aspect_data_frame turns the polarity
# labels into 1 and fills missing entries with 0.
df_train_aspect = get_aspect_data_frame(df_train, most_common_aspect)
df_train_aspect.head()

Unnamed: 0,Text,battery life,quality,applications,use,features,screen,battery,gaming,speed,...,graphics,power,OS,programs,size,design,Windows,speakers,carry,charge
0,charge night skip taking cord good battery life,1,0,0.0,0.0,0.0,0,0,0.0,0.0,...,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,bought hp pavilion dv4-1222nr laptop many problems computer,0,0,0.0,0.0,0.0,0,0,0.0,0.0,...,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tech guy said service center 1-to-1 exchange direct concern sales team retail shop bought netbook,0,0,0.0,0.0,0.0,0,0,0.0,0.0,...,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,investigated netbooks saw toshiba nb305-n410bl,0,0,0.0,0.0,0.0,0,0,0.0,0.0,...,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,day presentation seminar large conference town- lots people little time prep set computer projector etc,0,0,0.0,0.0,0.0,0,0,0.0,0.0,...,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Order the columns alphabetically so train and test targets share one column order.
df_train_aspect = df_train_aspect.reindex(sorted(df_train_aspect.columns), axis=1)

In [33]:
# Same extraction for the test split.
test_dataframe = get_xml_data(path_test)
# Read both columns from the *test* frame: taking them from train_dataframe
# would pair the test texts with the training annotations.
test_text_list = test_dataframe['text']
test_aspects_list = list(test_dataframe['aspect_info'])

In [34]:
# POS-tagged test sentences, reloaded from a cache file; uncomment the two
# lines below to regenerate the cache from test_text_list.
# tagged_text_list_test=pos_tag(test_text_list)
# joblib.dump(tagged_text_list_test, 'tagged_text_list_test.pkl')
# NOTE(review): joblib.load unpickles the file, so only load caches produced locally.
tagged_text_list_test=joblib.load('tagged_text_list_test.pkl')

In [35]:
# Test texts after POS filtering.
final_test_text_list = filter_tag(tagged_text_list_test)

In [36]:
# Repeat the training-side steps for the test split: build the polarity frame,
# convert it to aspect indicators, then sort the columns alphabetically.
df_test = get_data_frame(final_test_text_list,test_aspects_list, most_common_aspect)
df_test_aspect = get_aspect_data_frame(df_test, most_common_aspect)
df_test_aspect = df_test_aspect.reindex(sorted(df_test_aspect.columns), axis=1)

In [42]:
# Separate the model input (X: the text column) from the targets (y: every
# aspect indicator column). Rows stay in file order; no shuffling is applied.
# `columns=` replaces the positional axis argument of drop(), which pandas 2.0
# no longer accepts.
X_train = df_train_aspect['Text']
y_train = df_train_aspect.drop(columns='Text')

X_test = df_test_aspect['Text']
y_test = df_test_aspect.drop(columns='Text')

In [43]:
# Convert both target frames (train and test) to NumPy arrays
import numpy as np
y_train, y_test = np.asarray(y_train), np.asarray(y_test)

In [48]:
# Generate word-count vectors using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(stop_words='english', max_df=1.0)
# The vocabulary is learned from the training texts only ...
X_train_dtm = vect.fit_transform(X_train)
# ... and reused unchanged to encode the test texts.
X_test_dtm = vect.transform(X_test)

In [1]:
# Create various models. These are multi-label models: OneVsRestClassifier
# fits one binary classifier per target column.
C = 1.0  # SVM regularization parameter
RANDOM_STATE = 42  # fixed seed so the randomized solvers give repeatable fits

nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train_dtm, y_train)
svc = OneVsRestClassifier(svm.SVC(kernel='linear', C=C)).fit(X_train_dtm, y_train)
lin_svc = OneVsRestClassifier(
    svm.LinearSVC(C=C, random_state=RANDOM_STATE)
).fit(X_train_dtm, y_train)
sgd = OneVsRestClassifier(
    SGDClassifier(random_state=RANDOM_STATE)
).fit(X_train_dtm, y_train)