In [21]:
import pandas as pd
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import xml.etree.ElementTree as ET
from lxml import etree
from scipy.sparse import hstack
import numpy as np
import warnings

from preprocessing.xml_2_dataframe import Xml2DataFrame
from preprocessing.pos_tagger import POSTagger


In [22]:
# Dataset locations, relative to the notebook's working directory.
path_train = './data/Laptops_Train_v2.xml'    # training reviews (XML)
path_test = './data/Laptops_Test_Gold.xml'    # test reviews (XML)
new_test_path = './data/test.xml'             # extra test file; not used in the cells shown here


In [23]:
def get_xml_data(path):
    """Parse the XML file at ``path`` with ``Xml2DataFrame``.

    Args:
        path: Location of the XML file, passed straight to
            ``Xml2DataFrame.process_data``.

    Returns:
        Whatever ``Xml2DataFrame.process_data`` returns for ``path``
        (used as a pandas DataFrame by the rest of the notebook).
    """
    return Xml2DataFrame().process_data(path)


# Parse the training file and show its first rows as the cell output.
df = get_xml_data(path=path_train)
df.head()

Unnamed: 0,id,text,aspect_info
0,2339,i charge it at night and skip taking the cord ...,"[{'to': '45', 'term': 'cord', 'from': '41', 'p..."
1,812,i bought a hp pavilion dv4-1222nr laptop and h...,
2,1316,the tech guy then said the service center does...,"[{'to': '41', 'term': 'service center', 'from'..."
3,2328,i investigated netbooks and saw the toshiba nb...,
4,2193,the other day i had a presentation to do for a...,


In [24]:
# Load the training reviews and pull out the two columns used for training.
# Despite the "_list" suffix, both are pandas Series (see the printed output).
train_dataframe = get_xml_data(path_train)
train_text_list, train_aspects_list = (
    train_dataframe['text'],
    train_dataframe['aspect_info'],
)

# Preview the head of each column with a divider line in between.
print(
    train_text_list.head(),
    "=============================",
    train_aspects_list.head(),
    sep="\n",
)

0    i charge it at night and skip taking the cord ...
1    i bought a hp pavilion dv4-1222nr laptop and h...
2    the tech guy then said the service center does...
3    i investigated netbooks and saw the toshiba nb...
4    the other day i had a presentation to do for a...
Name: text, dtype: object
0    [{'to': '45', 'term': 'cord', 'from': '41', 'p...
1                                                 None
2    [{'to': '41', 'term': 'service center', 'from'...
3                                                 None
4                                                 None
Name: aspect_info, dtype: object


In [45]:
def pos_tag(review):
    """Run ``POSTagger.pos_tagger`` over every text in ``review``.

    Args:
        review: Iterable of review texts; each item is passed to the tagger
            as the ``text`` keyword argument.

    Returns:
        A list with one tagger result per input text, in input order.
    """
    # One tagger instance is created per call and shared by all texts.
    tagger = POSTagger()
    return [tagger.pos_tagger(text=sentence) for sentence in review]


def filter_tag(tagged_reviews):
    """Apply ``POSTagger.filter_pos_tag`` to every tagged review.

    The notebook uses this to keep nouns, adjectives, verbs and adverbs;
    the actual selection is made inside ``POSTagger.filter_pos_tag``.

    Args:
        tagged_reviews: Iterable of POS-tagged reviews (e.g. the output of
            ``pos_tag``).

    Returns:
        A list with one filtered result per tagged review, in input order.
    """
    # One tagger instance is created per call and shared by all reviews.
    tagger = POSTagger()
    return [tagger.filter_pos_tag(tagged) for tagged in tagged_reviews]

In [46]:
# tag POS
# The tagged training set is cached on disk (presumably because tagging the
# whole corpus is slow). Load the cache when it exists; otherwise compute it
# once and persist it, so the cell also works on a fresh checkout / after
# "Restart Kernel -> Run All" instead of failing on a missing pickle.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load a cache file that this notebook produced itself.
path_tagged_train = 'tagged_text_list_train.pkl'
try:
    tagged_text_list_train = joblib.load(path_tagged_train)
except FileNotFoundError:
    # No cache yet: tag the training texts and save the result for next time.
    tagged_text_list_train = pos_tag(train_text_list)
    joblib.dump(tagged_text_list_train, path_tagged_train)
print(tagged_text_list_train[:5])

[[('i', 'JJ'), ('charge', 'NN'), ('it', 'PRP'), ('at', 'IN'), ('night', 'NN'), ('and', 'CC'), ('skip', 'NN'), ('taking', 'VBG'), ('the', 'DT'), ('cord', 'NN'), ('with', 'IN'), ('me', 'PRP'), ('because', 'IN'), ('of', 'IN'), ('the', 'DT'), ('good', 'JJ'), ('battery', 'NN'), ('life', 'NN'), ('.', '.')], [('i', 'NN'), ('bought', 'VBD'), ('a', 'DT'), ('hp', 'JJ'), ('pavilion', 'NN'), ('dv4-1222nr', 'JJ'), ('laptop', 'NN'), ('and', 'CC'), ('have', 'VBP'), ('had', 'VBN'), ('so', 'RB'), ('many', 'JJ'), ('problems', 'NNS'), ('with', 'IN'), ('the', 'DT'), ('computer', 'NN'), ('.', '.')], [('the', 'DT'), ('tech', 'NN'), ('guy', 'NN'), ('then', 'RB'), ('said', 'VBD'), ('the', 'DT'), ('service', 'NN'), ('center', 'NN'), ('does', 'VBZ'), ('not', 'RB'), ('do', 'VB'), ('1-to-1', 'JJ'), ('exchange', 'NN'), ('and', 'CC'), ('i', 'NNS'), ('have', 'VBP'), ('to', 'TO'), ('direct', 'VB'), ('my', 'PRP$'), ('concern', 'NN'), ('to', 'TO'), ('the', 'DT'), ('``', '``'), ('sales', 'NNS'), ("''", "''"), ('team', '

In [50]:
# Training texts after POS filtering: one list of kept tokens per review.
final_train_text_list = filter_tag(tagged_reviews=tagged_text_list_train)

# Preview the first five filtered reviews.
print(final_train_text_list[0:5])

[['i', 'charge', 'night', 'skip', 'taking', 'cord', 'good', 'battery', 'life'], ['i', 'bought', 'hp', 'pavilion', 'dv4-1222nr', 'laptop', 'have', 'had', 'so', 'many', 'problems', 'computer'], ['tech', 'guy', 'then', 'said', 'service', 'center', 'does', 'not', 'do', '1-to-1', 'exchange', 'i', 'have', 'direct', 'concern', 'sales', 'team', 'is', 'retail', 'shop', 'i', 'bought', 'netbook'], ['i', 'investigated', 'netbooks', 'saw', 'toshiba', 'nb305-n410bl'], ['other', 'day', 'i', 'had', 'presentation', 'do', 'seminar', 'large', 'conference', 'town-', 'lots', 'people', 'little', 'time', 'prep', 'have', 'set', 'computer', 'projector']]
