Permalink
Browse files

Add HTML parser support

Allow RXMLElement to parse HTML. Depends on <libxml2/libxml/HTMLparser.h>.

Add the following constructors:

    - (id)initFromHTMLString:(NSString *)xmlString encoding:(NSStringEncoding)encoding;
    - (id)initFromHTMLFile:(NSString *)filename;
    - (id)initFromHTMLFile:(NSString *)filename fileExtension:(NSString*)extension;
    - (id)initFromHTMLFilePath:(NSString *)fullPath;
    - (id)initFromHTMLData:(NSData *)data;
    + (id)elementFromHTMLString:(NSString *)xmlString encoding:(NSStringEncoding)encoding;
    + (id)elementFromHTMLFile:(NSString *)filename;
    + (id)elementFromHTMLFile:(NSString *)filename fileExtension:(NSString*)extension;
    + (id)elementFromHTMLFilePath:(NSString *)fullPath;
    + (id)elementFromHTMLData:(NSData *)data;
  • Loading branch information...
1 parent 993f083 commit a2aaad7b4bd6325db160bf62a9e9b45c991c289d @siuying siuying committed Mar 22, 2013
Showing with 129 additions and 0 deletions.
  1. +4 −0 RaptureXML.xcodeproj/project.pbxproj
  2. +13 −0 RaptureXML/RXMLElement.h
  3. +56 −0 RaptureXML/RXMLElement.m
  4. +56 −0 Tests/HTMLTests.m
@@ -45,6 +45,7 @@
02F3A3FF1526D22600E8C822 /* players.xml in Resources */ = {isa = PBXBuildFile; fileRef = 0DEB8F2D14681BD800024989 /* players.xml */; };
0DEB8EB51467EC9B00024989 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 0252B2C5142ADFC60018B75D /* Foundation.framework */; };
0DEB8F2C14681A9400024989 /* libz.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 0252B305142AE3FF0018B75D /* libz.dylib */; };
+ 5415BE4216FC638100AFC566 /* HTMLTests.m in Sources */ = {isa = PBXBuildFile; fileRef = 5415BE4116FC638100AFC566 /* HTMLTests.m */; };
6BD1BDA91558B91400F1D055 /* RXMLElement.h in Copy Headers */ = {isa = PBXBuildFile; fileRef = 027DAC2E14FBF443001BA563 /* RXMLElement.h */; };
/* End PBXBuildFile section */
@@ -88,6 +89,7 @@
0DEB8F2B14681A0800024989 /* Tests-Prefix.pch */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "Tests-Prefix.pch"; path = "Tests/Tests-Prefix.pch"; sourceTree = SOURCE_ROOT; };
0DEB8F2D14681BD800024989 /* players.xml */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xml; name = players.xml; path = Tests/players.xml; sourceTree = SOURCE_ROOT; };
1413670716D9BEC700501ABB /* CopyTests.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = CopyTests.m; path = Tests/CopyTests.m; sourceTree = SOURCE_ROOT; };
+ 5415BE4116FC638100AFC566 /* HTMLTests.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = HTMLTests.m; path = Tests/HTMLTests.m; sourceTree = SOURCE_ROOT; };
6BD1BD951558B7A800F1D055 /* RaptureXML-StaticLib-Prefix.pch */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "RaptureXML-StaticLib-Prefix.pch"; sourceTree = "<group>"; };
/* End PBXFileReference section */
@@ -168,6 +170,7 @@
02F3A4041526D7BC00E8C822 /* EncodingTests.m */,
027B6BCF153C652E00A4EDF2 /* XPathTests.m */,
02ADE6A116A0E33A008643D5 /* AttributeTests.m */,
+ 5415BE4116FC638100AFC566 /* HTMLTests.m */,
);
name = Tests;
path = RaptureXMLTests;
@@ -353,6 +356,7 @@
02ADE6A916A0E491008643D5 /* EncodingTests.m in Sources */,
02ADE6AA16A0E491008643D5 /* XPathTests.m in Sources */,
02565F9916E6320700A882F9 /* CopyTests.m in Sources */,
+ 5415BE4216FC638100AFC566 /* HTMLTests.m in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
View
@@ -31,6 +31,7 @@
#import <Foundation/Foundation.h>
#import <libxml2/libxml/xmlreader.h>
#import <libxml2/libxml/xmlmemory.h>
+#import <libxml2/libxml/HTMLparser.h>
#import <libxml/xpath.h>
#import <libxml/xpathInternals.h>
@@ -55,13 +56,25 @@
- (id)initFromXMLData:(NSData *)data;
- (id)initFromXMLDoc:(RXMLDocHolder *)doc node:(xmlNodePtr)node;
+- (id)initFromHTMLString:(NSString *)xmlString encoding:(NSStringEncoding)encoding; // Parse HTML text; the string is converted to bytes with `encoding` before parsing.
+- (id)initFromHTMLFile:(NSString *)filename; // Parse the file `filename` located directly under the path of the bundle containing this class.
+- (id)initFromHTMLFile:(NSString *)filename fileExtension:(NSString*)extension; // Parse the bundle resource `filename` of type `extension` (NSBundle -pathForResource:ofType:).
+- (id)initFromHTMLFilePath:(NSString *)fullPath; // Parse the HTML file whose contents are read from `fullPath`.
+- (id)initFromHTMLData:(NSData *)data; // Parse raw HTML bytes with the libxml2 HTML parser (htmlReadMemory).
+
+ (id)elementFromXMLString:(NSString *)xmlString encoding:(NSStringEncoding)encoding;
+ (id)elementFromXMLFile:(NSString *)filename;
+ (id)elementFromXMLFilename:(NSString *)filename fileExtension:(NSString *)extension;
+ (id)elementFromURL:(NSURL *)url __attribute__((deprecated));
+ (id)elementFromXMLData:(NSData *)data;
+ (id)elementFromXMLDoc:(RXMLDocHolder *)doc node:(xmlNodePtr)node;
++ (id)elementFromHTMLString:(NSString *)xmlString encoding:(NSStringEncoding)encoding; // Convenience constructor for -initFromHTMLString:encoding:.
++ (id)elementFromHTMLFile:(NSString *)filename; // Convenience constructor for -initFromHTMLFile:.
++ (id)elementFromHTMLFile:(NSString *)filename fileExtension:(NSString*)extension; // Convenience constructor for -initFromHTMLFile:fileExtension:.
++ (id)elementFromHTMLFilePath:(NSString *)fullPath; // Convenience constructor for -initFromHTMLFilePath:.
++ (id)elementFromHTMLData:(NSData *)data; // Convenience constructor for -initFromHTMLData:.
+
- (NSString *)attribute:(NSString *)attributeName;
- (NSString *)attribute:(NSString *)attributeName inNamespace:(NSString *)ns;
View
@@ -102,6 +102,42 @@ - (id)initFromXMLDoc:(RXMLDocHolder *)doc node:(xmlNodePtr)node {
return self;
}
+// Shared parsing routine behind every -initFromHTML... initializer.
+//
+// data         - raw HTML bytes handed to libxml2's htmlReadMemory.
+// encodingName - IANA charset name describing `data`, or NULL to let libxml2
+//                work out the encoding from the content itself.
+//
+// Stores the parsed document in xmlDoc and its root element in node_. When no
+// root element can be obtained, xmlDoc is reset to nil.
+- (id)initFromHTMLData:(NSData *)data encodingName:(const char *)encodingName {
+    if ((self = [super init])) {
+        xmlDocPtr doc = NULL;
+
+        // htmlReadMemory takes the size as an int; refuse buffers whose length
+        // would be truncated by the cast instead of parsing a wrong size.
+        if ([data length] <= (NSUInteger)INT_MAX) {
+            doc = htmlReadMemory([data bytes], (int)[data length], "", encodingName,
+                                 HTML_PARSE_NOWARNING | HTML_PARSE_NOERROR);
+        }
+        self.xmlDoc = [[RXMLDocHolder alloc] initWithDocPtr:doc];
+
+        if ([self isValid]) {
+            node_ = xmlDocGetRootElement(doc);
+
+            if (!node_) {
+                self.xmlDoc = nil;
+            }
+        }
+    }
+    return self;
+}
+
+// Parses HTML held in a string.
+//
+// xmlString - the HTML markup.
+// encoding  - encoding used to turn the string into bytes for the parser.
+//
+// The same encoding is also passed to libxml2 by its IANA name. Without it the
+// parser has to guess the charset of the bytes, and non-ASCII text in markup
+// that carries no charset declaration can be decoded with the wrong charset.
+- (id)initFromHTMLString:(NSString *)xmlString encoding:(NSStringEncoding)encoding {
+    NSData *htmlData = [xmlString dataUsingEncoding:encoding];
+
+    char encodingName[64] = "";
+    CFStringRef ianaName = CFStringConvertEncodingToIANACharSetName(CFStringConvertNSStringEncodingToEncoding(encoding));
+    BOOL hasName = (ianaName != NULL) &&
+                   CFStringGetCString(ianaName, encodingName, sizeof(encodingName), kCFStringEncodingASCII);
+
+    // Fall back to libxml2's own detection when the encoding has no IANA name.
+    return [self initFromHTMLData:htmlData encodingName:(hasName ? encodingName : NULL)];
+}
+
+// Parses the file `filename`, resolved by appending it to the path of the
+// bundle that contains this class.
+- (id)initFromHTMLFile:(NSString *)filename {
+    NSString *bundlePath = [[NSBundle bundleForClass:self.class] bundlePath];
+    NSString *fullPath = [bundlePath stringByAppendingPathComponent:filename];
+    return [self initFromHTMLData:[NSData dataWithContentsOfFile:fullPath]];
+}
+
+// Parses the bundle resource named `filename` with type `extension`, looked up
+// in the bundle that contains this class.
+- (id)initFromHTMLFile:(NSString *)filename fileExtension:(NSString*)extension {
+    NSBundle *bundle = [NSBundle bundleForClass:[self class]];
+    NSString *fullPath = [bundle pathForResource:filename ofType:extension];
+    return [self initFromHTMLData:[NSData dataWithContentsOfFile:fullPath]];
+}
+
+// Parses the HTML file whose contents are read from `fullPath`.
+- (id)initFromHTMLFilePath:(NSString *)fullPath {
+    NSData *fileData = [NSData dataWithContentsOfFile:fullPath];
+    return [self initFromHTMLData:fileData];
+}
+
+// Parses raw HTML bytes, leaving encoding detection to libxml2.
+- (id)initFromHTMLData:(NSData *)data {
+    return [self initFromHTMLData:data encodingName:NULL];
+}
+
+
// Copy the RaptureXML element
// (calling copy will call this method automatically with the default zone)
-(id)copyWithZone:(NSZone *)zone{
@@ -139,6 +175,26 @@ - (NSString *)description {
return [self text];
}
+// Convenience constructor: creates an RXMLElement by parsing `xmlString` as
+// HTML through -initFromHTMLString:encoding:.
++ (id)elementFromHTMLString:(NSString *)xmlString encoding:(NSStringEncoding)encoding {
+    RXMLElement *element = [[RXMLElement alloc] initFromHTMLString:xmlString encoding:encoding];
+    return element;
+}
+
+// Convenience constructor: creates an RXMLElement from the HTML file
+// `filename` through -initFromHTMLFile:.
++ (id)elementFromHTMLFile:(NSString *)filename {
+    RXMLElement *element = [[RXMLElement alloc] initFromHTMLFile:filename];
+    return element;
+}
+
+// Convenience constructor: creates an RXMLElement from the resource
+// `filename` of type `extension` through -initFromHTMLFile:fileExtension:.
++ (id)elementFromHTMLFile:(NSString *)filename fileExtension:(NSString*)extension {
+    RXMLElement *element = [[RXMLElement alloc] initFromHTMLFile:filename
+                                                   fileExtension:extension];
+    return element;
+}
+
+// Convenience constructor: creates an RXMLElement from the HTML file at
+// `fullPath` through -initFromHTMLFilePath:.
++ (id)elementFromHTMLFilePath:(NSString *)fullPath {
+    RXMLElement *element = [[RXMLElement alloc] initFromHTMLFilePath:fullPath];
+    return element;
+}
+
+// Convenience constructor: creates an RXMLElement from raw HTML bytes
+// through -initFromHTMLData:.
++ (id)elementFromHTMLData:(NSData *)data {
+    RXMLElement *element = [[RXMLElement alloc] initFromHTMLData:data];
+    return element;
+}
+
#pragma mark -
- (NSString *)tag {
View
@@ -0,0 +1,56 @@
+//
+// HTMLTests.m
+// RaptureXML
+//
+// Created by Francis Chong on 22/3/13.
+// Copyright (c) 2013 Rapture In Venice. All rights reserved.
+//
+
+#import "RXMLElement.h"
+
+// Exercises the HTML constructors of RXMLElement: well-formed XHTML,
+// entity decoding, and recovery from badly nested markup.
+@interface HTMLTests : SenTestCase {
+    NSString *simpleHTML_;
+}
+@end
+
+@implementation HTMLTests
+
+// Runs before each test; prepares the XHTML fixture used by testBasicXHTML.
+- (void)setUp {
+    [super setUp];
+
+    simpleHTML_ = @"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\
+ <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"\
+ \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\
+ <html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" >\
+ <head>\
+ <title>Minimal XHTML 1.1 Document</title>\
+ </head>\
+ <body>\
+ <p>This is a minimal <a href=\"http://www.w3.org/TR/xhtml11\">XHTML 1.1</a> document.</p>\
+ </body>\
+ </html>";
+}
+
+// A well-formed XHTML document exposes its root attributes and can be queried
+// with XPath.
+- (void)testBasicXHTML {
+    RXMLElement *html = [RXMLElement elementFromHTMLString:simpleHTML_ encoding:NSUTF8StringEncoding];
+    NSArray *atts = [html attributeNames];
+    // STAssertEquals requires both values to have the same type; -count is an
+    // NSUInteger, which is not `unsigned int` on 64-bit targets.
+    STAssertEquals([atts count], (NSUInteger)2, nil);
+
+    NSArray *children = [html childrenWithRootXPath:@"//html/body/p"];
+    STAssertTrue([children count] > 0, nil);
+
+    // A failed assertion does not stop the test, so avoid indexing into an
+    // empty array and let the comparison below report the failure instead.
+    RXMLElement *child = ([children count] > 0) ? [children objectAtIndex:0] : nil;
+    STAssertEqualObjects([child text], @"This is a minimal XHTML 1.1 document.", nil);
+}
+
+// Character entities are decoded in the element text.
+- (void)testHtmlEntity {
+    RXMLElement *html = [RXMLElement elementFromHTMLString:@"<p>Don&apos;t say &quot;lazy&quot;</p>" encoding:NSUTF8StringEncoding];
+    STAssertEqualObjects([html text], @"Don't say \"lazy\"", nil);
+}
+
+// Badly nested tags are repaired by the HTML parser.
+- (void)testFixBrokenHtml {
+    RXMLElement *html = [RXMLElement elementFromHTMLString:@"<p><b>Test</p> Broken HTML</b>" encoding:NSUTF8StringEncoding];
+    STAssertEqualObjects([html text], @"Test Broken HTML", nil);
+    STAssertEqualObjects([html xml], @"<html><body><p><b>Test</b></p> Broken HTML</body></html>", nil);
+}
+
+@end

0 comments on commit a2aaad7

Please sign in to comment.