Skip to content

Commit

Permalink
+ HtmlAgilityPack fixed the bug with the <option> tag not being parse…
Browse files Browse the repository at this point in the history
…able! This means the workaround code could be simplified, and we can now look for <option> directly.

+ Refactored HtmlUtils into HtmlParserBase, thereby removing the last library-wide utility class. There is just a single AO3-specific utility class that we have now, which I'm fine with keeping. Not ideal, but maintainable enough.
  • Loading branch information
ahlec committed Dec 3, 2017
1 parent a9e2d1e commit 3691a30
Show file tree
Hide file tree
Showing 14 changed files with 49 additions and 46 deletions.
2 changes: 1 addition & 1 deletion Alexandria.Tests/AO3/Conformity/Languages.Data.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ static IReadOnlyList<AO3Language> PullDownLanguages()

HtmlNode languageSelect = searchPageDocumentNode.SelectSingleNode( "//select[@id='work_search_language_id']" );
List<AO3Language> ao3Languages = new List<AO3Language>();
foreach ( HtmlNode option in languageSelect.Elements( Document.OptionsHtmlTag ) )
foreach ( HtmlNode option in languageSelect.Elements( "option" ) )
{
string idStr = option.GetAttributeValue( "value", null );
if ( string.IsNullOrWhiteSpace( idStr ) )
Expand Down
1 change: 0 additions & 1 deletion Alexandria.Tests/AO3/Conformity/Languages.Tests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
using System.Collections.Generic;
using System.Linq;
using Alexandria.Model;
using Alexandria.Utils;
using NUnit.Framework;

namespace Alexandria.Tests.AO3.Conformity
Expand Down
4 changes: 2 additions & 2 deletions Alexandria.Tests/Alexandria.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@
<CodeAnalysisRuleSet>AlexandriaTests.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<ItemGroup>
<Reference Include="HtmlAgilityPack, Version=1.6.5.0, Culture=neutral, PublicKeyToken=bd319b19eaf3b43a, processorArchitecture=MSIL">
<HintPath>..\packages\HtmlAgilityPack.1.6.5\lib\Net45\HtmlAgilityPack.dll</HintPath>
<Reference Include="HtmlAgilityPack, Version=1.6.6.0, Culture=neutral, PublicKeyToken=bd319b19eaf3b43a, processorArchitecture=MSIL">
<HintPath>..\packages\HtmlAgilityPack.1.6.6\lib\Net45\HtmlAgilityPack.dll</HintPath>
</Reference>
<Reference Include="nunit.framework, Version=3.8.1.0, Culture=neutral, PublicKeyToken=2638cd05610744eb, processorArchitecture=MSIL">
<HintPath>..\packages\NUnit.3.8.1\lib\net45\nunit.framework.dll</HintPath>
Expand Down
2 changes: 1 addition & 1 deletion Alexandria.Tests/packages.config
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="HtmlAgilityPack" version="1.6.5" targetFramework="net452" />
<package id="HtmlAgilityPack" version="1.6.6" targetFramework="net452" />
<package id="NUnit" version="3.8.1" targetFramework="net452" />
<package id="NUnit.ConsoleRunner" version="3.7.0" targetFramework="net452" />
<package id="NUnit3TestAdapter" version="3.8.0" targetFramework="net452" />
Expand Down
9 changes: 4 additions & 5 deletions Alexandria/AO3/Model/AO3Author.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
using Alexandria.Model;
using Alexandria.Querying;
using Alexandria.RequestHandles;
using Alexandria.Utils;
using HtmlAgilityPack;

namespace Alexandria.AO3.Model
Expand All @@ -25,7 +24,7 @@ internal sealed class AO3Author : AO3ModelBase<AO3Author>, IAuthor
{
{ "My pseuds", ( author, value ) => author.Nicknames = CollectPseuds( value ) },
{ "I joined on", ( author, value ) => author.DateJoined = DateTime.Parse( value.InnerText ) },
{ "I live in", ( author, value ) => author.Location = value.ReadableInnerText() },
{ "I live in", ( author, value ) => author.Location = GetReadableInnerText( value ) },
{ "My birthday", ( author, value ) => author.Birthday = DateTime.Parse( value.InnerText ) }
};

Expand Down Expand Up @@ -70,8 +69,8 @@ public static AO3Author Parse( AO3Source source, Document document )

AO3Author parsed = new AO3Author( source, document.Url )
{
Name = userHomeProfile.SelectSingleNode( "div[@class='primary header module']/h2[@class='heading']/a" ).ReadableInnerText(),
Biography = userHomeProfile.SelectSingleNode( "div[@class='bio module']/blockquote" )?.ReadableInnerText(),
Name = GetReadableInnerText( userHomeProfile.SelectSingleNode( "div[@class='primary header module']/h2[@class='heading']/a" ) ),
Biography = GetReadableInnerText( userHomeProfile.SelectSingleNode( "div[@class='bio module']/blockquote" ) ),
NumberFanfics = ParseNumberAuthorWorks( document.Html )
};

Expand All @@ -88,7 +87,7 @@ public static AO3Author Parse( AO3Source source, Document document )

static IReadOnlyList<string> CollectPseuds( HtmlNode pseudsDd )
{
return pseudsDd.Elements( "a" ).Select( pseudA => pseudA.ReadableInnerText() ).ToList();
return pseudsDd.Elements( "a" ).Select( GetReadableInnerText ).ToList();
}

static int ParseNumberAuthorWorks( HtmlNode html )
Expand Down
8 changes: 3 additions & 5 deletions Alexandria/AO3/Model/AO3ChapterInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,13 @@

using System.Collections.Generic;
using Alexandria.AO3.RequestHandles;
using Alexandria.Caching;
using Alexandria.Model;
using Alexandria.RequestHandles;
using Alexandria.Utils;
using HtmlAgilityPack;

namespace Alexandria.AO3.Model
{
internal sealed class AO3ChapterInfo : IChapterInfo
internal sealed class AO3ChapterInfo : HtmlParserBase, IChapterInfo
{
AO3ChapterInfo()
{
Expand All @@ -39,7 +37,7 @@ internal static AO3ChapterInfo Parse( AO3Source source, Document document, HtmlN
AO3ChapterInfo parsed = new AO3ChapterInfo();
List<IFanficRequestHandle> chapters = new List<IFanficRequestHandle>( chapterDropdownSelect.ChildNodes.Count );
int chapterNumber = 1;
foreach ( HtmlNode chapterOption in chapterDropdownSelect.Elements( Document.OptionsHtmlTag ) )
foreach ( HtmlNode chapterOption in chapterDropdownSelect.Elements( "option" ) )
{
string fanficHandle = chapterOption.GetAttributeValue( "value", null );
chapters.Add( new AO3FanficRequestHandle( source, fanficHandle ) );
Expand Down Expand Up @@ -69,7 +67,7 @@ internal static AO3ChapterInfo Parse( AO3Source source, Document document, HtmlN
HtmlNode chapterTitleTextNode = chapterPrefaceGroup.LastChild;
if ( chapterTitleTextNode.Name != "a" && chapterTitleTextNode.Name != "A" )
{
string tentativeTitle = chapterTitleTextNode.ReadableInnerText();
string tentativeTitle = GetReadableInnerText( chapterTitleTextNode );
if ( !string.IsNullOrWhiteSpace( tentativeTitle ) )
{
parsed.ChapterTitle = tentativeTitle.Trim();
Expand Down
13 changes: 6 additions & 7 deletions Alexandria/AO3/Model/AO3Fanfic.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
using Alexandria.AO3.Data;
using Alexandria.Model;
using Alexandria.RequestHandles;
using Alexandria.Utils;
using HtmlAgilityPack;

namespace Alexandria.AO3.Model
Expand All @@ -27,7 +26,7 @@ internal sealed class AO3Fanfic : AO3ModelBase<AO3Fanfic>, IFanfic
{ "relationship tags", ( fanfic, value ) => fanfic.Ships = ParseTagsFromDlTable<IShip, IShipRequestHandle>( fanfic, value, ShipRequestHandleCreator ) },
{ "character tags", ( fanfic, value ) => fanfic.Characters = ParseTagsFromDlTable<ICharacter, ICharacterRequestHandle>( fanfic, value, CharacterRequestHandleCreator ) },
{ "freeform tags", ( fanfic, value ) => fanfic.Tags = ParseTagsFromDlTable<ITag, ITagRequestHandle>( fanfic, value, TagRequestHandleCreator ) },
{ "language", ( fanfic, value ) => fanfic.Language = Languages.Parse( value.ReadableInnerText() ) },
{ "language", ( fanfic, value ) => fanfic.Language = Languages.Parse( value.InnerText.Trim() ) },
{ "series", ( fanfic, value ) => fanfic.SeriesInfo = AO3SeriesEntry.Parse( fanfic.Source, value ) },
{ "stats", ParseStatsTable }
};
Expand Down Expand Up @@ -180,34 +179,34 @@ static void ParsePreface( AO3Fanfic fanfic, HtmlNode html )
{
HtmlNode prefaceGroup = html.SelectSingleNode( "//div[@class='preface group']" );

fanfic.Title = prefaceGroup.SelectSingleNode( "h2[@class='title heading']" ).ReadableInnerText();
fanfic.Title = GetReadableInnerText( prefaceGroup.SelectSingleNode( "h2[@class='title heading']" ) );
fanfic.Authors = ParseAuthorsList( fanfic.Source, prefaceGroup.SelectSingleNode( "h3[@class = 'byline heading']" ) );

HtmlNode summaryBlockquote = prefaceGroup.SelectSingleNode( ".//div[@class='summary module']/blockquote" );
if ( summaryBlockquote != null )
{
fanfic.Summary = summaryBlockquote.ReadableInnerText();
fanfic.Summary = GetReadableInnerText( summaryBlockquote );
}

HtmlNode notesBlockquote = prefaceGroup.SelectSingleNode( ".//div[@class='notes module']/blockquote" );
if ( notesBlockquote != null )
{
fanfic.AuthorsNote = notesBlockquote.ReadableInnerText();
fanfic.AuthorsNote = GetReadableInnerText( notesBlockquote );
}
}

static string ParseFootnote( HtmlNode html )
{
HtmlNode workEndnotesBlockquote = html.SelectSingleNode( "//div[@id='work_endnotes']/blockquote" );
return workEndnotesBlockquote?.ReadableInnerText();
return GetReadableInnerText( workEndnotesBlockquote );
}

static string ParseFanficText( HtmlNode html )
{
HtmlNode userstuffModuleDiv = html.SelectSingleNode( "//div[@class='userstuff module']" ) ??
html.SelectSingleNode( "//div[@id='chapters']/div[contains( @class, 'userstuff' )]" );
userstuffModuleDiv.Element( "h3" )?.Remove();
return userstuffModuleDiv.ReadableInnerText();
return GetReadableInnerText( userstuffModuleDiv );
}
}
}
6 changes: 3 additions & 3 deletions Alexandria/AO3/Model/AO3ModelBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
using System;
using System.Collections.Generic;
using Alexandria.AO3.RequestHandles;
using Alexandria.Model;
using Alexandria.RequestHandles;
using Alexandria.Utils;
using HtmlAgilityPack;

namespace Alexandria.AO3.Model
Expand All @@ -19,7 +19,7 @@ namespace Alexandria.AO3.Model
/// </summary>
/// <typeparam name="TSelf">The child class itself. We use this for working with other generic functions to
/// pass in `this` (so to speak) in order to prevent boxing internally.</typeparam>
internal abstract class AO3ModelBase<TSelf> : IRequestable
internal abstract class AO3ModelBase<TSelf> : HtmlParserBase, IRequestable
where TSelf : AO3ModelBase<TSelf>
{
/// <summary>
Expand Down Expand Up @@ -145,7 +145,7 @@ protected static void ParseDlTable( TSelf self, HtmlNode dl, IReadOnlyDictionary
continue;
}

string tag = tagA.ReadableInnerText().Trim();
string tag = GetReadableInnerText( tagA ).Trim();
TRequestHandle requestHandle = requestHandleCreator( self, tag );
results.Add( requestHandle );
}
Expand Down
3 changes: 1 addition & 2 deletions Alexandria/AO3/Model/AO3TagBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
using Alexandria.Model;
using Alexandria.Querying;
using Alexandria.RequestHandles;
using Alexandria.Utils;
using HtmlAgilityPack;

namespace Alexandria.AO3.Model
Expand Down Expand Up @@ -56,7 +55,7 @@ protected static HtmlNode GetMainDiv( Document document )

protected static string ParseTagText( HtmlNode mainDiv )
{
return mainDiv.SelectSingleNode( ".//div[@class='primary header module']/h2" ).ReadableInnerText();
return GetReadableInnerText( mainDiv.SelectSingleNode( ".//div[@class='primary header module']/h2" ) );
}

protected static TagType ParseTagType( HtmlNode mainDiv, Website website, Uri url )
Expand Down
6 changes: 3 additions & 3 deletions Alexandria/Alexandria.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
<CodeAnalysisRuleSet>Alexandria.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<ItemGroup>
<Reference Include="HtmlAgilityPack, Version=1.6.5.0, Culture=neutral, PublicKeyToken=bd319b19eaf3b43a, processorArchitecture=MSIL">
<HintPath>..\packages\HtmlAgilityPack.1.6.5\lib\Net45\HtmlAgilityPack.dll</HintPath>
<Reference Include="HtmlAgilityPack, Version=1.6.6.0, Culture=neutral, PublicKeyToken=bd319b19eaf3b43a, processorArchitecture=MSIL">
<HintPath>..\packages\HtmlAgilityPack.1.6.6\lib\Net45\HtmlAgilityPack.dll</HintPath>
</Reference>
<Reference Include="System" />
<Reference Include="System.Core" />
Expand Down Expand Up @@ -63,6 +63,7 @@
<Compile Include="Exceptions\Parsing\UnknownParsingErrorAlexandriaException.cs" />
<Compile Include="Model\IQueryable.cs" />
<Compile Include="AO3\Model\AO3ModelBase.cs" />
<Compile Include="HtmlParserBase.cs" />
<Compile Include="Website.cs" />
<None Include="Alexandria.ruleset" />
<AdditionalFiles Include="..\stylecop.json">
Expand Down Expand Up @@ -137,7 +138,6 @@
<Compile Include="Searching\NumberSearchCriteriaType.cs" />
<Compile Include="Searching\SearchField.cs" />
<Compile Include="Searching\SortDirection.cs" />
<Compile Include="Utils\HtmlUtils.cs" />
<Compile Include="Model\Languages.cs" />
</ItemGroup>
<ItemGroup />
Expand Down
13 changes: 2 additions & 11 deletions Alexandria/Document.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@ namespace Alexandria
{
internal sealed class Document
{
public const string OptionsHtmlTag = "my_option";

readonly HtmlDocument _htmlDocument;

Document( string handle, Uri url, HtmlDocument document )
Expand All @@ -41,16 +39,9 @@ internal sealed class Document

public static Document ParseFromWebResult( Website website, string handle, WebResult result )
{
// There's some super-bizzaro thing with HtmlAgilityPack where it doesn't recognise </option>.
// http://stackoverflow.com/questions/293342/htmlagilitypack-drops-option-end-tags
// Just replacing the tag name altogether to something else, it doesn't matter, we're not going to pay attention
// to it right now.
const string OptionsOpenTagReplacement = "<" + OptionsHtmlTag + " ";
const string OptionsCloseTagReplacement = OptionsHtmlTag + ">";
string html = result.ResponseText.Replace( "<option ", OptionsOpenTagReplacement ).Replace( "option>", OptionsCloseTagReplacement );

HtmlDocument document = new HtmlDocument();
byte[] bytes = Encoding.UTF8.GetBytes( html );
byte[] bytes = Encoding.UTF8.GetBytes( result.ResponseText );

using ( Stream textStream = new MemoryStream( bytes ) )
{
document.Load( textStream, Encoding.UTF8 );
Expand Down
25 changes: 22 additions & 3 deletions Alexandria/Utils/HtmlUtils.cs → Alexandria/HtmlParserBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,31 @@
using System.Web;
using HtmlAgilityPack;

namespace Alexandria.Utils
namespace Alexandria
{
internal static class HtmlUtils
/// <summary>
/// A base class for any class that interacts with parsing HTML. This provides utility functions that make
/// parsing HTML much easier.
/// </summary>
internal abstract class HtmlParserBase
{
public static string ReadableInnerText( this HtmlNode node )
/// <summary>
/// Like <see cref="HtmlNode.InnerText"/>, it will return all of the visible text in this node, without any
/// HTML markup. However, unlike <see cref="HtmlNode.InnerText"/>, some HTML nodes will be interpreted before
/// they are stripped out (for example, &lt;br /&gt; will be removed and replaced with a linebreak). This will produce
/// text that has only visible characters and no HTML nodes, but which will also more closely visually resemble what
/// it looks like when the text is rendered.
/// </summary>
/// <param name="node">The HTML node whose text should be retrieved.</param>
/// <returns>If the node is null, or the node has no text, then this will return null. Otherwise, this will return a string
/// which contains all of the text with the inner HTML nodes stripped out of it.</returns>
protected static string GetReadableInnerText( HtmlNode node )
{
if ( node == null )
{
return null;
}

// Strip out all of the HTML tags, EXCEPT for <br> and <br /> and <p>, which should be transformed into newline characters
string innerHtml = node.InnerHtml;
StringBuilder builder = new StringBuilder( innerHtml.Length );
Expand Down
1 change: 0 additions & 1 deletion Alexandria/LibrarySource.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
using Alexandria.Net;
using Alexandria.RequestHandles;
using Alexandria.Searching;
using Alexandria.Utils;
using HtmlAgilityPack;

namespace Alexandria
Expand Down
2 changes: 1 addition & 1 deletion Alexandria/packages.config
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="HtmlAgilityPack" version="1.6.5" targetFramework="net452" />
<package id="HtmlAgilityPack" version="1.6.6" targetFramework="net452" />
<package id="StyleCop.Analyzers" version="1.0.2" targetFramework="net452" developmentDependency="true" />
<package id="System.IO.Abstractions" version="2.0.0.144" targetFramework="net452" />
</packages>

0 comments on commit 3691a30

Please sign in to comment.