Skip to content

Commit

Permalink
+ Refactored AO3Tag into AO3TagBase, which can be shared across AO3Ta…
Browse files Browse the repository at this point in the history
…g, AO3Ship, and AO3Character

+ Implemented AO3Tag using AO3TagBase
+ Implemented AO3Ship (for the first time!) using AO3TagBase
+ Added a TON of documentation to ITag and IShip and their concrete implementations
+ Added a couple of specific exceptions for parsing-related errors, and documentation to these as well
+ Added the `Website` enum, which lists all of the different websites that Alexandria supports, and replaced the `SourceHandle` field on `LibrarySource` with it.
+ Any exception thrown while Request()ing a RequestHandle that is not itself an AlexandriaException will now be wrapped in a new UnrecognizedFormatAlexandriaException, because such errors come from an operation that failed when the input data was not in the format we expected.
+ Added the "elements must be in order" StyleCop rule back to Alexandria. We've got a *fair* number of warnings right now, but we'll make a better codebase by the end of it.
  • Loading branch information
ahlec committed Nov 16, 2017
1 parent 11bf1e0 commit 8a3cf24
Show file tree
Hide file tree
Showing 17 changed files with 489 additions and 101 deletions.
1 change: 0 additions & 1 deletion Alexandria.ruleset
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@
<Rule Id="SA1118" Action="None" />
<Rule Id="SA1119" Action="None" />
<Rule Id="SA1200" Action="None" />
<Rule Id="SA1201" Action="None" />
<Rule Id="SA1309" Action="None" />
<Rule Id="SA1400" Action="None" />
<Rule Id="SA1402" Action="None" />
Expand Down
2 changes: 1 addition & 1 deletion Alexandria/AO3/AO3Source.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ public AO3Source( IWebClient webClient, Cache cache )
}

/// <inheritdoc />
public override string SourceHandle => "ao3";
public override Website Website => Website.AO3;

/// <inheritdoc />
public override LibrarySearch MakeSearch()
Expand Down
122 changes: 113 additions & 9 deletions Alexandria/AO3/Model/AO3Ship.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,135 @@

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using Alexandria.AO3.RequestHandles;
using Alexandria.Documents;
using Alexandria.Exceptions.Parsing;
using Alexandria.Model;
using Alexandria.RequestHandles;
using HtmlAgilityPack;

namespace Alexandria.AO3.Model
{
internal sealed class AO3Ship : IShip
/// <summary>
/// A concrete class for parsing a ship (a relationship tag) from AO3.
/// <para />
/// Unlike <see cref="AO3Tag"/>, which does not restrict the type of tag it parses, this class
/// only parses relationship tags: <see cref="Parse"/> throws an
/// <see cref="InvalidTagTypeAlexandriaException"/> when the page describes any other type of tag.
/// </summary>
internal sealed class AO3Ship : AO3TagBase, IShip
{
AO3Ship( Uri url )
static readonly Regex _matchingParentheses = new Regex( @"\([^\)]*\)" );

/// <summary>
/// Initializes a new instance of the <see cref="AO3Ship"/> class. The arguments are forwarded to
/// the base constructor, and then the ship type and the characters are derived from the ship's name.
/// </summary>
/// <param name="source">The source passed to the base constructor and used when building the
/// character request handles.</param>
/// <param name="url">The URL passed to the base constructor.</param>
/// <param name="mainDiv">The HTML node passed to the base constructor and searched for the
/// ship's characters.</param>
AO3Ship( AO3Source source, Uri url, HtmlNode mainDiv )
: base( source, url, mainDiv )
{
Url = url;
// Interpret the name with any parenthesised text stripped out of it.
string internalShipName = RemoveParentheses( Name );
// Type and Characters are only assigned when the name could be interpreted as a ship;
// otherwise neither of them is set by this constructor.
if ( TryDetermineShipTypeFromName( internalShipName, out ShipType type, out string[] characterNames ) )
{
Type = type;
Characters = GetCharacters( source, mainDiv, characterNames );
}
}

public Uri Url { get; }
/// <summary>
/// Gets the name of this ship exactly as it was requested. A ship can go by many valid
/// names, so while this is *A* valid name for the ship, it is not guaranteed to be the
/// canonical, "official" one.
/// </summary>
public string Name
{
    get { return Text; }
}

/// <summary>
/// Gets the type of the ship: whether this ship is platonic, romantic, or some other
/// type of relationship.
/// </summary>
/// <remarks>
/// The constructor only assigns this when the ship type could be determined from the ship's
/// name; otherwise it keeps the default value of <see cref="ShipType"/>.
/// </remarks>
public ShipType Type { get; }

/// <summary>
/// Gets a list of the characters that are involved in this ship. This is never null: it starts
/// out as an empty list and is only replaced when the constructor manages to determine the
/// characters from the ship's name.
/// </summary>
public IReadOnlyList<ICharacterRequestHandle> Characters { get; } = new ICharacterRequestHandle[0];

/// <summary>
/// Parses an HTML page into an instance of an <see cref="AO3Ship"/>.
/// </summary>
/// <param name="source">The source that the HTML page came from; it is handed on to the
/// <see cref="AO3Ship"/> constructor.</param>
/// <param name="document">The document that came from the website itself.</param>
/// <returns>An instance of <see cref="AO3Ship"/> that was parsed and configured using
/// the information provided.</returns>
/// <exception cref="InvalidTagTypeAlexandriaException">Thrown when the tag described by the
/// page is not of type <see cref="TagType.Relationship"/>.</exception>
internal static AO3Ship Parse( AO3Source source, HtmlCacheableDocument document )
{
HtmlNode mainDiv = GetMainDiv( document );

public string Name { get; private set; }
// Refuse anything that is not a relationship tag before a ship is constructed from it.
TagType type = ParseTagType( mainDiv );
if ( type != TagType.Relationship )
{
// The tag's text is only read here so that it can be reported in the exception.
string name = ParseTagText( mainDiv );
throw new InvalidTagTypeAlexandriaException( TagType.Relationship, type, name );
}

public ShipType Type { get; private set; }
return new AO3Ship( source, document.Url, mainDiv );
}

public IReadOnlyList<ICharacterRequestHandle> Characters { get; private set; }
/// <summary>
/// Removes every parenthesised segment matched by <c>_matchingParentheses</c> from a name and
/// trims the whitespace left at either end of the result.
/// </summary>
/// <param name="name">The name to strip parenthesised text from.</param>
/// <returns>The name without its parenthesised segments and without leading or trailing
/// whitespace.</returns>
static string RemoveParentheses( string name )
{
    string withoutParentheticals = _matchingParentheses.Replace( name, string.Empty );
    return withoutParentheticals.Trim();
}

internal static AO3Ship Parse( HtmlCacheableDocument document )
/// <summary>
/// Attempts to work out what kind of ship a name describes, and which characters it names,
/// from the separator used in the name. A <c>/</c> is treated as romantic, a <c>&amp;</c> as
/// platonic, and a standalone <c>x</c> word (compared case-insensitively) as romantic; the
/// separators are checked in that order.
/// </summary>
/// <param name="name">The ship name to inspect.</param>
/// <param name="type">Receives the type implied by the separator that was found, or
/// <see cref="ShipType.Unknown"/> when no separator was found.</param>
/// <param name="characterNames">Receives the character names found between the separators,
/// each trimmed of surrounding whitespace and with empty entries dropped, or <c>null</c> when
/// no separator was found.</param>
/// <returns><c>true</c> if a separator was found in the name; otherwise <c>false</c>.</returns>
static bool TryDetermineShipTypeFromName( string name, out ShipType type, out string[] characterNames )
{
    if ( name.Contains( "/" ) )
    {
        type = ShipType.Romantic;
        characterNames = SplitIntoTrimmedNames( name, '/' );
        return true;
    }

    if ( name.Contains( "&" ) )
    {
        type = ShipType.Platonic;
        characterNames = SplitIntoTrimmedNames( name, '&' );
        return true;
    }

    string[] namePieces = name.Split( ' ' );
    if ( namePieces.Any( IsStandaloneX ) )
    {
        // The name is known not to contain '/' at this point, so each standalone "x" can be
        // swapped for '/' and the regular splitter reused. This keeps multi-word names intact
        // and leaves the "x" itself out of the results.
        string slashSeparated = string.Join( " ", namePieces.Select( piece => IsStandaloneX( piece ) ? "/" : piece ) );

        type = ShipType.Romantic;
        characterNames = SplitIntoTrimmedNames( slashSeparated, '/' );
        return true;
    }

    type = ShipType.Unknown;
    characterNames = null;
    return false;
}

/// <summary>
/// Determines whether a single word of a ship name is the "x" separator.
/// </summary>
static bool IsStandaloneX( string piece )
{
    return piece.Equals( "x", StringComparison.OrdinalIgnoreCase );
}

/// <summary>
/// Splits a ship name on a separator, trimming each resulting name and dropping empty ones.
/// </summary>
static string[] SplitIntoTrimmedNames( string name, char separator )
{
    return name.Split( separator )
        .Select( piece => piece.Trim() )
        .Where( piece => piece.Length > 0 )
        .ToArray();
}

/// <summary>
/// Determines whether a tag's text, once its parenthesised segments have been removed, is one
/// of the given names.
/// </summary>
/// <param name="tagRequest">The tag whose text is checked.</param>
/// <param name="uniqueNames">The names to look the tag's text up in.</param>
/// <returns><c>true</c> if the stripped tag text is contained in <paramref name="uniqueNames"/>;
/// otherwise <c>false</c>.</returns>
static bool IsCharacterTag( ITagRequestHandle tagRequest, ICollection<string> uniqueNames )
    => uniqueNames.Contains( RemoveParentheses( tagRequest.Text ) );

/// <summary>
/// Builds a character request handle for every parent tag in <paramref name="mainDiv"/> whose
/// text (ignoring parenthesised segments) is one of the given character names.
/// </summary>
/// <param name="source">The source handed to each character request handle that is created.</param>
/// <param name="mainDiv">The HTML node that the parent tags are parsed from.</param>
/// <param name="characterNames">The character names to look for; they are compared using
/// <see cref="StringComparer.InvariantCultureIgnoreCase"/>.</param>
/// <returns>The request handles for the matching parent tags, in the order the parent tags
/// were parsed.</returns>
IReadOnlyList<ICharacterRequestHandle> GetCharacters( AO3Source source, HtmlNode mainDiv, IEnumerable<string> characterNames )
{
    HashSet<string> wantedNames = new HashSet<string>( characterNames, StringComparer.InvariantCultureIgnoreCase );

    return ParseParentTags( mainDiv )
        .Where( parentTag => IsCharacterTag( parentTag, wantedNames ) )
        .Select( parentTag => (ICharacterRequestHandle) new AO3CharacterRequestHandle( source, parentTag.Text ) )
        .ToList();
}
}
}
130 changes: 43 additions & 87 deletions Alexandria/AO3/Model/AO3Tag.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,105 +6,61 @@

using System;
using System.Collections.Generic;
using Alexandria.AO3.RequestHandles;
using Alexandria.Caching;
using Alexandria.Documents;
using Alexandria.Model;
using Alexandria.RequestHandles;
using Alexandria.Utils;
using HtmlAgilityPack;

namespace Alexandria.AO3.Model
{
internal sealed class AO3Tag : ITag
/// <summary>
/// A concrete class for parsing a tag from AO3.
/// <para />
/// This class can parse a ship or a character tag (because we don't always know what
/// a tag is ahead of time) whereas <seealso cref="AO3Ship"/> and <seealso cref="AO3Character"/>
/// cannot parse something that isn't exactly the type that they are.
/// </summary>
internal sealed class AO3Tag : AO3TagBase, ITag
{
AO3Tag( AO3Source source, Uri url )
/// <summary>
/// Initializes a new instance of the <see cref="AO3Tag"/> class. The arguments are forwarded to
/// the base constructor, and then the tag's type, parent tags and synonymous tags are read out
/// of <paramref name="mainDiv"/>.
/// </summary>
/// <param name="source">The source passed to the base constructor.</param>
/// <param name="url">The URL passed to the base constructor.</param>
/// <param name="mainDiv">The HTML node that the tag's details are parsed from.</param>
AO3Tag( AO3Source source, Uri url, HtmlNode mainDiv )
: base( source, url, mainDiv )
{
_source = source;
Url = url;
// Each property is parsed once, here, from the same node.
Type = ParseTagType( mainDiv );
ParentTags = ParseParentTags( mainDiv );
SynonymousTags = ParseSynonymousTags( mainDiv );
}

public Uri Url { get; }

public TagType Type { get; private set; }

public string Text { get; private set; }

public IReadOnlyList<ITagRequestHandle> ParentTags { get; private set; }

public IReadOnlyList<ITagRequestHandle> SynonymousTags { get; private set; }

public IQueryResultsPage<IFanfic, IFanficRequestHandle> QueryFanfics()
/// <summary>
/// Gets the type of the tag.
/// </summary>
public TagType Type { get; }

/// <summary>
/// Gets any parent tags that this tag might have (tags which conceptually would
/// encompass this tag alongside other tags, if this website supports that).
/// </summary>
public IReadOnlyList<ITagRequestHandle> ParentTags { get; }

/// <summary>
/// Gets any other tags which have the same meaning as this tag but which are perhaps
/// written a different way (for instance, a ship tag AAAA/BBBB might have a synonymous
/// tag of BBBB/AAAA).
/// </summary>
public IReadOnlyList<ITagRequestHandle> SynonymousTags { get; }

/// <summary>
/// Parses an HTML page into an instance of an <seealso cref="AO3Tag"/>.
/// </summary>
/// <param name="source">The source that the HTML page came from, which is then stored for
/// querying fanfics and also passed along to any nested request handles for them to parse
/// data with as well.</param>
/// <param name="document">The document that came from the website itself.</param>
/// <returns>An instance of <seealso cref="AO3Tag"/> that was parsed and configured using
/// the information provided.</returns>
public static AO3Tag Parse( AO3Source source, HtmlCacheableDocument document )
{
string endpointTag = Text.Replace( "/", "*s*" );
return AO3QueryResults.Retrieve( _source, CacheableObjects.TagFanficsHtml, "tags", endpointTag, 1 );
}

internal static AO3Tag Parse( AO3Source source, HtmlCacheableDocument document )
{
AO3Tag parsed = new AO3Tag( source, document.Url );

HtmlNode mainDiv = document.Html.SelectSingleNode( "//div[@class='tags-show region']" );

string mainContentPText = mainDiv.SelectSingleNode( "div[@class='tag home profile']/p" ).InnerText;
string mainContentPFirstSentence = mainContentPText.Substring( 0, mainContentPText.IndexOf( '.' ) );
int mainContentSentenceStartLength = "This tag belongs to the ".Length;
string textCategory = mainContentPFirstSentence.Substring( mainContentSentenceStartLength, mainContentPText.LastIndexOf( " Category", StringComparison.InvariantCultureIgnoreCase ) - mainContentSentenceStartLength );
switch ( textCategory )
{
case "Character":
{
parsed.Type = TagType.Character;
break;
}

case "Relationship":
{
parsed.Type = TagType.Relationship;
break;
}

case "Additional Tags":
{
parsed.Type = TagType.Miscellaneous;
break;
}

default:
{
throw new NotImplementedException();
}
}

parsed.Text = mainDiv.SelectSingleNode( ".//div[@class='primary header module']/h2" ).ReadableInnerText().Trim();

List<ITagRequestHandle> parentTags = new List<ITagRequestHandle>();
HtmlNode parentUl = mainDiv.SelectSingleNode( ".//div[@class='parent listbox group']/ul" );
if ( parentUl != null )
{
foreach ( HtmlNode li in parentUl.Elements( "li" ) )
{
parentTags.Add( new AO3TagRequestHandle( source, li.ReadableInnerText().Trim() ) );
}
}

parsed.ParentTags = parentTags;

List<ITagRequestHandle> synonymousTags = new List<ITagRequestHandle>();
HtmlNode synonymUl = mainDiv.SelectSingleNode( ".//div[@class='synonym listbox group']/ul" );
if ( synonymUl != null )
{
foreach ( HtmlNode li in synonymUl.Elements( "li" ) )
{
synonymousTags.Add( new AO3TagRequestHandle( source, li.ReadableInnerText().Trim() ) );
}
}

parsed.SynonymousTags = synonymousTags;

return parsed;
HtmlNode mainDiv = GetMainDiv( document );
return new AO3Tag( source, document.Url, mainDiv );
}

readonly AO3Source _source;
}
}
Loading

0 comments on commit 8a3cf24

Please sign in to comment.