Skip to content

Commit

Permalink
Factor parseDomainFromURL from MistBlock into StringUtils
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.opensource.yandex.net/xscript/trunk@718 b01ef89b-65f2-463d-9415-e8412542ae63
  • Loading branch information
bacek committed Jan 28, 2009
1 parent 76a2a23 commit 7fe3dd0
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 16 deletions.
9 changes: 9 additions & 0 deletions include/xscript/string_utils.h
Expand Up @@ -3,6 +3,7 @@

#include <string>
#include <ostream>
#include <boost/cstdint.hpp> // for boost::int32_t

#include <xscript/range.h>

Expand Down Expand Up @@ -38,6 +39,14 @@ namespace StringUtils {
std::string tolower(const std::string& str);
std::string toupper(const std::string& str);
const char* nextUTF8(const char* data);

/**
 * Extract the domain (host name) part of a URL.
 *
 * The scheme ("xxx://"), the port, the path and the query string are
 * discarded; what remains is treated as the domain.
 *
 * \param url - url to parse. Taken by value: the implementation trims its
 *              own copy, the caller's string is never modified.
 * \param level - [optional] number of trailing domain labels to keep
 *              (1 keeps only the last label, 2 the last two, ...).
 *              If level == 0, or level exceeds the number of labels,
 *              the full domain is returned.
 * \return domain part of url, cut to the requested level.
 * \throw std::invalid_argument if level is negative, or if the extracted
 *              domain is empty, starts or ends with '.', contains two
 *              consecutive dots, or its last label starts with a digit.
 */
std::string parseDomainFromURL(std::string url, boost::int32_t level = 0);
};


Expand Down
61 changes: 61 additions & 0 deletions library/string_utils.cpp
@@ -1,6 +1,7 @@
#include <libxml/tree.h>
#include <stdexcept>
#include <memory>
#include <algorithm>
#include "xscript/string_utils.h"
#include "xscript/encoder.h"
#include "xscript/algorithm.h"
Expand Down Expand Up @@ -156,4 +157,64 @@ StringUtils::nextUTF8(const char* data) {
return data + NEXT_UTF8[static_cast<unsigned char>(*data)];
}

/**
 * Extract the host name from a URL and optionally cut it to the last
 * \a level labels.
 *
 * Processing order matters: query and fragment are removed first so that
 * "://", '@' or ':' occurring inside them can never be mistaken for URL
 * structure; then scheme, path, userinfo and port are stripped in turn.
 *
 * \param url - url to parse; a private copy that is trimmed in place.
 * \param level - number of trailing labels to keep. 0 (the default) or a
 *              value larger than the number of labels keeps the full domain.
 * \return the (possibly shortened) domain.
 * \throw std::invalid_argument if \a level is negative, or if the extracted
 *              domain is empty, starts or ends with '.', contains an empty
 *              label ("..") or its last label starts with a digit.
 */
std::string StringUtils::parseDomainFromURL(std::string url, boost::int32_t level) {

    if (0 > level) {
        throw std::invalid_argument("bad param: level");
    }

    // Query and fragment carry no host information; drop both up front.
    std::string::size_type pos = url.find_first_of("?#");
    if (std::string::npos != pos) {
        url.erase(pos);
    }

    // "://" is a scheme separator only if no '/' precedes it; otherwise it
    // is part of the path (e.g. "example.com/a://b") and must be left alone.
    pos = url.find("://");
    if (std::string::npos != pos && url.find('/') > pos) {
        url.erase(0, pos + 3);
    }

    // Keep the authority component only.
    pos = url.find('/');
    if (std::string::npos != pos) {
        url.erase(pos);
    }

    // Strip "user:password@" so that userinfo is never returned as the host.
    pos = url.rfind('@');
    if (std::string::npos != pos) {
        url.erase(0, pos + 1);
    }

    // Strip ":port".
    pos = url.find(':');
    if (std::string::npos != pos) {
        url.erase(pos);
    }

    // Reject empty domains and empty labels (leading, trailing or doubled dot).
    if (url.empty() || '.' == *url.begin() || '.' == *url.rbegin() ||
        std::string::npos != url.find("..")) {
        throw std::invalid_argument("bad param: domain='" + url + "'");
    }

    // The last label must not start with a digit
    // (NOTE(review): presumably this is how numeric IPv4 hosts are rejected).
    const std::string::size_type last_dot = url.rfind('.');
    const char tld_first = url[std::string::npos == last_dot ? 0 : last_dot + 1];
    if (tld_first >= '0' && tld_first <= '9') {
        throw std::invalid_argument("bad param: domain='" + url + "'");
    }

    //if (max < level - 1) {
    //    log()->warn("max available domain level is less than required mist:set_state_domain");
    //}

    // Trim only when fewer labels are requested than the domain has
    // (a domain with N dots has N + 1 labels).
    const std::string::size_type dots =
        static_cast<std::string::size_type>(std::count(url.begin(), url.end(), '.'));
    if (0 < level && static_cast<std::string::size_type>(level) <= dots) {
        // Walk `level` dots back from the end; everything up to and
        // including the last dot found is cut off.
        std::string::size_type cut = std::string::npos;
        for (boost::int32_t i = 0; i < level; ++i) {
            cut = url.rfind('.', cut - 1);
        }
        url.erase(0, cut + 1);
    }

    return url;
}

}
77 changes: 61 additions & 16 deletions tests/test_string.cpp
Expand Up @@ -10,6 +10,8 @@
#include <dmalloc.h>
#endif

using namespace xscript;

class StringTest : public CppUnit::TestFixture {
public:
void testAmp();
Expand All @@ -24,6 +26,10 @@ class StringTest : public CppUnit::TestFixture {
void testUrldecodeEmpty();
void testUrldecodeLatin();
void testUrldecodeBadSuffix();
void testParseDomain();
void testParseDomainEmpty();
void testParseDomainFile();
void testParseDomainInvalid();

private:
CPPUNIT_TEST_SUITE(StringTest);
Expand All @@ -39,6 +45,12 @@ class StringTest : public CppUnit::TestFixture {
CPPUNIT_TEST(testUrldecodeEmpty);
CPPUNIT_TEST(testUrldecodeLatin);
CPPUNIT_TEST(testUrldecodeBadSuffix);

CPPUNIT_TEST(testParseDomain);
CPPUNIT_TEST_EXCEPTION(testParseDomainEmpty, std::exception);
CPPUNIT_TEST_EXCEPTION(testParseDomainFile, std::exception);
CPPUNIT_TEST_EXCEPTION(testParseDomainInvalid, std::exception);

CPPUNIT_TEST_SUITE_END();
};

Expand Down Expand Up @@ -89,10 +101,9 @@ StringTest::testEscape() {
CPPUNIT_ASSERT_EQUAL(std::string("&lt;td colspan=&quot;2&quot;&gt;"), XmlUtils::escape("<td colspan=\"2\">"));
}


void
StringTest::testParams() {

using namespace xscript;
std::vector<StringUtils::NamedValue> v;

std::string str("test=yes&successful=try%20again");
Expand All @@ -107,57 +118,91 @@ StringTest::testParams() {

/**
 * Checks that urlencode() turns an 8-bit sample into the expected %XX form.
 * The sample bytes are written as hex escapes so that the test does not
 * depend on the encoding the source file happens to be saved in.
 */
void
StringTest::testUrlencode() {

    using namespace xscript;
    std::string str("\xD2\xC1\xDA \xC4\xD7\xC1 \xD4\xD2\xC9 \xDE\xC5\xD4\xD9\xD2\xC5 \xD0\xD1\xD4\xD8");
    const std::string res = StringUtils::urlencode(str);
    CPPUNIT_ASSERT_EQUAL(std::string("%D2%C1%DA%20%C4%D7%C1%20%D4%D2%C9%20%DE%C5%D4%D9%D2%C5%20%D0%D1%D4%D8"), res);
}

/** urlencode() of an empty string is expected to yield an empty string. */
void
StringTest::testUrlencodeEmpty() {

    using namespace xscript;
    std::string source;
    const std::string encoded = StringUtils::urlencode(source);
    CPPUNIT_ASSERT_EQUAL(std::string(""), encoded);
}

/** In plain ASCII input only the space is expected to be escaped (as %20). */
void
StringTest::testUrlencodeLatin() {

    using namespace xscript;
    std::string source("abcd efgh");
    const std::string encoded = StringUtils::urlencode(source);
    CPPUNIT_ASSERT_EQUAL(std::string("abcd%20efgh"), encoded);
}

/**
 * Checks that urldecode() restores the 8-bit bytes from their %XX form.
 * The expected bytes are written as hex escapes so that the test does not
 * depend on the encoding the source file happens to be saved in.
 */
void
StringTest::testUrldecode() {

    using namespace xscript;
    std::string str("%D2%C1%DA%20%C4%D7%C1%20%D4%D2%C9%20%DE%C5%D4%D9%D2%C5%20%D0%D1%D4%D8");
    const std::string res = StringUtils::urldecode(str);
    CPPUNIT_ASSERT_EQUAL(std::string("\xD2\xC1\xDA \xC4\xD7\xC1 \xD4\xD2\xC9 \xDE\xC5\xD4\xD9\xD2\xC5 \xD0\xD1\xD4\xD8"), res);

}

/** urldecode() of an empty string is expected to yield an empty string. */
void
StringTest::testUrldecodeEmpty() {

    using namespace xscript;
    std::string source;
    const std::string decoded = StringUtils::urldecode(source);
    CPPUNIT_ASSERT_EQUAL(std::string(""), decoded);
}

/** "%20" inside plain ASCII input is expected to decode to a single space. */
void
StringTest::testUrldecodeLatin() {

    using namespace xscript;
    std::string source("abcd%20efgh");
    const std::string decoded = StringUtils::urldecode(source);
    CPPUNIT_ASSERT_EQUAL(std::string("abcd efgh"), decoded);
}

/**
 * A lone '%' at the very end of the input (no hex digits after it) is
 * expected to be passed through unchanged.
 */
void
StringTest::testUrldecodeBadSuffix() {

    using namespace xscript;
    std::string source("abcd%20efgh%");
    const std::string decoded = StringUtils::urldecode(source);
    CPPUNIT_ASSERT_EQUAL(std::string("abcd efgh%"), decoded);
}

/**
 * Happy-path checks for StringUtils::parseDomainFromURL: with and without
 * a scheme, with a port and query string, and with explicit level cuts.
 */
void
StringTest::testParseDomain() {
    const std::string withScheme(
        "http://hghltd.yandex.net:1234/yandbtm?url=http%3A%2F%2Fwww.yandex.ru%2F&amp;text=%FF%ED%E4%E5%EA%F1");
    const std::string withoutScheme(
        "www.yandex.ru:8090/yandbtm?url=http%3A%2F%2Fwww.yandex.ru%2F&amp;text=%FF%ED%E4%E5%EA%F1");

    // Default level: the whole domain is returned.
    const std::string full = StringUtils::parseDomainFromURL(withScheme);
    CPPUNIT_ASSERT_EQUAL(std::string("hghltd.yandex.net"), full);

    // Level 1: only the last label is kept.
    const std::string topLevel = StringUtils::parseDomainFromURL(withScheme, 1);
    CPPUNIT_ASSERT_EQUAL(std::string("net"), topLevel);

    // No scheme and no level.
    const std::string schemeless = StringUtils::parseDomainFromURL(withoutScheme);
    CPPUNIT_ASSERT_EQUAL(std::string("www.yandex.ru"), schemeless);

    // Level 2 with a scheme.
    const std::string secondLevel = StringUtils::parseDomainFromURL("http://www.yandex.ru/", 2);
    CPPUNIT_ASSERT_EQUAL(std::string("yandex.ru"), secondLevel);

    // Level 2 without a scheme.
    const std::string secondLevelBare = StringUtils::parseDomainFromURL("www.yandex.ru/", 2);
    CPPUNIT_ASSERT_EQUAL(std::string("yandex.ru"), secondLevelBare);
}

/**
 * Feeds an empty URL to parseDomainFromURL. The suite registers this test
 * with CPPUNIT_TEST_EXCEPTION, so it passes only if std::exception is thrown.
 */
void
StringTest::testParseDomainEmpty() {
    const std::string emptyUrl;
    StringUtils::parseDomainFromURL(emptyUrl);
}

/**
 * Feeds a file:// URL (which has no host) to parseDomainFromURL. The suite
 * registers this test with CPPUNIT_TEST_EXCEPTION, so it passes only if
 * std::exception is thrown.
 */
void
StringTest::testParseDomainFile() {
    const std::string fileUrl("file:///home/bacek/bad/bad/boy.xml");
    StringUtils::parseDomainFromURL(fileUrl);
}

/**
 * Feeds a URL whose host starts with a dot to parseDomainFromURL. The suite
 * registers this test with CPPUNIT_TEST_EXCEPTION, so it passes only if
 * std::exception is thrown.
 */
void
StringTest::testParseDomainInvalid() {
    const std::string leadingDotUrl("http://.www.yandex.ru/index.html");
    StringUtils::parseDomainFromURL(leadingDotUrl);
}

0 comments on commit 7fe3dd0

Please sign in to comment.