Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
bf74793
SOLR-15557 Script to generate redirects
janhoy Feb 4, 2022
2cbfc75
Add the input files, and some docs
janhoy Feb 4, 2022
9a6ab26
Re-phrasing
janhoy Feb 4, 2022
598f509
Used correct old-guide.txt
janhoy Feb 4, 2022
2038e7b
Add back script
janhoy Feb 4, 2022
7ec7040
Add the resulting RewriteRules to this PR so it is easier to do itera…
janhoy Feb 4, 2022
b81232f
Added support for 8_11 mappings. Did some more mappings from spreadsheet
janhoy Feb 4, 2022
c871250
Use relative file names. Change name of directory
janhoy Feb 6, 2022
babaa28
Relative file names
janhoy Feb 6, 2022
825d5ea
404
janhoy Feb 6, 2022
7a246a1
Rework back to root-htaccess file
janhoy Feb 7, 2022
ce26d5f
Review feedback, adding /solr/ and /latest/ levels
janhoy Feb 7, 2022
171922e
Add 301 to RedirectMatch
janhoy Feb 8, 2022
09b09b4
Merge remote-tracking branch 'apache/main' into pr/596
HoustonPutman May 11, 2022
b266509
Remove old pages
HoustonPutman May 11, 2022
e268183
Find solutions for remaining dangling redirects, add some info to ins…
HoustonPutman May 11, 2022
78ecec9
Add no-robots header for old ref-guide-versions, except terminal 8_11…
HoustonPutman May 11, 2022
3a781d7
The no-robots rule should now work
HoustonPutman May 11, 2022
b857144
Make 'latest' remain in URL instead of `9_0` (#846)
janhoy May 11, 2022
9502245
DOAP changes for release 9.0.0
janhoy May 11, 2022
eb3c0c2
Sync CHANGES for 9.0.0
janhoy May 12, 2022
d56ec0c
Update with bugfix for RewriteRule ^/guide -> RewriteRule ^guide
janhoy May 12, 2022
132d8f4
Merge branch 'main' into SOLR-15557-script-to-generate-redirects
janhoy May 12, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 155 additions & 0 deletions dev-tools/scripts/refguide/gen-refguide-redirects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Simple script that converts old ref guide page names (as of 8.11.1) to the new Antora URLs introduced in 9.0.
See the input files in the folder gen-refguide-redirects/.

The old-guide.txt is the plain .adoc names from an 'ls | grep adoc' in old ref-guide src folder
The new-guide.txt is the output from this command from the new repo in the 'modules' folder:
find . | grep adoc | sed 's/\/pages//g' | sed 's/^.\///g'
The mappings.csv comes from the explicit page renamings sourced from spreadsheet
https://docs.google.com/spreadsheets/d/1mwxSpn5Ky7-P4DLFrJGel2h7Il4muTlHmAA-AuRY1rs/edit#gid=982988701
"""

import os
import sys
from pprint import pprint
sys.path.append(os.path.dirname(__file__))
import argparse


def read_config():
    """Parse the command line and return the resulting argparse namespace.

    Required options are ``--old``, ``--new`` and ``--mapping`` (each a file
    name); ``--htaccess`` is an optional flag that defaults to ``False``.
    """
    parser = argparse.ArgumentParser(description='Convert old refguide page names to new')
    # The three input files are all mandatory and declared identically.
    required_inputs = (
        ('--old', 'Old pagenames file, one .adoc filename per line'),
        ('--new', 'New pagenames file, one .adoc filename per line'),
        ('--mapping', 'Semicolon separated from-to file names (adoc)'),
    )
    for flag, help_text in required_inputs:
        parser.add_argument(flag, required=True, help=help_text)
    parser.add_argument('--htaccess', action='store_true', default=False, help='Output as htaccess rules')
    return parser.parse_args()


def out(text):
    """Print *text* unless htaccess output was requested.

    Reads the module-level ``conf`` namespace, which must have been assigned
    before the first call. Presumably this keeps progress messages out of the
    generated htaccess rules, which are also written to stdout.
    """
    if conf.htaccess:
        return
    print(text)


def lines_from_file(filename):
    """Return the meaningful lines of *filename* as a list of strings.

    Blank lines and comment lines (first non-blank character is ``#``) are
    skipped. Every remaining line is stripped of surrounding whitespace and
    each ``.adoc`` occurrence in it is replaced by ``.html``.

    The file is decoded as UTF-8 rather than with the platform default so the
    result does not depend on the locale of the machine running the script.
    """
    lines = []
    with open(filename, 'r', encoding='utf-8') as fp:
        for raw_line in fp:
            entry = raw_line.strip()
            # Test the stripped text so an indented "#" line is still a comment.
            if not entry or entry.startswith("#"):
                continue
            lines.append(entry.replace(".adoc", ".html"))
    return lines


def main():
    """Classify every old ref guide page and print the redirect table.

    Reads the three input files named on the command line, then sorts each old
    page name into exactly one bucket:

    * ``regex_new`` -- the same file name exists in the new guide; names are
      grouped per new sub path so one RedirectMatch per sub path covers them.
    * ``result``    -- the page has an explicit entry in the mapping file that
      resolves to a new page, an ``https://`` URL or a ``/guide/`` path.
    * ``old_guide`` -- the mapping target is ``_8_11``; redirected to 8_11.
    * ``failed``    -- no usable target; only reported, never redirected.

    With ``--htaccess`` the buckets are printed as Apache rules, otherwise
    they are pretty-printed together with progress messages.

    Raises ValueError when a line of the new-pages file does not contain
    exactly one ``/`` or a mapping line does not contain exactly one ``;``.
    """
    global conf  # read by out()
    conf = read_config()

    new = {}
    name_map = {}

    out("Reading config")
    old = lines_from_file(conf.old)
    # Index the new pages by bare file name; the value keeps "<subpath>/<file>".
    for line in lines_from_file(conf.new):
        _subpath, page = line.split("/")
        new[page] = line
    for line in lines_from_file(conf.mapping):
        frm, to = line.split(";")
        name_map[frm] = to

    # Files in src/old-pages as of 2022-02-04
    old_pages = [
        "configuration-apis.html", "configuration-guide.html", "controlling-results.html",
        "deployment-guide.html", "enhancing-queries.html", "field-types.html",
        "fields-and-schema-design.html", "getting-started.html", "indexing-data-operations.html",
        "installation-deployment.html", "monitoring-solr.html", "query-guide.html",
        "scaling-solr.html", "schema-indexing-guide.html", "solr-concepts.html",
        "solr-schema.html", "solrcloud-clusters.html", "user-managed-clusters.html",
    ]

    result = {}
    old_guide = []
    failed = {}
    regex_new = {}
    out("Converting...")
    for frm in old:
        if frm in new:
            # Same file name in the new guide: only the sub path was added.
            subpath, name = new[frm].split("/")
            regex_new.setdefault(subpath, []).append(name.split(".html")[0])
        elif frm in name_map:
            new_name = name_map[frm]
            new_name_without_anchor = new_name
            anchor = ""
            anchor_index = new_name.find("#")
            if anchor_index > 0:
                new_name_without_anchor = new_name[:anchor_index]
                anchor = new_name[anchor_index:]
            if new_name_without_anchor.startswith("https://"):
                # External target, used verbatim (anchor included).
                result[frm] = new_name
            elif new_name_without_anchor in new:
                result[frm] = new[new_name_without_anchor] + anchor
            elif new_name_without_anchor.startswith("/guide/"):
                # Keep only what follows the "/guide/" prefix (anchor included).
                result[frm] = new_name[len("/guide/"):]
            elif new_name_without_anchor == "_8_11":
                old_guide.append(frm.split(".html")[0])
            else:
                failed[frm] = "Mapped value %s not in new guide" % new_name_without_anchor
        elif frm in old_pages:
            failed[frm] = "Not yet mapped (in src/old-pages)"
        else:
            failed[frm] = "404"

    if conf.htaccess:
        # Raw strings below: "\." must reach Apache as a literal backslash-dot
        # and is not a valid Python escape sequence in a normal literal.
        print("# Existing pages moved to sub path")
        for key in regex_new:
            print(r"RedirectMatch 301 ^/guide/(%s)\.html /guide/solr/latest/%s/$1.html" % ("|".join(regex_new[key]), key))
        print("# Page renames in 9.0")
        for key in result:
            if result[key].startswith("https://"):
                print("RewriteRule ^guide/%s %s [R=301,NE,L]" % (key, result[key]))
            else:
                print("RewriteRule ^guide/%s /guide/solr/latest/%s [R=301,NE,L]" % (key, result[key]))
        print("# Removed pages redirected to latest 8.x guide")
        old_version_pages_regex = r"(%s)\.html" % "|".join(old_guide)
        print("RedirectMatch 301 ^/guide/%s /guide/8_11/$1.html" % old_version_pages_regex)
        print("# Paths we could not map")
        for key in failed:
            print("# %s: %s" % (key, failed[key]))

        # "%%" yields a literal "%" for Apache; the single "%s" receives the
        # same page regex that was used for the 8_11 redirect above.
        print("""

# Do not index old reference guide pages on search engines, except for pages that don't exist in 9+
<If "%%{REQUEST_URI} =~ m#^/guide/(6|7|8)_.*#">
<If "%%{REQUEST_URI} !~ m#^/guide/8_11/%s$#">
Header set X-Robots-Tag "noindex,nofollow,noarchive"
</If>
</If>""" % old_version_pages_regex)
    else:
        out("Regex mappings:")
        pprint(regex_new)
        out("Rename mappings:")
        pprint(result)
        out("Old refGuide mappings:")
        pprint(old_guide)
        out("Failed mappings:")
        pprint(failed)


if __name__ == '__main__':
    # Script entry point: run the conversion and treat Ctrl-C as a quiet exit
    # instead of a traceback.
    try:
        main()
    except KeyboardInterrupt:
        # stdout carries the generated rules, so the notice goes to stderr.
        print('\nReceived Ctrl-C, exiting early', file=sys.stderr)
Loading