-
Notifications
You must be signed in to change notification settings - Fork 0
/
rebuild.sh
executable file
·132 lines (102 loc) · 3.35 KB
/
rebuild.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/bin/bash -l
function process_url {
  # Fetch one URL with the configured fetch command, then sleep for whatever
  # is left of the per-request time budget so the overall request rate stays
  # at roughly 1/$RATE requests per second.
  # Globals:   BROWSER_INIT (read), RATE (read), urlscount (incremented)
  # Arguments: $1 - URL to fetch
  local url="$1"
  local start_time end_time sleep_time
  # Skip spec/standards hosts that commonly appear inside sitemap XML markup.
  if [[ $url == *sitemaps.org* || $url == *w3.org* || $url == *google.com* ]]; then
    return
  fi
  echo "Grabbing $url ..."
  start_time=$(date +%s.%N)
  # $BROWSER_INIT is intentionally unquoted: it holds a command plus flags.
  time $BROWSER_INIT "$url" || echo ""
  echo ""
  end_time=$(date +%s.%N)
  # Remaining budget = RATE - elapsed.  awk replaces bc for the float math:
  # bc is often not installed, awk is POSIX-mandatory.
  sleep_time=$(awk -v s="$start_time" -v e="$end_time" -v r="$RATE" \
    'BEGIN { printf "%.4f", r - (e - s) }')
  # Only sleep when the request did not already consume the whole budget.
  if awk -v t="$sleep_time" 'BEGIN { exit !(t > 0) }'; then
    sleep "$sleep_time"
  fi
  urlscount=$((urlscount + 1))
}
get_protocol_and_domain() {
  # Reduce a URL to "<protocol><domain>" (e.g. https://example.com), dropping
  # any path/query.  If the URL has no "proto://" prefix, only the domain part
  # is printed.  Uses bash regex/parameter expansion instead of forking
  # grep/sed three times.
  # Arguments: $1 - URL
  # Outputs:   protocol+domain on stdout
  local url=$1
  local protocol=""
  local rest=$url
  # Protocol: leading run of chars containing no ':' or '/', then "://".
  if [[ $url =~ ^([^:/]*://) ]]; then
    protocol=${BASH_REMATCH[1]}
  fi
  # Strip the "proto://" prefix (leading non-slash chars followed by "//").
  if [[ $rest =~ ^[^/]*// ]]; then
    rest=${rest#"${BASH_REMATCH[0]}"}
  fi
  # Domain ends at the first remaining '/'.
  echo "${protocol}${rest%%/*}"
}
# ---------------------------------------------------------------------------
# Main: discover sitemap URLs (from INPUT_SITEMAP_URL, or from
# INPUT_ROBOTS_URL_PREFIX/robots.txt), then warm the cache by requesting the
# homepage and every URL listed in every sitemap, rate-limited.
# Required env: INPUT_SITEMAP_URL or INPUT_ROBOTS_URL_PREFIX
# Optional env: INPUT_USE_WGET, INPUT_RATE_LIMITATION, GITHUB_OUTPUT
# ---------------------------------------------------------------------------
SITEMAP_URLS=()
if [[ -n "${INPUT_SITEMAP_URL}" ]]; then
  if ! curl -sfL "$INPUT_SITEMAP_URL" > /dev/null; then
    echo "Root sitemap URL returned HTTP error!"
    exit 1
  fi
  SITEMAP_URLS+=("$INPUT_SITEMAP_URL")
  homepage_url=$(get_protocol_and_domain "$INPUT_SITEMAP_URL")
elif [[ -n "${INPUT_ROBOTS_URL_PREFIX}" ]]; then
  # Fetch robots.txt separately so curl's own exit status is what we check.
  # The old one-liner piped curl into grep|awk, so $? was awk's status and a
  # failed download was never detected.
  if ! robots_content=$(curl -sfL "${INPUT_ROBOTS_URL_PREFIX}/robots.txt"); then
    echo "robots.txt was not found under ${INPUT_ROBOTS_URL_PREFIX}!"
    exit 1
  fi
  # Word-splitting on the command substitution is intended: one URL per entry.
  SITEMAP_URLS=($(echo "$robots_content" | grep -o '^Sitemap:[[:space:]]*.*' | awk '{print $2}'))
  homepage_url=$INPUT_ROBOTS_URL_PREFIX
else
  echo "You have to provide at least one of: sitemap_url, robots_url_prefix"
  exit 1
fi

# Choose the fetch command.  wget mirrors page requisites; curl just GETs.
case "${INPUT_USE_WGET}" in
  1|on|true|TRUE)
    # NB: '--recursive' fixed — the old single-dash '-recursive' was parsed by
    # wget as bundled short options (effectively '-r -e cursive ...').
    BROWSER_INIT="wget --recursive --level=1 --page-requisites -e robots=off -q"
    ;;
  *)
    BROWSER_INIT="curl -sL -o /dev/null"
    ;;
esac

# RATE is the per-request time budget in seconds (1 / requests-per-second).
if [[ -n "${INPUT_RATE_LIMITATION}" ]]; then
  # Validate as a positive integer up front instead of relying on expr noise.
  if [[ "$INPUT_RATE_LIMITATION" =~ ^[0-9]+$ ]] && (( INPUT_RATE_LIMITATION > 0 )); then
    RATE=$(awk -v n="$INPUT_RATE_LIMITATION" 'BEGIN { printf "%.4f", 1 / n }')
  else
    echo "RATE_LIMITATION has an invalid value: $INPUT_RATE_LIMITATION"
    exit 1
  fi
else
  RATE=0.25  # default budget: 4 requests per second
fi

# Expand sitemap indexes one level: any *.xml URL referenced inside a sitemap
# is itself treated as a sitemap.
new_sitemaps=()
for SITEMAP_FILE in "${SITEMAP_URLS[@]}"; do
  sitemap_content=$(curl -sL "$SITEMAP_FILE")
  more_sitemaps=($(echo "$sitemap_content" | grep -oE "https?://[^[:space:]]+\.xml"))
  new_sitemaps+=("${more_sitemaps[@]}")
done
SITEMAP_URLS+=("${new_sitemaps[@]}")

echo "Will process sitemaps:"
for url in "${SITEMAP_URLS[@]}"; do
  echo "$url"
done

# Initialize counts reported back to the workflow.
sitemapscount="${#SITEMAP_URLS[@]}"
urlscount=0
TIMEFORMAT="%Es"

# Warm the homepage first, then every URL from every sitemap.
process_url "$homepage_url"
for SITEMAP_FILE in "${SITEMAP_URLS[@]}"; do
  echo "Processing URLs in $SITEMAP_FILE ..."
  if ! sitemap_content=$(curl -sfL "${SITEMAP_FILE}"); then
    echo "Sitemap URL returned HTTP error code!"
    continue
  fi
  echo ""
  # Extract every http(s) URL from the sitemap XML (everything up to '<').
  urls=$(echo "$sitemap_content" | grep -oE 'http[s]?://[^<]+')
  for url in $urls; do
    process_url "$url"
  done
done

echo "Completed processing sitemaps."
# Publish results for later workflow steps via GITHUB_OUTPUT.
echo "sitemapscount=$sitemapscount" >> "$GITHUB_OUTPUT"
echo "urlscount=$urlscount" >> "$GITHUB_OUTPUT"
echo "sitemap_urls=\"${SITEMAP_URLS[@]}\"" >> "$GITHUB_OUTPUT"
echo "$GITHUB_OUTPUT"