In [1]:
#!/usr/bin/env python3

#!pip insall sitemap_tree_for_homepage
#!pip insall pandas

import csv, requests
from usp.tree import sitemap_tree_for_homepage
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

In [2]:
tree = sitemap_tree_for_homepage('https://ceaksan.com')

pageDetails = [[
    page.url,
    page.last_modified.isoformat('#','hours').split('#')[0] if page.last_modified else None,
    float(page.priority) if page.priority else None] for page in tree.all_pages()]

with open('pages.csv', 'w+', newline='') as fl:
    write = csv.writer(fl)
    write.writerow(['URL', 'LastModified', 'Priority'])
    write.writerows(pageDetails)

print(f'{len(pageDetails)} rows founded!')

2021-07-19 23:10:08,394 INFO usp.fetch_parse [43159/MainThread]: Fetching level 0 sitemap from https://ceaksan.com/robots.txt...
2021-07-19 23:10:08,396 INFO usp.helpers [43159/MainThread]: Fetching URL https://ceaksan.com/robots.txt...
2021-07-19 23:10:09,378 INFO usp.fetch_parse [43159/MainThread]: Parsing sitemap from URL https://ceaksan.com/robots.txt...
2021-07-19 23:10:09,379 INFO usp.fetch_parse [43159/MainThread]: Fetching level 0 sitemap from http://ceaksan.com/tr/sitemap.xml...
2021-07-19 23:10:09,380 INFO usp.helpers [43159/MainThread]: Fetching URL http://ceaksan.com/tr/sitemap.xml...
2021-07-19 23:10:15,667 INFO usp.fetch_parse [43159/MainThread]: Parsing sitemap from URL http://ceaksan.com/tr/sitemap.xml...
2021-07-19 23:10:15,810 INFO usp.fetch_parse [43159/MainThread]: Fetching level 0 sitemap from http://ceaksan.com/en/sitemap.xml...
2021-07-19 23:10:15,811 INFO usp.helpers [43159/MainThread]: Fetching URL http://ceaksan.com/en/sitemap.xml...
2021-07-19 23:10:17,767 IN

In [3]:
'''
with open('pages.csv') as csvFile:
    csvReader = csv.reader(csvFile, delimiter=',')
    print([row[0] for row in csvReader][1:10])
'''

"\nwith open('pages.csv') as csvFile:\n    csvReader = csv.reader(csvFile, delimiter=',')\n    print([row[0] for row in csvReader][1:10])\n"

In [4]:
df = pd.read_csv('pages.csv')
df.head()

Unnamed: 0,URL,LastModified,Priority
0,https://ceaksan.com/tr/posts/2020/05/2019-en-p...,2021-03-18,0.8
1,https://ceaksan.com/tr/about,2021-06-01,0.8
2,https://ceaksan.com/tr/posts/2018/12/absolute-...,2021-04-23,0.8
3,https://ceaksan.com/tr/posts/2021/02/acuity-sc...,2021-03-27,0.8
4,https://ceaksan.com/tr/posts/2019/04/ads-hesap...,2021-04-23,0.8


In [5]:
# https://pythontic.com/pandas/dataframe-attributes/introduction
print(df.shape)
print(df.columns)
print(df.describe)
print(df.info)

(2272, 3)
Index(['URL', 'LastModified', 'Priority'], dtype='object')
<bound method NDFrame.describe of                                                     URL LastModified  Priority
0     https://ceaksan.com/tr/posts/2020/05/2019-en-p...   2021-03-18       0.8
1                          https://ceaksan.com/tr/about   2021-06-01       0.8
2     https://ceaksan.com/tr/posts/2018/12/absolute-...   2021-04-23       0.8
3     https://ceaksan.com/tr/posts/2021/02/acuity-sc...   2021-03-27       0.8
4     https://ceaksan.com/tr/posts/2019/04/ads-hesap...   2021-04-23       0.8
...                                                 ...          ...       ...
2267  https://ceaksan.com/tr/posts/2019/04/youtube-d...   2021-03-18       0.8
2268  https://ceaksan.com/tr/posts/2020/06/youtube-k...   2021-04-14       0.8
2269  https://ceaksan.com/tr/posts/2019/07/z-shell-o...   2021-03-18       0.8
2270  https://ceaksan.com/tr/posts/2021/01/zoho-sale...   2021-03-27       0.8
2271  https://ceaksan.com/tr

In [6]:
def callbackResp(cll):
    response = cll.result()
    df['Status'] = response.status_code
    df['Redirected'] = response.is_redirect
    df['PermanentRedirection'] = response.is_permanent_redirect
    df['IssueReason'] = response.reason if response.reason != 'OK' else None


In [9]:
with ThreadPoolExecutor(max_workers=1) as executor:
    for url in df['URL']:
        cll = executor.submit(requests.head, url)
        cll.add_done_callback(callbackResp)
        if(cll.running()):
            print('Task 1 running')
            print()
        elif(cll.done()):
            print(cll.result())
            break

Task 1 running



In [None]:
df.head()

In [None]:
print(df.columns)
print(list(set(df['Priority'])))
print(list(set(df['Status'])))
print(list(set(df['Redirected'])))
print(list(set(df['PermanentRedirection'])))
print(list(set(df['IssueReason'])))

In [None]:
df.loc[df.Priority > 0.6]

In [None]:
# df['Status'].value_counts()
df[['URL', 'Status']].head(10)

In [None]:
df['Status'] = (pd
    .Categorical(
        df['Status'],
        categories=list(set(df['Status'])),
        ordered=False))

df['Priority'] = (pd
    .Categorical(
        df['Priority'],
        categories=[float(i/10) for i in range(0,10,1)],
        ordered=True))

df['Redirected'] = (pd
    .Categorical(
        df['Redirected'],
        categories=list(set(df['Redirected'])),
        ordered=False))

df['PermanentRedirection'] = (pd
    .Categorical(
        df['PermanentRedirection'],
        categories=list(set(df['PermanentRedirection'])),
        ordered=False))

In [None]:
df[['URL', 'Status','Priority']].sample(5)

In [None]:
df.to_csv('sitemap.zip', index=False, compression=dict(method='zip',archive_name='pages-new.csv'))