# Sitesmap-gather
Get a sites xml sitemap files and save locally as .csv files

Note: I am learning Python here. It's my first thing. Don't laugh.

Documentation / comments are intermittent, sporadic and anemic.

In [None]:
import pandas as pd
import urllib.request
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests # for downloading sitemap files
import os, glob # to check current directory, to combine csv files
from lxml import etree
from datetime import datetime # so we can get some time info for debugging and progress indication
from datetime import date


## ToDo

- [x] put all files into subdir with same name as domain
- [x] add user agent to avoid refusal of download
    - credits:
    - https://www.shellhacks.com/python-requests-user-agent-web-scraping/
- [x] add date to subdir name to avoid overwriting
- [ ] auto check if online xml or local csv, so we don't have to specify that ourselves
- [ ] auto check if sitemap index file or not
- [x] merge csv files
    - credits:
    -  https://softhints.com/how-to-merge-multiple-csv-files-with-python/
- [ ] delete .xml files
- [ ] add UI
    - [ ] user selectable directory to save csv's in
- [ ] recursion - submit a list of sitemap index files and get everything within them
- [ ] bug: UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 1112525: character maps to <undefined>
    - during https://www.tv2.no/sitemap/sitemap.xml

# Defining Our Methods

## Download an xml sitemap file

In [None]:
# Download a sitemap-file
# Modified to set user agent to a desktop browser, since some sites refuse to serve it's sitemaps otherwise

def download_sitemap(url,fileToDownload, debug=None):

    headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0' 
    }
    
    subDirToCreate = file_info(url,path=True) # maybe we need to create a subdir based on the domain name from the sitemap, checking if subdir exists is done by create_subdir
    create_dir(subDirToCreate)
    
    r = requests.get(url, allow_redirects=True, headers=headers)
    open(fileToDownload, 'wb').write(r.content)




## Read the XML from downloaded sitemap file

In [None]:
# Get XML from local sitemap-file
def get_xml(fileToRead):
    xml = BeautifulSoup(open(fileToRead).read())
    return xml

## Test for sitemap index file or regular sitemap file

In [None]:
# We check whether the sitemap is an index sitemap, or regular sitemap
# This seems slow to me, as it has xml.find_all, but it might be performant for all I know

def get_sitemap_type(xml):
    sitemapindex = xml.find_all('sitemapindex')
    sitemap = xml.find_all('urlset')
    
    if sitemapindex:
        return 'sitemapindex'
    elif sitemap:
        return 'urlset'
    else:
        return

## Load XMl into padas dataframe

In [None]:
# Load XML into a pandas dataframe
def sitemap_to_dataframe(xml, name=None, data=None, verbose=False):
    df = pd.DataFrame(columns=['domain','sitemap_name','loc'])
    
    urls = xml.find_all("url")
    
    for x in urls:

        if xml.find("loc"):
            loc = x.findNext("loc").text
            parsed_uri = urlparse(loc)
            domain = '{uri.netloc}'.format(uri=parsed_uri)
        else:
            loc = ''
            domain = ''

#        if xml.find("changefreq"):
#            changefreq = x.findNext("changefreq").text
#        else:
#            changefreq = ''

#        if xml.find("priority"):
#            priority = x.findNext("priority").text
#        else:
#            priority = ''

        if name:
            sitemap_name = name
        else:
            sitemap_name = ''
              
        row = {
            'domain': domain,
            'loc': loc,
#            'changefreq': changefreq,
#            'priority': priority,
            'sitemap_name': sitemap_name,
        }

        if verbose:
            print(row)

        df = df.append(row, ignore_index=True)
    return df


## Load list of sitemaps
Either from csv or xml index sitemap file

In [None]:
# Load a list of sitemaps
# either from csv or sitemap index
#def get_sitemaplist(list_file):
#    sitemapList = []
#    df = pd.read_csv(list_file)
#    sitemapList = df.values.tolist()
#    return sitemapList


def get_sitemaplist(listToGet, list_type=None):
    if list_type == 'csv':
        sitemapList = []
        df = pd.read_csv(listToGet)
        sitemapList = df.values.tolist()
        return sitemapList
    if list_type == 'xml':
        url = listToGet
        filename = file_info(listToGet,name=True)
        filepath = file_info(listToGet,path=True)
        filelocation = file_info(listToGet,location=True)
        
        download_sitemap(listToGet,filelocation)

        sitemapList = get_child_sitemaps(get_xml(filelocation))
        return sitemapList
    

## Get child sitemaps from index sitemap file

In [None]:
# Get child sitemaps from sitemap index file
# takes xml
# returns list

def get_child_sitemaps(xml):
    sitemaps = xml.find_all("sitemap")

    output = []

    for sitemap in sitemaps:
        output.append(sitemap.findNext("loc").text)
    return output


## File Functions

### Get filename, filepath, filelocation, domain from the URL of a Sitemap

In [16]:
# get filename, filepath or filelocation of sitemap url

# adding subdir support
def file_info(url,name=None, path=None, location=None, domain=None):
    dateToday = str(date.today())
    domain = urlparse(url).netloc
    filename = url[url.rfind("/")+1:len(url)] # extract just the filename from the URL    
    #filepath = os.getcwd()+"\\"+domain+"-"+dateToday # get the current directory    
    filepath = "C:\\Users\\ben\\Downloads\\sitesmapgather\\"+domain+"-"+dateToday
    filelocation = filepath+"\\"+filename # construct a complete URI for the file, including a subdir named after the domain
    
    if name:
        return filename
    if path:
        return filepath
    if location:
        return filelocation
    if domain:
        return domain

   
    

### Create a directory
but test if it exists first

In [None]:
# create a subdirectory named after the domain
# we also check for whether it exists
# maybe this check should be in the main loop?

def create_dir(subdirToCreate):
#    subdirToCreateLocation = file_info(subdirToCreate,domain=True) # set subdir variable
    if not os.path.isdir(subdirToCreate): # does the subdir not exist?
        os.mkdir(subdirToCreate) # then create it
        #print('created directory: '+subdirToCreate)
    else: # if it does exist
        print('directory exists: '+subdirToCreate) # do nothing

### Combine csv files in a folder

In [None]:
# taken from
#  https://softhints.com/how-to-merge-multiple-csv-files-with-python/

def combine_csv(pathToCombine):

    all_files = glob.glob(os.path.join(pathToCombine, "*.csv"))
   
    all_df = []
    for f in all_files:
        df = pd.read_csv(f, sep=',')
        df['file'] = f.split('/')[-1]
        all_df.append(df)
        
    merged_df = pd.concat(all_df, ignore_index=True, sort=True)
    merged_df.to_csv(pathToCombine+"\\"+"merged.csv")

## Workhorse

In [None]:
### main workhorse
### 1. takes url
### 2. downloads file
### 3. gets the xml from the file
### 4. creates dataframe
### 5. saves dataframe as csv

def sitemap_to_csv(url,debug=False):

    
    filename = file_info(url,name=True)
    filepath = file_info(url,path=True)
    filelocation = file_info(url,location=True)
    

    if debug:
        time_mainstart = datetime.now()
        print('sitemap                : ' +filename)
        print('start                  : ' +str(time_mainstart))

    # call download_sitemap method with url and filename
    if debug:
        time_start = datetime.now()
        print('    starting download      : '+str(time_start))
    download_sitemap(url,filelocation) 
    if debug:
        time_end = datetime.now()
        print('    end download           : '+str(time_end-time_start))
        time_start = datetime.now()
        print('    start get xml          : '+str(time_start))    
    # read the xml out of the local file
    xml = get_xml(filelocation)
    if debug:
        time_end = datetime.now()
        print('    end get xml            : '+str(time_end-time_start))
        time_start = datetime.now()
        print('    start create dataframe : '+str(time_start))    
    thing = sitemap_to_dataframe(xml,name=filename)
    if debug:
        time_end = datetime.now()
        print('    end create dataframe   : '+str(time_end-time_start))
        time_start = datetime.now()
        print('    start write csv        : '+str(time_start))    
    thing.to_csv(filelocation+'.csv',index=False)
    if debug:
        time_end = datetime.now()
        print('    end write csv files          : '+str(time_end-time_start))    
        time_mainend = datetime.now()
        print('end                        : '+str(time_mainend-time_mainstart))

    

# Here begins the testing

## Testing main methods

### Get a single sitemap

In [None]:
# test
# trying the main method - get a url and write csv
url = 'https://www.elon.se/sitemap.xml' 
sitemap_to_csv(url,debug=True)

### Combine csv files in a folder
"in a folder" is a lie. It looks in a subfolder named after the domain of the sitemap URL

In [None]:
test_combine = file_info('https://www.elkjop.no/service-sitemap-site-elkjop-no-no-sitemap_index.xml',path=True)
print(test_combine)
combine_csv(test_combine)

### Main loop for a list of sitemap index files
Provide a list of index sitemap files, and we loop through them.
Useful for multinationals with several TLD's, or just getting a heap of different sites while you look out the window*

*) Do that. Look out the window! Your eyes need the excercise, your circadian rythm will sync to the daylight, your sense of presence in the physical world wil solidify.

In [None]:
# Loop through a list of index files

indexList = ['https://www.elkjop.no/sitemaps/OCNOELK.pdp.index.sitemap.xml',
             'https://www.gigantti.fi/sitemaps/OCFIGIG.pdp.index.sitemap.xml',
             'https://www.elgiganten.dk/sitemaps/OCDKELG.pdp.index.sitemap.xml',
             'https://www.elgiganten.se/sitemaps/OCSEELG.pdp.index.sitemap.xml']
#indexList = ['https://www.elkjop.no/service-sitemap-site-elkjop-no-no-sitemap_index.xml',
#             'https://www.gigantti.fi/service-sitemap-site-gigantti-fi-fi-sitemap_index.xml',
#             'https://www.elgiganten.dk/service-sitemap-site-elgiganten-dk-da-sitemap_index.xml',
#             'https://www.elgiganten.se/service-sitemap-site-elgiganten-se-sv-sitemap_index.xml']

#print(indexList)


for y in indexList:
    pathToCombine = file_info(y,path=True)
    sitemapList = get_sitemaplist(y,'xml')
    for x in sitemapList:
        currentURL = str(x)
        sitemap_to_csv(currentURL,debug=True)
    
    print('path to combine: '+pathToCombine)
    combine_csv(pathToCombine) # when this inner loop is done, combine the csv files



## Main loop for a list of sitemaps
Either from local csv or online xml

In [17]:
# test
# doing a loop through a sitemap list csv

sitemapList = get_sitemaplist('https://www.tv2.no/sitemap/sitemap.xml','xml')
print(sitemapList)

for x in sitemapList:
    currentURL = str(x)
    sitemap_to_csv(currentURL,debug=True)
    

# url = 'https://www.gigantti.fi/service-sitemap-site-gigantti-fi-fi-sitemap_vc.xml' 
# sitemap_to_csv(url,debug=True)

['https://www.tv2.no/sitemap/nyheter-0.xml', 'https://www.tv2.no/sitemap/nyheter-1.xml', 'https://www.tv2.no/sitemap/nyheter-2.xml', 'https://www.tv2.no/sitemap/nyheter-3.xml', 'https://www.tv2.no/sitemap/nyheter-4.xml', 'https://www.tv2.no/sitemap/nyheter-5.xml', 'https://www.tv2.no/sitemap/nyheter-6.xml', 'https://www.tv2.no/sitemap/nyheter-7.xml', 'https://www.tv2.no/sitemap/nyheter-8.xml', 'https://www.tv2.no/sitemap/nyheter-9.xml', 'https://www.tv2.no/sitemap/nyheter-10.xml', 'https://www.tv2.no/sitemap/nyheter-11.xml', 'https://www.tv2.no/sitemap/nyheter-12.xml', 'https://www.tv2.no/sitemap/nyheter-13.xml', 'https://www.tv2.no/sitemap/nyheter-14.xml', 'https://www.tv2.no/sitemap/nyheter-15.xml', 'https://www.tv2.no/sitemap/nyheter-16.xml', 'https://www.tv2.no/sitemap/nyheter-17.xml', 'https://www.tv2.no/sitemap/nyheter-18.xml', 'https://www.tv2.no/sitemap/nyheter-19.xml', 'https://www.tv2.no/sitemap/nyheter-20.xml', 'https://www.tv2.no/sitemap/nyheter-21.xml', 'https://www.tv2.no

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 1112525: character maps to <undefined>

## Testing all the smaller cogs in the machine

In [None]:
# test
# check the get_sitemaplist method

#sitemap_to_csv('https://arstechnica.com/sitemap.xml',debug=True)
#get_sitemaplist('sitemap_list.csv','csv')
get_sitemaplist('https://arstechnica.com/sitemap.xml','xml')

In [None]:
# test
# test the get_child_sitemap method

url = 'https://www.gigantti.fi/service-sitemap-site-gigantti-fi-fi-sitemap_index.xml'

filename = file_info(url,name=True)
filepath = file_info(url,path=True)
filelocation = file_info(url,location=True)

download_sitemap(url,filelocation)

get_child_sitemaps(get_xml(filelocation))

In [None]:
# test
sitemap_to_csv('https://www.elkjop.no/service-sitemap-site-elkjop-no-no-sitemap_index.xml',debug=True)

In [None]:
testname = file_info('https://arstechnica.com/sitemap.xml',name=True)
testpath = file_info('https://arstechnica.com/sitemap.xml',path=True)
testloc = file_info('https://arstechnica.com/sitemap.xml',location=True)
testdom = file_info('https://arstechnica.com/sitemap.xml',domain=True)

print(testname)
print(type(testname))
print(testpath)
print(type(testpath))
print(testloc)
print(type(testloc))
print(testdom)
print(type(testdom))





In [None]:
print(date.today())