Get all datasets, together with the corresponding attribute values, that contain at least one link without _https_ that also works once its protocol is replaced with _https_.

In [67]:
import requests     # 2.18.4
import json         # 2.0.9
import pandas as pd # 0.23.0
import numpy as np
import ast
from urlextract import URLExtract as extract
# Single URLExtract instance; the helper functions in this file call
# extractor.find_urls(...) on it to locate URLs inside text.
extractor = extract()

In [68]:
# BFE Abnahme: CKAN package_search call, filtered to the organization
# "bundesamt-fur-energie-bfe" and requesting 500 rows.
packages = 'https://ckan.ogdch-abnahme.clients.liip.ch/api/3/action/package_search?fq=organization:(bundesamt-fur-energie-bfe)&rows=500'

# Make the HTTP request. The timeout keeps the run from hanging forever
# when the server accepts the connection but never answers.
response = requests.get(packages, timeout=60)

# Fail early with a clear HTTP error instead of a confusing JSON decoding
# error when the server answers with an error page.
response.raise_for_status()

# Use the json module to load CKAN's response into a dictionary
response_dict = json.loads(response.content)

# Check the contents of the response
assert response_dict['success'] is True  # make sure the API call succeeded

# Get a list of all publications and their information (each publication is a dictionary)
data = response_dict['result']['results'].copy()

# Placeholder for the top-level keys; nothing in the visible code fills or
# reads it, it is kept so that the module-level name stays available.
allKeys = []

# store information about each dataset in a frame (one row per publication)
df = pd.DataFrame(data)

In [69]:
# Load the two hand-maintained link lists (column 'Link' of each CSV):
# bigFiles   -> links that url_checker reports as working without requesting them
# notWorking -> links that url_checker reports as broken without requesting them
bigFiles, notWorking = (
    pd.read_csv(csv_name)['Link'].to_list()
    for csv_name in ('bigFiles.csv', 'notWorking.csv')
)

In [70]:
def url_checker(url, timeout=30):
    """Return True if *url* answers an HTTP GET with status code 200.

    URLs listed in the module-level ``bigFiles`` list are reported as working
    and URLs listed in ``notWorking`` as broken, both without any request.

    Args:
        url: The URL to test.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the URL is known-good or answers with status 200, False
        when it is known-bad, answers with another status, or cannot be
        reached at all.
    """
    # first check if one of the urls is downloading huge files
    # (already tested)
    if url in bigFiles:
        return True

    if url in notWorking:
        return False

    try:
        # stream=True defers downloading the body: only the status code is
        # needed, so large files are not pulled over the network. The
        # context manager releases the connection afterwards.
        with requests.get(url, timeout=timeout, stream=True) as get:
            return get.status_code == 200
    except requests.exceptions.RequestException as e:
        # An unreachable link is an expected outcome of the check, not a
        # reason to abort the whole scan: report it and treat it as broken.
        print(f"{url}: is Not reachable \nErr: {e}")
        return False

def testAdjustedURL(url):
    adjusted = url.replace("http:", "https:" )
    adjusted = adjusted.replace("www", "https://www" )
    adjusted = adjusted.replace("https://https://www", "https://www" )

    return [url_checker(adjusted), adjusted]

In [71]:
def fromDictionary(d):
    """Upgrade working non-https links inside the values of dictionary *d*.

    Every value that is exactly a ``str``, ``list`` or ``dict`` is scanned
    for URLs (nested containers are scanned through their ``str()`` form).
    Each URL that mentions "www" or "http" but not "https://" is passed to
    ``testAdjustedURL``; if the adjusted URL works, it replaces the original
    one in the value. The dictionary is modified in place.

    Args:
        d: Dictionary whose values are inspected and possibly rewritten.

    Returns:
        ``[d]`` if at least one link was upgraded, otherwise ``[]``.
    """
    changed = False

    for key, original in list(d.items()):

        if type(original) not in (dict, str, list):
            continue

        isString = type(original) == str
        val = original if isString else str(original)

        valueChanged = False
        seen = set()

        for url in extractor.find_urls(val):

            # a URL that occurs several times only needs one network check
            if url in seen:
                continue
            seen.add(url)

            if not any(prefix in url for prefix in ('www', 'http')):
                continue
            if 'https://' in url:
                continue

            # first check if the adjusted url works, then replace it
            working, adjusted_url = testAdjustedURL(url)

            if working:
                valueChanged = True
                # Accumulate on the running text so that several upgraded
                # links within the same value are all kept.
                val = val.replace(url, adjusted_url)

        if valueChanged:
            changed = True
            # containers were scanned as text; turn them back into objects
            d[key] = val if isString else ast.literal_eval(val)

    if changed:
        return [d]
    else:
        return []


In [72]:
def fromList(l):
    """Upgrade working non-https links inside the elements of list *l*.

    Every element that is exactly a ``str``, ``list`` or ``dict`` is scanned
    for URLs (nested containers are scanned through their ``str()`` form).
    Each URL that mentions "www" or "http" but not "https://" is passed to
    ``testAdjustedURL``; if the adjusted URL works, it replaces the original
    one in the element. The list is modified in place.

    Args:
        l: List whose elements are inspected and possibly rewritten.

    Returns:
        ``[l]`` if at least one link was upgraded, otherwise ``[]``.
    """
    changed = False

    for i, original in enumerate(l):

        if type(original) not in (dict, str, list):
            continue

        isString = type(original) == str
        val = original if isString else str(original)

        elementChanged = False
        seen = set()

        for url in extractor.find_urls(val):

            # a URL that occurs several times only needs one network check
            if url in seen:
                continue
            seen.add(url)

            if not any(prefix in url for prefix in ('www', 'http')):
                continue
            if 'https://' in url:
                continue

            # first check if the adjusted url works, then replace it
            working, adjusted_url = testAdjustedURL(url)

            if working:
                elementChanged = True
                # Accumulate on the running text so that several upgraded
                # links within the same element are all kept.
                val = val.replace(url, adjusted_url)

        if elementChanged:
            changed = True
            # containers were scanned as text; turn them back into objects
            l[i] = val if isString else ast.literal_eval(val)

    if changed:
        return [l]
    else:
        return []


In [73]:
def fromString(string):
    """Upgrade working non-https links inside *string*.

    URLs found by the shared extractor that mention "www" or "http" but not
    "https://" are handed to ``testAdjustedURL``; every URL whose adjusted
    form works is replaced by that form in the text.

    Returns:
        A one-element list with the rewritten text if at least one link was
        upgraded, otherwise an empty list.
    """
    rewritten = string
    any_fixed = False

    for candidate in extractor.find_urls(string):

        mentions_web = 'www' in candidate or 'http' in candidate
        if not mentions_web or 'https://' in candidate:
            # not a web link, or already secure
            continue

        # change to https:// and see whether that variant is reachable
        ok, secure = testAdjustedURL(candidate)

        if ok:
            any_fixed = True
            rewritten = rewritten.replace(candidate, secure)

    return [rewritten] if any_fixed else []

In [74]:
def getInfo(elem):
    """Dispatch *elem* to the link-upgrading helper matching its exact type.

    Strings go to ``fromString``, lists to ``fromList`` and dictionaries to
    ``fromDictionary``; the helper's result is returned unchanged. Any other
    type yields an empty list.
    """
    kind = type(elem)

    if kind == str:
        return fromString(elem)

    if kind == list:
        return fromList(elem)

    if kind == dict:
        return fromDictionary(elem)

    # nothing to scan in numbers, None, booleans, ...
    return []

In [75]:
# Walk over every cell of the dataset frame and remember those cells in
# which getInfo upgraded at least one link. The four lists grow in lockstep:
# one entry per affected (dataset, attribute) pair.
cols = df.columns

packageIDs, packageNames, attributeNames, attributeValues = [], [], [], []

for index, row in df.iterrows():
    for col in cols:

        value = getInfo(row[col])

        # getInfo returns an empty list when nothing was changed
        if value:
            packageIDs.append(row['id'])
            packageNames.append(row['title']['de'])
            attributeNames.append(col)
            attributeValues.extend(value)

In [84]:
# Assemble the collected results column by column and export them as CSV
# (without the index column).
frame = pd.DataFrame({
    'package_id': packageIDs,
    'package_name': packageNames,
    'attribute_name': attributeNames,
    'attribute_value': attributeValues,
})
frame.to_csv('linkAndAttributeValues.csv', index=False)