# **Data Extraction and Processing**

In [None]:
# Fetch the NLTK data packages that the text-preprocessing code in this
# notebook relies on. Each call is a no-op when the package is already
# present (see the "already up-to-date" messages in the cell output).
import nltk
nltk.download('punkt')      # tokenizer data used by word_tokenize
nltk.download('stopwords')  # word lists behind stopwords.words('english')
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed NLTK version.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

# **Vectorisation Preprocessing and Web Scraping Function**

In [None]:
def preprocess_text(text):
    """Turn raw HTML (or plain text) into a list of cleaned, lemmatized tokens.

    Processing order: strip HTML markup, lowercase, delete every character
    that is not an ASCII letter, a digit or whitespace, tokenize, drop
    English stopwords, lemmatize, and finally drop tokens made only of digits.

    Args:
        text: Raw page content; HTML markup is allowed.

    Returns:
        list: The surviving tokens in document order (duplicates are kept).
    """
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Convert to lowercase
    text = text.lower()
    # Remove non-alphanumeric characters.
    # NOTE: this also deletes apostrophes, so contractions such as "you've"
    # become "youve" and are no longer matched by the stopword list.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Tokenize the text into words
    words = word_tokenize(text)

    # Build the stopword set once. Calling stopwords.words('english') inside
    # the comprehension re-created the list for every token and then searched
    # it linearly; a set gives O(1) membership tests.
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Discard purely numeric tokens (mixed tokens such as "ls1" are kept).
    return [word for word in words if not word.isdigit()]


# Energy-certificate pages to scrape. Every page lives under the same
# endpoint, so the list is assembled from the shared prefix plus the
# certificate reference numbers.
_CERTIFICATE_BASE_URL = "https://find-energy-certificate.service.gov.uk/energy-certificate/"

_CERTIFICATE_IDS = (
    "0010-7954-0466-1537-4054",
    "0010-7954-0466-1533-4050",
    "9844-4009-0038-0200-6701",
    "9795-4086-0465-0500-3001",
    "0790-0348-5599-1798-9002",
    "0970-3943-0401-3481-2024",
    "0980-2922-0408-1751-5070",
    "9217-4058-0280-0502-7291",
    "0818-0847-8439-0405-3002",
    "0710-1943-0436-8044-8100",
    "0013-0846-8439-0471-3106",
    "0017-0846-8439-0471-3002",
    "9990-1943-0434-8441-8104",
    "9180-4049-0443-1892-4395",
    "0560-0446-7679-9692-4092",
    "9186-4032-0942-0600-4895",
    "4767-9411-5109-9259-4882",
    "0596-9296-1540-6800-9803",
    "9987-4059-0657-0990-4621",
    "9944-4028-0360-0900-6591",
    "0770-0645-1299-5922-9092",
    "9497-4092-0938-0100-3705",
    "9455-4000-0562-0090-8591",
    "0560-0146-4499-5529-1092",
    "0997-0917-3240-6600-2803",
    "0390-2909-8340-9690-2613",
    "9645-4055-0439-0101-2125",
    "0390-0999-8341-3690-2663",
    "9290-4039-0668-3997-9305",
    "7551-1681-3933-1295-2216",
    "9090-2963-0486-9938-9104",
    "0390-0999-8343-9690-2653",
    "9090-2963-0486-9936-9304",
    "0921-0946-9689-0329-3492",
    "0390-2999-8344-9690-2623",
    "9290-2963-0486-9933-9404",
    "9200-2963-0486-9831-9204",
    "0322-0946-9689-0399-3292",
    "0390-9999-8342-3690-2633",
    "0924-0946-9689-0370-3296",
    "9090-2963-0486-9739-9204",
)

URLs = [_CERTIFICATE_BASE_URL + certificate_id for certificate_id in _CERTIFICATE_IDS]

# Download every certificate page and collect its cleaned token list.
# cleanedTexts holds one list of tokens per successfully fetched page; pages
# that fail to download are reported and skipped, so it may end up shorter
# than URLs.
cleanedTexts = []

# A single session reuses the HTTP connection across requests.
with requests.Session() as session:
    for url in URLs:
        try:
            # Without a timeout a stalled server would block the run forever.
            response = session.get(url, timeout=30)
            # Reject 4xx/5xx answers instead of tokenizing an error page.
            response.raise_for_status()
        except requests.RequestException as exc:
            print("Skipping {}: {}".format(url, exc))
            continue

        # Reduce the page to its visible text.
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()

        # Preprocess the text
        cleaned_text = preprocess_text(text)
        cleanedTexts.append(cleaned_text)

        print(cleaned_text)










['energy', 'performance', 'certificate', 'epc', 'find', 'energy', 'certificate', 'govuk', 'cooky', 'find', 'energy', 'certificate', 'use', 'essential', 'cooky', 'make', 'service', 'work', 'wed', 'also', 'like', 'use', 'analytics', 'cooky', 'understand', 'use', 'service', 'make', 'improvement', 'accept', 'analytics', 'cooky', 'reject', 'analytics', 'cooky', 'view', 'cooky', 'youve', 'accepted', 'analytics', 'cooky', 'change', 'cookie', 'setting', 'time', 'youve', 'rejected', 'analytics', 'cooky', 'change', 'cookie', 'setting', 'time', 'hide', 'cookie', 'message', 'skip', 'main', 'content', 'govuk', 'find', 'energy', 'certificate', 'energy', 'performance', 'certificate', 'epc', 'recommendation', 'report', 'report', 'content', 'energy', 'rating', 'epc', 'recommendation', 'property', 'report', 'detail', 'assessor', 'detail', 'report', 'property', 'share', 'report', 'email', 'copy', 'link', 'clipboard', 'print', 'seventh', 'floor', 'office', 'greek', 'street', 'leeds', 'ls1', '5ru', 'report

# **Function for Extraction of Unique Word Vectors**

In [None]:
# Build the vocabulary (every distinct token across all scraped pages) and
# export it to an Excel sheet.

# NOTE(review): the right-hand side of this assignment was missing from the
# source, which is a syntax error. The loop below called .split() on each
# entry, so each entry must be a string; here it is rebuilt as one
# space-joined string per document from cleanedTexts — confirm this matches
# the original intent.
dumCleanedTexts = [" ".join(tokens) for tokens in cleanedTexts]

# One list of tokens per document.
dumCleanedTexts_split = [document.split() for document in dumCleanedTexts]

#dumX = [v for v in d for d in dumCleanedTexts_split]
# Flatten all documents into a single token list.
words = [word for document in dumCleanedTexts_split for word in document]
words_ = list(set(words))

# All unique words. Sorted because set iteration order varies between runs,
# which would otherwise reshuffle the exported sheet every time.
WordVectors = sorted(set(words))

# Print the output
print("Cleaned Texts:")
for text in cleanedTexts:
    print(text)

print("Word Vectors:")
print(WordVectors)


df = pd.DataFrame(WordVectors)

# Specify the path and file name for the Excel file
output_file = "generatedoutput.xlsx"

# Copy the DataFrame to an Excel file
df.to_excel(output_file)




Cleaned Texts:
['energy', 'performance', 'certificate', 'epc', 'find', 'energy', 'certificate', 'govuk', 'cooky', 'find', 'energy', 'certificate', 'use', 'essential', 'cooky', 'make', 'service', 'work', 'wed', 'also', 'like', 'use', 'analytics', 'cooky', 'understand', 'use', 'service', 'make', 'improvement', 'accept', 'analytics', 'cooky', 'reject', 'analytics', 'cooky', 'view', 'cooky', 'youve', 'accepted', 'analytics', 'cooky', 'change', 'cookie', 'setting', 'time', 'youve', 'rejected', 'analytics', 'cooky', 'change', 'cookie', 'setting', 'time', 'hide', 'cookie', 'message', 'skip', 'main', 'content', 'govuk', 'find', 'energy', 'certificate', 'energy', 'performance', 'certificate', 'epc', 'recommendation', 'report', 'report', 'content', 'energy', 'rating', 'epc', 'recommendation', 'property', 'report', 'detail', 'assessor', 'detail', 'report', 'property', 'share', 'report', 'email', 'copy', 'link', 'clipboard', 'print', 'su5355', 'vicar', 'lane', 'leeds', 'ls1', '6ba', 'report', 'num

In [None]:
#dictionary = dict()
#for word in WordVectors:
    #dictionary[word] = 0

#WordVectorsDF_ =[]
#for document in dumCleanedTexts_split[1]:
    #for word in document:
        #if word in dictionary.keys():
            #dictionary[word] = 1

# **Function for Binary Count Extraction Test**

In [None]:
from collections import Counter
import pandas as pd

# WordVec: flat list of token strings, duplicates included. It is used below
# only as the source of dictionary keys (one column per distinct token).
# NOTE(review): the content looks like the preprocess_text output for a single
# certificate page pasted in as a fixture — confirm.
WordVec = ['energy', 'performance', 'certificate', 'epc', 'find', 'energy', 'certificate', 'govuk', 'cooky', 'find', 'energy', 'certificate', 'use', 'essential', 'cooky', 'make', 'service', 'work', 'wed', 'also', 'like', 'use', 'analytics', 'cooky', 'understand', 'use', 'service', 'make', 'improvement', 'accept', 'analytics', 'cooky', 'reject', 'analytics', 'cooky', 'view', 'cooky', 'youve', 'accepted', 'analytics', 'cooky', 'change', 'cookie', 'setting', 'time', 'youve', 'rejected', 'analytics', 'cooky', 'change', 'cookie', 'setting', 'time', 'hide', 'cookie', 'message', 'skip', 'main', 'content', 'govuk', 'find', 'energy', 'certificate', 'energy', 'performance', 'certificate', 'epc', 'recommendation', 'report', 'report', 'content', 'energy', 'rating', 'epc', 'recommendation', 'property', 'report', 'detail', 'assessor', 'detail', 'report', 'property', 'share', 'report', 'email', 'copy', 'link', 'clipboard', 'print', 'su5355', 'vicar', 'lane', 'leeds', 'ls1', '6ba', 'report', 'number', '92431656815756147032', 'valid', '3', 'february', '2032', 'energy', 'rating', 'epc', 'property', 'current', 'energy', 'rating', 'information', 'property', 'energy', 'performance', 'see', 'epc', 'property', 'recommendation', 'make', 'change', 'improve', 'property', 'energy', 'efficiency', 'recommended', 'improvement', 'grouped', 'estimated', 'time', 'would', 'take', 'change', 'pay', 'assessor', 'may', 'also', 'make', 'additional', 'recommendation', 'recommendation', 'marked', 'low', 'medium', 'high', 'show', 'potential', 'impact', 'change', 'reducing', 'property', 'carbon', 'emission', 'change', 'pay', 'within', '3', 'year', 'recommendation', 'potential', 'impact', 'improve', 'insulation', 'hws', 'storage', 'low', 'add', 'time', 'control', 'heating', 'system', 'medium', 'space', 'solar', 'gain', 'limit', 'defined', 'ncm', 'exceeded', 'might', 'cause', 'overheating', 'consider', 'solar', 'control', 'measure', 'application', 'reflective', 'coating', 'shading', 'device', 'window', 
'medium', 'add', 'optimum', 'startstop', 'heating', 'system', 'medium', 'change', 'pay', 'within', '3', '7', 'year', 'recommendation', 'potential', 'impact', 'window', 'high', 'uvalues', 'consider', 'installing', 'secondary', 'glazing', 'medium', 'add', 'local', 'temperature', 'control', 'heating', 'system', 'medium', 'add', 'weather', 'compensation', 'control', 'heating', 'system', 'medium', 'loft', 'space', 'poorly', 'insulated', 'installimprove', 'insulation', 'medium', 'add', 'local', 'time', 'control', 'heating', 'system', 'medium', 'solid', 'wall', 'poorly', 'insulated', 'introduce', 'improve', 'internal', 'wall', 'insulation', 'medium', 'carry', 'pressure', 'test', 'identify', 'treat', 'identified', 'air', 'leakage', 'enter', 'result', 'epc', 'calculation', 'medium', 'change', 'pay', '7', 'year', 'recommendation', 'potential', 'impact', 'glazing', 'poorly', 'insulated', 'replaceimprove', 'glazing', 'andor', 'frame', 'medium', 'consider', 'installing', 'air', 'source', 'heat', 'pump', 'high', 'consider', 'installing', 'ground', 'source', 'heat', 'pump', 'high', 'consider', 'installing', 'building', 'mounted', 'wind', 'turbine', 'low', 'consider', 'installing', 'solar', 'water', 'heating', 'low', 'additional', 'recommendation', 'recommendation', 'potential', 'impact', 'consider', 'replacing', 't8', 'lamp', 'led', 'low', 'property', 'report', 'detail', 'report', 'issued', '4', 'february', '2022', 'total', 'useful', 'floor', 'area', '864', 'square', 'metre', 'building', 'environment', 'heating', 'natural', 'ventilation', 'calculation', 'tool', 'designbuilder', 'software', 'ltd', 'designbuilder', 'sbem', 'v618', 'sbem', 'v56b0', 'assessor', 'detail', 'assessor', 'name', 'ashton', 'kaziboni', 'telephone', '01327', '811166', 'email', 'ashtonkazibonisocoteccom', 'employer', 'name', 'socotec', 'ltd', 'employer', 'address', 'henge', 'barn', 'pury', 'hill', 'business', 'park', 'alderton', 'road', 'assessor', 'id', 'stro034300', 'assessor', 'declaration', 'assessor', 
'related', 'owner', 'property', 'accreditation', 'scheme', 'stroma', 'certification', 'ltd', 'report', 'property', 'aware', 'previous', 'report', 'property', 'listed', 'please', 'contact', 'u', 'dluhcdigitalserviceslevellingupgovuk', 'call', 'helpdesk', '020', '3829', '0748', 'monday', 'friday', '9am', '5pm', 'related', 'report', 'property', 'support', 'link', 'accessibility', 'statement', 'cooky', 'service', 'give', 'feedback', 'service', 'performance', 'content', 'available', 'open', 'government', 'licence', 'v30', 'except', 'otherwise', 'stated', 'crown', 'copyright']

# cleanedtext: small hand-written sample corpus, one token list per document.
cleanedtext = [
    ['energy', 'certificate', 'performance', 'energy', 'certificate'],
    ['epc', 'find', 'energy', 'certificate', 'govuk'],
    ['cooky', 'find', 'energy', 'certificate', 'use'],
    ['essential', 'cooky', 'make', 'service', 'work'],
    ['youve', 'accepted', 'analytics', 'cooky', 'change']
]

# Tally the tokens of each document once; a Counter reports 0 for absent words.
per_document_counts = [Counter(document) for document in cleanedtext]

# Map every word of WordVec to its occurrence count in each document
# (position i of the list belongs to document i). Words that occur in a
# document but not in WordVec are ignored; repeated words in WordVec collapse
# into a single key.
word_vec_dict = {
    word: [counts[word] for counts in per_document_counts]
    for word in WordVec
}

# One column per word, one row per document.
word_vec_df = pd.DataFrame(word_vec_dict)

# Show the resulting table.
print(word_vec_df)