In [None]:
import nltk
# Fetch the NLTK resources used later in the notebook: the tokenizer models,
# the stopword lists and the WordNet data used by the lemmatizer.
for resource_name in ('punkt', 'stopwords', 'wordnet'):
    download_ok = nltk.download(resource_name)

# Leave the status of the final download as the cell's displayed value.
download_ok

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize

In [None]:
# Energy performance certificate pages to scrape, one URL per certificate.
# Every entry must end with a comma: Python silently concatenates adjacent
# string literals, which would merge several URLs into one invalid address.
urls = [
    "https://find-energy-certificate.service.gov.uk/energy-certificate/9243-1656-8157-5614-7032",
    "https://find-energy-certificate.service.gov.uk/energy-certificate/0913-4892-7174-6575-8766",
    "https://find-energy-certificate.service.gov.uk/energy-certificate/5236-8361-7770-3484-3888",
    "https://find-energy-certificate.service.gov.uk/energy-certificate/5371-2019-9466-5525-7406",
    "https://find-energy-certificate.service.gov.uk/energy-certificate/6635-7895-8451-2459-1185",
    "https://find-energy-certificate.service.gov.uk/energy-certificate/0210-0540-3952-5807-7002",
    "https://find-energy-certificate.service.gov.uk/energy-certificate/7698-5227-9050-3703-7810",
    "https://find-energy-certificate.service.gov.uk/energy-certificate/2955-7547-6216-0595-4628",
    "https://find-energy-certificate.service.gov.uk/energy-certificate/9739-4077-0764-0001-9501",
]

def scrape_webpage(url, timeout=10):
    """Download a web page and return its text with the HTML markup removed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The text content of the page as produced by ``BeautifulSoup.get_text``.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status code.
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses so that the boilerplate of an error page
    # is never mistaken for certificate content further down the pipeline.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text()

# Define a function for cleaning and preprocessing the text data
def preprocess_text(text):
    """Normalise raw page text into a space-separated string of lemmatised words.

    The steps run in this order: strip HTML markup, lowercase, delete every
    character that is not a letter, digit or whitespace, tokenise, remove
    English stopwords, lemmatise, and finally discard tokens that consist
    solely of digits.

    Args:
        text: Raw text or HTML markup to clean.

    Returns:
        The surviving tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Convert to lowercase
    text = text.lower()

    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Tokenize the text into words
    words = word_tokenize(text)

    # Remove stopwords. The lookup set is built once up front; calling
    # stopwords.words() inside the comprehension would fetch the whole list
    # again for every single token.
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Drop purely numeric tokens. The result has to be stored back into
    # `words`, otherwise the join below would still contain the numbers.
    words = [word for word in words if not word.isdigit()]

    # Join the cleaned words back into a single string
    return " ".join(words)




# Example usage
# Assuming 'text' contains the raw text extracted from the web source:
#     cleaned_text = preprocess_text(text)
#     print(cleaned_text)







In [None]:
# Scrape the content from each web page and preprocess the text
cleaned_texts = []
for url in urls:
    content = scrape_webpage(url)
    cleaned_text = preprocess_text(content)
    cleaned_texts.append(cleaned_text)

# Tokenize each cleaned document and remove stopwords.
# The comprehension must walk the list `cleaned_texts` (one string per page).
# Iterating over the single string `cleaned_text` would yield one "document"
# per character of the last page, each holding that page's full token list.
tokenized_texts = [word_tokenize(text) for text in cleaned_texts]
stop_words = set(stopwords.words("english"))
filtered_texts = [[word for word in tokens if word not in stop_words] for tokens in tokenized_texts]

# Join tokenized words back into strings to use with TfidfVectorizer
document_texts = [" ".join(tokens) for tokens in filtered_texts]

# Calculate TF-IDF scores using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(document_texts)

# Create a DataFrame with the vectorized words and their corresponding TF-IDF scores
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Sanity check: exactly one TF-IDF row per scraped page
assert len(tfidf_df) == len(urls), "expected one TF-IDF row per URL"

# Row i of tfidf_df corresponds to urls[i]. Uncomment the two lines below to
# label the rows by URL instead of by position.
#tfidf_df["urls"] = urls
#tfidf_df.set_index("urls", inplace=True)

# Display the DataFrame
print(tfidf_df)







      accept  accepted  accessibility  address     also  analytics  available  \
0    0.06415   0.06415        0.06415  0.19245  0.06415    0.32075    0.06415   
1    0.06415   0.06415        0.06415  0.19245  0.06415    0.32075    0.06415   
2    0.06415   0.06415        0.06415  0.19245  0.06415    0.32075    0.06415   
3    0.06415   0.06415        0.06415  0.19245  0.06415    0.32075    0.06415   
4    0.06415   0.06415        0.06415  0.19245  0.06415    0.32075    0.06415   
..       ...       ...            ...      ...      ...        ...        ...   
655  0.06415   0.06415        0.06415  0.19245  0.06415    0.32075    0.06415   
656  0.06415   0.06415        0.06415  0.19245  0.06415    0.32075    0.06415   
657  0.06415   0.06415        0.06415  0.19245  0.06415    0.32075    0.06415   
658  0.06415   0.06415        0.06415  0.19245  0.06415    0.32075    0.06415   
659  0.06415   0.06415        0.06415  0.19245  0.06415    0.32075    0.06415   

     certificate  change   

In [None]:
from collections import Counter

# WordVec: list of word tokens (plain strings, duplicates included) used as the vocabulary - not numeric word vectors
WordVec = ['energy', 'performance', 'certificate', 'epc', 'find', 'energy', 'certificate', 'govuk', 'cooky', 'find', 'energy', 'certificate', 'use', 'essential', 'cooky', 'make', 'service', 'work', 'wed', 'also', 'like', 'use', 'analytics', 'cooky', 'understand', 'use', 'service', 'make', 'improvement', 'accept', 'analytics', 'cooky', 'reject', 'analytics', 'cooky', 'view', 'cooky', 'youve', 'accepted', 'analytics', 'cooky', 'change', 'cookie', 'setting', 'time', 'youve', 'rejected', 'analytics', 'cooky', 'change', 'cookie', 'setting', 'time', 'hide', 'cookie', 'message', 'skip', 'main', 'content', 'govuk', 'find', 'energy', 'certificate', 'energy', 'performance', 'certificate', 'epc', 'recommendation', 'report', 'report', 'content', 'energy', 'rating', 'epc', 'recommendation', 'property', 'report', 'detail', 'assessor', 'detail', 'report', 'property', 'share', 'report', 'email', 'copy', 'link', 'clipboard', 'print', 'su5355', 'vicar', 'lane', 'leeds', 'ls1', '6ba', 'report', 'number', '92431656815756147032', 'valid', '3', 'february', '2032', 'energy', 'rating', 'epc', 'property', 'current', 'energy', 'rating', 'information', 'property', 'energy', 'performance', 'see', 'epc', 'property', 'recommendation', 'make', 'change', 'improve', 'property', 'energy', 'efficiency', 'recommended', 'improvement', 'grouped', 'estimated', 'time', 'would', 'take', 'change', 'pay', 'assessor', 'may', 'also', 'make', 'additional', 'recommendation', 'recommendation', 'marked', 'low', 'medium', 'high', 'show', 'potential', 'impact', 'change', 'reducing', 'property', 'carbon', 'emission', 'change', 'pay', 'within', '3', 'year', 'recommendation', 'potential', 'impact', 'improve', 'insulation', 'hws', 'storage', 'low', 'add', 'time', 'control', 'heating', 'system', 'medium', 'space', 'solar', 'gain', 'limit', 'defined', 'ncm', 'exceeded', 'might', 'cause', 'overheating', 'consider', 'solar', 'control', 'measure', 'application', 'reflective', 'coating', 'shading', 'device', 'window', 
'medium', 'add', 'optimum', 'startstop', 'heating', 'system', 'medium', 'change', 'pay', 'within', '3', '7', 'year', 'recommendation', 'potential', 'impact', 'window', 'high', 'uvalues', 'consider', 'installing', 'secondary', 'glazing', 'medium', 'add', 'local', 'temperature', 'control', 'heating', 'system', 'medium', 'add', 'weather', 'compensation', 'control', 'heating', 'system', 'medium', 'loft', 'space', 'poorly', 'insulated', 'installimprove', 'insulation', 'medium', 'add', 'local', 'time', 'control', 'heating', 'system', 'medium', 'solid', 'wall', 'poorly', 'insulated', 'introduce', 'improve', 'internal', 'wall', 'insulation', 'medium', 'carry', 'pressure', 'test', 'identify', 'treat', 'identified', 'air', 'leakage', 'enter', 'result', 'epc', 'calculation', 'medium', 'change', 'pay', '7', 'year', 'recommendation', 'potential', 'impact', 'glazing', 'poorly', 'insulated', 'replaceimprove', 'glazing', 'andor', 'frame', 'medium', 'consider', 'installing', 'air', 'source', 'heat', 'pump', 'high', 'consider', 'installing', 'ground', 'source', 'heat', 'pump', 'high', 'consider', 'installing', 'building', 'mounted', 'wind', 'turbine', 'low', 'consider', 'installing', 'solar', 'water', 'heating', 'low', 'additional', 'recommendation', 'recommendation', 'potential', 'impact', 'consider', 'replacing', 't8', 'lamp', 'led', 'low', 'property', 'report', 'detail', 'report', 'issued', '4', 'february', '2022', 'total', 'useful', 'floor', 'area', '864', 'square', 'metre', 'building', 'environment', 'heating', 'natural', 'ventilation', 'calculation', 'tool', 'designbuilder', 'software', 'ltd', 'designbuilder', 'sbem', 'v618', 'sbem', 'v56b0', 'assessor', 'detail', 'assessor', 'name', 'ashton', 'kaziboni', 'telephone', '01327', '811166', 'email', 'ashtonkazibonisocoteccom', 'employer', 'name', 'socotec', 'ltd', 'employer', 'address', 'henge', 'barn', 'pury', 'hill', 'business', 'park', 'alderton', 'road', 'assessor', 'id', 'stro034300', 'assessor', 'declaration', 'assessor', 
'related', 'owner', 'property', 'accreditation', 'scheme', 'stroma', 'certification', 'ltd', 'report', 'property', 'aware', 'previous', 'report', 'property', 'listed', 'please', 'contact', 'u', 'dluhcdigitalserviceslevellingupgovuk', 'call', 'helpdesk', '020', '3829', '0748', 'monday', 'friday', '9am', '5pm', 'related', 'report', 'property', 'support', 'link', 'accessibility', 'statement', 'cooky', 'service', 'give', 'feedback', 'service', 'performance', 'content', 'available', 'open', 'government', 'licence', 'v30', 'except', 'otherwise', 'stated', 'crown', 'copyright']

# cleanedtext: list of documents, each document being a list of word tokens
cleanedtext = [
    ['energy', 'certificate', 'performance'],
    ['epc', 'find', 'energy', 'certificate', 'govuk'],
    ['cooky', 'find', 'energy', 'certificate', 'use'],
    ['essential', 'cooky', 'make', 'service', 'work'],
    ['youve', 'accepted', 'analytics', 'cooky', 'change'],
]

# Tally every token across all documents in a single pass
token_totals = Counter(token for tokens in cleanedtext for token in tokens)

# Map each WordVec entry to its overall total; a Counter yields 0 for tokens
# it has never seen, so words absent from cleanedtext end up with 0
word_vec_dict = {vocab_word: token_totals[vocab_word] for vocab_word in WordVec}

# Show the resulting word -> count mapping
print(word_vec_dict)


{'energy': 3, 'performance': 1, 'certificate': 3, 'epc': 1, 'find': 2, 'govuk': 1, 'cooky': 3, 'use': 1, 'essential': 1, 'make': 1, 'service': 1, 'work': 1, 'wed': 0, 'also': 0, 'like': 0, 'analytics': 1, 'understand': 0, 'improvement': 0, 'accept': 0, 'reject': 0, 'view': 0, 'youve': 1, 'accepted': 1, 'change': 1, 'cookie': 0, 'setting': 0, 'time': 0, 'rejected': 0, 'hide': 0, 'message': 0, 'skip': 0, 'main': 0, 'content': 0, 'recommendation': 0, 'report': 0, 'rating': 0, 'property': 0, 'detail': 0, 'assessor': 0, 'share': 0, 'email': 0, 'copy': 0, 'link': 0, 'clipboard': 0, 'print': 0, 'su5355': 0, 'vicar': 0, 'lane': 0, 'leeds': 0, 'ls1': 0, '6ba': 0, 'number': 0, '92431656815756147032': 0, 'valid': 0, '3': 0, 'february': 0, '2032': 0, 'current': 0, 'information': 0, 'see': 0, 'improve': 0, 'efficiency': 0, 'recommended': 0, 'grouped': 0, 'estimated': 0, 'would': 0, 'take': 0, 'pay': 0, 'may': 0, 'additional': 0, 'marked': 0, 'low': 0, 'medium': 0, 'high': 0, 'show': 0, 'potential':

In [None]:
from collections import Counter

# WordVec: list of word tokens (plain strings, duplicates included) used as the vocabulary - not numeric word vectors
WordVec = ['energy', 'performance', 'certificate', 'epc', 'find', 'energy', 'certificate', 'govuk', 'cooky', 'find', 'energy', 'certificate', 'use', 'essential', 'cooky', 'make', 'service', 'work', 'wed', 'also', 'like', 'use', 'analytics', 'cooky', 'understand', 'use', 'service', 'make', 'improvement', 'accept', 'analytics', 'cooky', 'reject', 'analytics', 'cooky', 'view', 'cooky', 'youve', 'accepted', 'analytics', 'cooky', 'change', 'cookie', 'setting', 'time', 'youve', 'rejected', 'analytics', 'cooky', 'change', 'cookie', 'setting', 'time', 'hide', 'cookie', 'message', 'skip', 'main', 'content', 'govuk', 'find', 'energy', 'certificate', 'energy', 'performance', 'certificate', 'epc', 'recommendation', 'report', 'report', 'content', 'energy', 'rating', 'epc', 'recommendation', 'property', 'report', 'detail', 'assessor', 'detail', 'report', 'property', 'share', 'report', 'email', 'copy', 'link', 'clipboard', 'print', 'su5355', 'vicar', 'lane', 'leeds', 'ls1', '6ba', 'report', 'number', '92431656815756147032', 'valid', '3', 'february', '2032', 'energy', 'rating', 'epc', 'property', 'current', 'energy', 'rating', 'information', 'property', 'energy', 'performance', 'see', 'epc', 'property', 'recommendation', 'make', 'change', 'improve', 'property', 'energy', 'efficiency', 'recommended', 'improvement', 'grouped', 'estimated', 'time', 'would', 'take', 'change', 'pay', 'assessor', 'may', 'also', 'make', 'additional', 'recommendation', 'recommendation', 'marked', 'low', 'medium', 'high', 'show', 'potential', 'impact', 'change', 'reducing', 'property', 'carbon', 'emission', 'change', 'pay', 'within', '3', 'year', 'recommendation', 'potential', 'impact', 'improve', 'insulation', 'hws', 'storage', 'low', 'add', 'time', 'control', 'heating', 'system', 'medium', 'space', 'solar', 'gain', 'limit', 'defined', 'ncm', 'exceeded', 'might', 'cause', 'overheating', 'consider', 'solar', 'control', 'measure', 'application', 'reflective', 'coating', 'shading', 'device', 'window', 
'medium', 'add', 'optimum', 'startstop', 'heating', 'system', 'medium', 'change', 'pay', 'within', '3', '7', 'year', 'recommendation', 'potential', 'impact', 'window', 'high', 'uvalues', 'consider', 'installing', 'secondary', 'glazing', 'medium', 'add', 'local', 'temperature', 'control', 'heating', 'system', 'medium', 'add', 'weather', 'compensation', 'control', 'heating', 'system', 'medium', 'loft', 'space', 'poorly', 'insulated', 'installimprove', 'insulation', 'medium', 'add', 'local', 'time', 'control', 'heating', 'system', 'medium', 'solid', 'wall', 'poorly', 'insulated', 'introduce', 'improve', 'internal', 'wall', 'insulation', 'medium', 'carry', 'pressure', 'test', 'identify', 'treat', 'identified', 'air', 'leakage', 'enter', 'result', 'epc', 'calculation', 'medium', 'change', 'pay', '7', 'year', 'recommendation', 'potential', 'impact', 'glazing', 'poorly', 'insulated', 'replaceimprove', 'glazing', 'andor', 'frame', 'medium', 'consider', 'installing', 'air', 'source', 'heat', 'pump', 'high', 'consider', 'installing', 'ground', 'source', 'heat', 'pump', 'high', 'consider', 'installing', 'building', 'mounted', 'wind', 'turbine', 'low', 'consider', 'installing', 'solar', 'water', 'heating', 'low', 'additional', 'recommendation', 'recommendation', 'potential', 'impact', 'consider', 'replacing', 't8', 'lamp', 'led', 'low', 'property', 'report', 'detail', 'report', 'issued', '4', 'february', '2022', 'total', 'useful', 'floor', 'area', '864', 'square', 'metre', 'building', 'environment', 'heating', 'natural', 'ventilation', 'calculation', 'tool', 'designbuilder', 'software', 'ltd', 'designbuilder', 'sbem', 'v618', 'sbem', 'v56b0', 'assessor', 'detail', 'assessor', 'name', 'ashton', 'kaziboni', 'telephone', '01327', '811166', 'email', 'ashtonkazibonisocoteccom', 'employer', 'name', 'socotec', 'ltd', 'employer', 'address', 'henge', 'barn', 'pury', 'hill', 'business', 'park', 'alderton', 'road', 'assessor', 'id', 'stro034300', 'assessor', 'declaration', 'assessor', 
'related', 'owner', 'property', 'accreditation', 'scheme', 'stroma', 'certification', 'ltd', 'report', 'property', 'aware', 'previous', 'report', 'property', 'listed', 'please', 'contact', 'u', 'dluhcdigitalserviceslevellingupgovuk', 'call', 'helpdesk', '020', '3829', '0748', 'monday', 'friday', '9am', '5pm', 'related', 'report', 'property', 'support', 'link', 'accessibility', 'statement', 'cooky', 'service', 'give', 'feedback', 'service', 'performance', 'content', 'available', 'open', 'government', 'licence', 'v30', 'except', 'otherwise', 'stated', 'crown', 'copyright']

# cleanedtext: list of documents, each document being a list of word tokens
cleanedtext = [
    ['energy', 'certificate', 'performance', 'energy', 'certificate'],
    ['epc', 'find', 'energy', 'certificate', 'govuk'],
    ['cooky', 'find', 'energy', 'certificate', 'use'],
    ['essential', 'cooky', 'make', 'service', 'work'],
    ['youve', 'accepted', 'analytics', 'cooky', 'change'],
]

# One token tally per document, kept in document order
per_document_counts = [Counter(tokens) for tokens in cleanedtext]

# For every WordVec entry, collect its count in each document; a Counter
# yields 0 for tokens the document does not contain
word_vec_dict = {
    vocab_word: [counts[vocab_word] for counts in per_document_counts]
    for vocab_word in WordVec
}

# Show the resulting word -> per-document counts mapping
print(word_vec_dict)


{'energy': [2, 1, 1, 0, 0], 'performance': [1, 0, 0, 0, 0], 'certificate': [2, 1, 1, 0, 0], 'epc': [0, 1, 0, 0, 0], 'find': [0, 1, 1, 0, 0], 'govuk': [0, 1, 0, 0, 0], 'cooky': [0, 0, 1, 1, 1], 'use': [0, 0, 1, 0, 0], 'essential': [0, 0, 0, 1, 0], 'make': [0, 0, 0, 1, 0], 'service': [0, 0, 0, 1, 0], 'work': [0, 0, 0, 1, 0], 'wed': [0, 0, 0, 0, 0], 'also': [0, 0, 0, 0, 0], 'like': [0, 0, 0, 0, 0], 'analytics': [0, 0, 0, 0, 1], 'understand': [0, 0, 0, 0, 0], 'improvement': [0, 0, 0, 0, 0], 'accept': [0, 0, 0, 0, 0], 'reject': [0, 0, 0, 0, 0], 'view': [0, 0, 0, 0, 0], 'youve': [0, 0, 0, 0, 1], 'accepted': [0, 0, 0, 0, 1], 'change': [0, 0, 0, 0, 1], 'cookie': [0, 0, 0, 0, 0], 'setting': [0, 0, 0, 0, 0], 'time': [0, 0, 0, 0, 0], 'rejected': [0, 0, 0, 0, 0], 'hide': [0, 0, 0, 0, 0], 'message': [0, 0, 0, 0, 0], 'skip': [0, 0, 0, 0, 0], 'main': [0, 0, 0, 0, 0], 'content': [0, 0, 0, 0, 0], 'recommendation': [0, 0, 0, 0, 0], 'report': [0, 0, 0, 0, 0], 'rating': [0, 0, 0, 0, 0], 'property': [0, 0,

In [None]:
from collections import Counter
import pandas as pd

# WordVec: list of word tokens (plain strings, duplicates included) used as the vocabulary - not numeric word vectors
WordVec = ['energy', 'performance', 'certificate', 'epc', 'find', 'energy', 'certificate', 'govuk', 'cooky', 'find', 'energy', 'certificate', 'use', 'essential', 'cooky', 'make', 'service', 'work', 'wed', 'also', 'like', 'use', 'analytics', 'cooky', 'understand', 'use', 'service', 'make', 'improvement', 'accept', 'analytics', 'cooky', 'reject', 'analytics', 'cooky', 'view', 'cooky', 'youve', 'accepted', 'analytics', 'cooky', 'change', 'cookie', 'setting', 'time', 'youve', 'rejected', 'analytics', 'cooky', 'change', 'cookie', 'setting', 'time', 'hide', 'cookie', 'message', 'skip', 'main', 'content', 'govuk', 'find', 'energy', 'certificate', 'energy', 'performance', 'certificate', 'epc', 'recommendation', 'report', 'report', 'content', 'energy', 'rating', 'epc', 'recommendation', 'property', 'report', 'detail', 'assessor', 'detail', 'report', 'property', 'share', 'report', 'email', 'copy', 'link', 'clipboard', 'print', 'su5355', 'vicar', 'lane', 'leeds', 'ls1', '6ba', 'report', 'number', '92431656815756147032', 'valid', '3', 'february', '2032', 'energy', 'rating', 'epc', 'property', 'current', 'energy', 'rating', 'information', 'property', 'energy', 'performance', 'see', 'epc', 'property', 'recommendation', 'make', 'change', 'improve', 'property', 'energy', 'efficiency', 'recommended', 'improvement', 'grouped', 'estimated', 'time', 'would', 'take', 'change', 'pay', 'assessor', 'may', 'also', 'make', 'additional', 'recommendation', 'recommendation', 'marked', 'low', 'medium', 'high', 'show', 'potential', 'impact', 'change', 'reducing', 'property', 'carbon', 'emission', 'change', 'pay', 'within', '3', 'year', 'recommendation', 'potential', 'impact', 'improve', 'insulation', 'hws', 'storage', 'low', 'add', 'time', 'control', 'heating', 'system', 'medium', 'space', 'solar', 'gain', 'limit', 'defined', 'ncm', 'exceeded', 'might', 'cause', 'overheating', 'consider', 'solar', 'control', 'measure', 'application', 'reflective', 'coating', 'shading', 'device', 'window', 
'medium', 'add', 'optimum', 'startstop', 'heating', 'system', 'medium', 'change', 'pay', 'within', '3', '7', 'year', 'recommendation', 'potential', 'impact', 'window', 'high', 'uvalues', 'consider', 'installing', 'secondary', 'glazing', 'medium', 'add', 'local', 'temperature', 'control', 'heating', 'system', 'medium', 'add', 'weather', 'compensation', 'control', 'heating', 'system', 'medium', 'loft', 'space', 'poorly', 'insulated', 'installimprove', 'insulation', 'medium', 'add', 'local', 'time', 'control', 'heating', 'system', 'medium', 'solid', 'wall', 'poorly', 'insulated', 'introduce', 'improve', 'internal', 'wall', 'insulation', 'medium', 'carry', 'pressure', 'test', 'identify', 'treat', 'identified', 'air', 'leakage', 'enter', 'result', 'epc', 'calculation', 'medium', 'change', 'pay', '7', 'year', 'recommendation', 'potential', 'impact', 'glazing', 'poorly', 'insulated', 'replaceimprove', 'glazing', 'andor', 'frame', 'medium', 'consider', 'installing', 'air', 'source', 'heat', 'pump', 'high', 'consider', 'installing', 'ground', 'source', 'heat', 'pump', 'high', 'consider', 'installing', 'building', 'mounted', 'wind', 'turbine', 'low', 'consider', 'installing', 'solar', 'water', 'heating', 'low', 'additional', 'recommendation', 'recommendation', 'potential', 'impact', 'consider', 'replacing', 't8', 'lamp', 'led', 'low', 'property', 'report', 'detail', 'report', 'issued', '4', 'february', '2022', 'total', 'useful', 'floor', 'area', '864', 'square', 'metre', 'building', 'environment', 'heating', 'natural', 'ventilation', 'calculation', 'tool', 'designbuilder', 'software', 'ltd', 'designbuilder', 'sbem', 'v618', 'sbem', 'v56b0', 'assessor', 'detail', 'assessor', 'name', 'ashton', 'kaziboni', 'telephone', '01327', '811166', 'email', 'ashtonkazibonisocoteccom', 'employer', 'name', 'socotec', 'ltd', 'employer', 'address', 'henge', 'barn', 'pury', 'hill', 'business', 'park', 'alderton', 'road', 'assessor', 'id', 'stro034300', 'assessor', 'declaration', 'assessor', 
'related', 'owner', 'property', 'accreditation', 'scheme', 'stroma', 'certification', 'ltd', 'report', 'property', 'aware', 'previous', 'report', 'property', 'listed', 'please', 'contact', 'u', 'dluhcdigitalserviceslevellingupgovuk', 'call', 'helpdesk', '020', '3829', '0748', 'monday', 'friday', '9am', '5pm', 'related', 'report', 'property', 'support', 'link', 'accessibility', 'statement', 'cooky', 'service', 'give', 'feedback', 'service', 'performance', 'content', 'available', 'open', 'government', 'licence', 'v30', 'except', 'otherwise', 'stated', 'crown', 'copyright']

# cleanedtext: list of documents, each document being a list of word tokens
cleanedtext = [
    ['energy', 'certificate', 'performance', 'energy', 'certificate'],
    ['epc', 'find', 'energy', 'certificate', 'govuk'],
    ['cooky', 'find', 'energy', 'certificate', 'use'],
    ['essential', 'cooky', 'make', 'service', 'work'],
    ['youve', 'accepted', 'analytics', 'cooky', 'change'],
]

# One token tally per document, kept in document order
per_document_counts = [Counter(tokens) for tokens in cleanedtext]

# Build the word -> per-document counts mapping entry by entry; a Counter
# yields 0 for tokens the document does not contain
word_vec_dict = {}
for vocab_word in WordVec:
    word_vec_dict[vocab_word] = [counts[vocab_word] for counts in per_document_counts]

# One column per word, one row per document
word_vec_df = pd.DataFrame(word_vec_dict)

# Display the dataframe
print(word_vec_df)


   energy  performance  certificate  epc  find  govuk  cooky  use  essential  \
0       2            1            2    0     0      0      0    0          0   
1       1            0            1    1     1      1      0    0          0   
2       1            0            1    0     1      0      1    1          0   
3       0            0            0    0     0      0      1    0          1   
4       0            0            0    0     0      0      1    0          0   

   make  ...  available  open  government  licence  v30  except  otherwise  \
0     0  ...          0     0           0        0    0       0          0   
1     0  ...          0     0           0        0    0       0          0   
2     0  ...          0     0           0        0    0       0          0   
3     1  ...          0     0           0        0    0       0          0   
4     0  ...          0     0           0        0    0       0          0   

   stated  crown  copyright  
0       0      0    