In [None]:
from google.colab import drive

# Attach Google Drive to the Colab runtime; later cells read the report
# files from a folder underneath this mount point.
drive.mount(mountpoint='/content/drive')


Mounted at /content/drive


In [None]:
import nltk

# Fetch the NLTK resources used by the preprocessing step below:
# the punkt tokenizer models, the stopword lists and the WordNet corpus.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)
# Kept as the last expression so the cell still displays the download status.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Function to read text data from a file in a directory
def read_text_from_file(file_path):
    """Return the full contents of ``file_path`` as a string.

    The file is opened in text mode and decoded as latin-1.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text of the whole file.
    """
    with open(file_path, 'r', encoding='latin-1') as source:
        return source.read()


# Define a function for cleaning and preprocessing the text data
def preprocess_text(text):
    """Clean raw (possibly HTML) text into a space-separated string of lemmas.

    Steps, in order: strip HTML markup, lowercase, delete every character
    that is not an ASCII letter, digit or whitespace, tokenize, drop English
    stopwords, and lemmatize the remaining tokens with WordNet.

    Relies on the NLTK ``punkt``, ``stopwords`` and ``wordnet`` resources
    being available.

    Args:
        text: Raw document text.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Convert to lowercase
    text = text.lower()

    # Remove non-alphanumeric characters (deleted outright, not replaced by a
    # space, so "well-known" becomes "wellknown")
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Tokenize the text into words
    words = word_tokenize(text)

    # Build the stopword set once per call. The previous version evaluated
    # stopwords.words('english') for every single token and then scanned the
    # resulting list linearly; a set gives constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))

    # Drop stopwords and lemmatize the survivors in a single pass
    lemmatizer = WordNetLemmatizer()
    words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in english_stopwords
    ]

    # Join the cleaned words back into a single string
    return " ".join(words)


def vectorize_text_to_df(_text):
    """Turn one cleaned document into a single-row TF-IDF DataFrame.

    A fresh ``TfidfVectorizer`` is fitted on ``_text`` alone, so the
    vocabulary (and therefore the columns) comes from this document only.
    The resulting frame is also printed.

    Args:
        _text: Cleaned document text.

    Returns:
        A DataFrame with one row and one column per term found in ``_text``.
    """
    tfidf = TfidfVectorizer()
    # The vectorizer expects a collection of documents, hence the 1-item list.
    weights = tfidf.fit_transform([_text])
    terms = tfidf.get_feature_names_out()
    frame = pd.DataFrame(weights.toarray(), columns=terms)
    print(frame)
    return frame

# Folder on the mounted Drive that holds the report files to vectorize.
directory = '/content/drive/MyDrive/energyreporttest'
# Sorted so rows come out in a reproducible order (os.listdir order is arbitrary).
file_list = sorted(os.listdir(directory))

# One single-row TF-IDF frame per report, collected and combined once at the end.
_frames = []
for file_name in file_list:
    file_path = os.path.join(directory, file_name)
    # Skip sub-directories and other non-file entries; open() would fail on them.
    if not os.path.isfile(file_path):
        continue
    # Read the raw report text from disk
    _scrapped_data = read_text_from_file(file_path)
    _cleaned_text = preprocess_text(_scrapped_data)
    # Vectorize the cleaned text into a one-row TF-IDF frame
    dfn = vectorize_text_to_df(_cleaned_text)
    # Label the row with its source file so documents stay identifiable.
    dfn.index = [file_name]
    _frames.append(dfn)

# Stack the per-document rows. The previous pd.merge(dfMain, dfn) performed an
# inner join on every shared column; since weights differ between documents
# the join came back empty and all but one document was silently discarded.
# concat keeps one row per file over the union of all terms; a term that does
# not occur in a document gets weight 0.
# NOTE(review): each row was fitted on its own document, so weights are not
# corpus-wide TF-IDF values -- confirm this is the intended comparison.
if _frames:
    dfMain = pd.concat(_frames, sort=False).fillna(0.0)
else:
    dfMain = pd.DataFrame()

dfMain.head()

      00000  0000000000  0000000015  0000000262  0000000299  0000002071  \
0  0.547646    0.003184    0.003184    0.003184    0.003184    0.003184   

   0000004731  0000004945  0000007133  0000007364  ...  zvdco04c3fdk  \
0    0.003184    0.003184    0.003184    0.003184  ...      0.003184   

        zvh   zvzfk5a      zwr9  zxfeygmkmt0m6n  zxwqyywqwe  zymsbpighs2  \
0  0.003184  0.003184  0.003184        0.003184    0.003184     0.003184   

    zzauqvr      zzhj       zzo  
0  0.003184  0.003184  0.003184  

[1 rows x 1950 columns]
      00000  0000000000  0000000015  0000000262  0000000299  0000002029  \
0  0.548476    0.003407    0.003407    0.003407    0.003407    0.003407   

   0000004982  0000005185  0000006950  0000007181  ...      ztjn  ztzcxhxxha  \
0    0.003407    0.003407    0.003407    0.003407  ...  0.003407    0.003407   

   zvj0t8cym1sbm  zwellvoji     zxe7f       zxj  zy0unervfgx   zybqkwb  \
0       0.003407   0.003407  0.003407  0.003407     0.003407  0.003407  

Unnamed: 0,00000,0000000000,0000000015,0000000262,0000000299,0000002050,0000004980,0000005184,0000006918,0000007149,...,zrizs,zss9,ztquplscdqfr6438utt9c,ztsv3y0pcngzvmn,zu,zuvss8zfv9ux4s,zv,zyy,zyzib4luzk,zzmrutvqd
0,0.550853,0.003421,0.003421,0.003421,0.003421,0.003421,0.003421,0.003421,0.003421,0.003421,...,0.003421,0.003421,0.003421,0.003421,0.003421,0.003421,0.003421,0.003421,0.003421,0.003421


In [None]:
# Destination workbook; a bare file name, so it is resolved against the
# current working directory.
output_file = "output2.xlsx"

# Write the combined TF-IDF table to the workbook.
dfMain.to_excel(excel_writer=output_file)

print("DataFrame copied to Excel file successfully.")