# data analysis

In [None]:
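# pip install nltk pillow opencv-python pytesseract numpy pyautogui pynput selenium
# (pytesseract also needs the Tesseract-OCR program itself installed on the machine)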

In [2]:
import os
import cv2
import pytesseract
from PIL import Image
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from IPython.display import display, HTML

# Download the NLTK data used by clean_text: tokenizer models for
# word_tokenize, the English stop-word list, and WordNet for the lemmatizer.
nltk.download('punkt')
# Recent NLTK releases load 'punkt_tab' instead of the pickled 'punkt' models
# in word_tokenize; requesting both keeps old and new installs working.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Specify the path to tesseract.exe (machine-specific; adjust per install)
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\bbartling\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'

# Folder that is scanned for input files, and the filename prefix that
# selects which files in it are processed.
data_directory = r'C:\Users\bbartling\Desktop\AutonomousBuildingOperator\data'
file_prefix = 'rtu_'

# Function to preprocess the image
def preprocess_image(image_path, scale_percent=150):
    """Load an image and build an upscaled, binarized copy for OCR.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.
        scale_percent: Size of the OCR image relative to the original, in
            percent (150 means 1.5x in each dimension).

    Returns:
        A ``(thresh, image)`` tuple: ``thresh`` is the resized, grayscale,
        Otsu-thresholded image; ``image`` is the image exactly as loaded.

    Raises:
        OSError: If OpenCV could not read ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure (missing file, unsupported format, ...) by
    # returning None rather than raising, so check before touching .shape.
    if image is None:
        raise OSError(f"Could not read image file: {image_path!r}")

    height, width = image.shape[:2]
    dim = (int(width * scale_percent / 100), int(height * scale_percent / 100))
    resized = cv2.resize(image, dim, interpolation=cv2.INTER_LINEAR)
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    # With THRESH_OTSU set, OpenCV computes the threshold itself; the 150
    # passed here is not used as the cut-off.
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return thresh, image

# Function to extract text and coordinates from an image
def image_to_text_with_coordinates(image_path, min_confidence=0):
    """Run Tesseract on an image and return each recognized token with its box.

    The image is first passed through ``preprocess_image``; OCR runs on the
    preprocessed result, so the returned boxes are in that image's pixel
    coordinates, not the original's.

    Args:
        image_path: Path of the image file to OCR.
        min_confidence: Tokens are kept only when their confidence is
            strictly greater than this value. The default of 0 drops the
            ``-1`` entries Tesseract emits for non-word layout boxes.

    Returns:
        A ``(text_with_coords, original_image, preprocessed_image)`` tuple,
        where ``text_with_coords`` is a list of ``(text, (x, y, w, h))``.
    """
    preprocessed_image, original_image = preprocess_image(image_path)
    data = pytesseract.image_to_data(preprocessed_image, output_type=pytesseract.Output.DICT)

    text_with_coords = []
    rows = zip(data['text'], data['conf'], data['left'], data['top'], data['width'], data['height'])
    for text, conf, x, y, w, h in rows:
        # Depending on the Tesseract/pytesseract versions, confidences may be
        # ints, floats, or strings such as '96.5'; int() alone rejects the
        # latter, so go through float() first.
        if int(float(conf)) <= min_confidence:
            continue
        # Whitespace-only tokens carry no text and would only add noise to
        # the later position-based grouping.
        if not str(text).strip():
            continue
        text_with_coords.append((text, (x, y, w, h)))

    return text_with_coords, original_image, preprocessed_image

# Group text elements by proximity
def group_text_elements(text_with_coords, threshold=50):
    """Split a sequence of OCR tokens into runs of vertically close items.

    Tokens are visited in the given order. A token joins the current group
    when its ``y`` (``coords[1]``) is within ``threshold`` of the previous
    token's ``y``; otherwise it starts a new group.

    Args:
        text_with_coords: Sequence of ``(text, (x, y, w, h))`` entries.
        threshold: Maximum ``y`` difference between consecutive tokens for
            them to share a group.

    Returns:
        A list of groups, each a list of ``(text, coords)`` entries. An
        empty input yields an empty list.
    """
    # Nothing recognized (e.g. a blank image): there is no first element to
    # seed a group with, so return early instead of raising IndexError.
    if not text_with_coords:
        return []

    groups = [[text_with_coords[0]]]
    for text, coords in text_with_coords[1:]:
        previous_y = groups[-1][-1][1][1]
        if abs(coords[1] - previous_y) <= threshold:
            groups[-1].append((text, coords))
        else:
            groups.append([(text, coords)])
    return groups

# Sort text elements within each group
def sort_within_groups(groups):
    """Order each group by (y, x) and return all entries as one flat list.

    Each group is sorted in place, so the caller's lists are reordered too.
    Group order itself is left as given.
    """
    def _y_then_x(entry):
        box = entry[1]
        return (box[1], box[0])

    flattened = []
    for members in groups:
        members.sort(key=_y_then_x)
        flattened += members
    return flattened

# Function to form a sentence from text elements
def form_sentence_from_text(text_with_coords):
    """Join the text part of each (text, coords) entry with single spaces."""
    words = []
    for word, _box in text_with_coords:
        words.append(word)
    return " ".join(words)

# Function to clean up text using NLTK
def clean_text(text):
    """Tokenize ``text``, drop stop words and non-alphanumeric tokens, lemmatize.

    Returns the surviving lemmatized tokens joined by single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    kept = []
    for token in word_tokenize(text):
        # NOTE(review): isalnum() is False for any token containing '.', '%',
        # ':' etc., so a reading such as "67.2" is discarded here if the
        # tokenizer keeps it as one token -- confirm that is intended.
        if not token.isalnum():
            continue
        if token.lower() in english_stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return " ".join(kept)


20240706_101808: ‘he User @ Alarms Favorites @ Home @ Help (E17L90491) hics (Custom) WECC_RL_D60 - Occupancy Status Duct Pressure Static Setpoint BAS in(H:0) Free Cooling Enabled: Disable Discharge Air Cooling Setpoint BAS Supply Fan Start Stop Command: On Econ Decision Method: Dry Bulb Economizer Lockout Setpoint Supply Fan Status: On Outdoor Air Damper Position: 0.0 % First Floor OA Damper Minimum Setpoint Supply Fan Speed Status: 67.2 % Data Logs Outdoor Air Relative Humidity: 74.3% » ts Main Menu Second Floor Schedules » Reports » Alarm Configuration = Tools = Installation » Heating Capacity Status: 0.0% Cooling Capacity Status: 38.0 % slipstream >> breakthrough sofutions Air Retum Fan Enable: Enable energy Discharge Temperature: 63.3 °F ERU Supply Fan Command: Enable Supply Fan Alarm: Normal Retum Fan Speed Status: 52.2 % ERU Exhaust Fan Command: On Retum Air Temperature: 72.3 °F Supply Fan VED Alarm: Normal Retum Air Outside Relative Humidity: 78.8 % ERU Air Temperature: 72.3 Duc

In [None]:
# Get all rtu_ files, in sorted filename order
rtu_files = sorted(f for f in os.listdir(data_directory) if f.startswith(file_prefix))

all_sentences = []

# Loop through each rtu_ file: OCR it, order the tokens, and clean the text
for rtu_file in rtu_files:
    rtu_path = os.path.join(data_directory, rtu_file)
    extracted_text_with_coords, original_image, thresh_image = image_to_text_with_coordinates(rtu_path)

    # Group and sort text elements. An image with no recognized text has
    # nothing to group, so skip grouping rather than index into an empty list.
    groups = group_text_elements(extracted_text_with_coords) if extracted_text_with_coords else []
    sorted_text_with_coords = sort_within_groups(groups)

    # Form and collect sentence from the text elements
    sentence = form_sentence_from_text(sorted_text_with_coords)
    cleaned_sentence = clean_text(sentence)
    # Timestamp = filename minus the prefix and the extension. splitext copes
    # with extensions of any length (.png, .jpeg, ...), unlike a fixed [:-4].
    timestamp = os.path.splitext(rtu_file)[0][len(file_prefix):]
    all_sentences.append(f"{timestamp}: {cleaned_sentence}")

# Concatenate all sentences into one giant sentence
giant_sentence = " ".join(all_sentences)


In [None]:
# Display the list of per-file "timestamp: cleaned text" strings
all_sentences

In [None]:
# Display every per-file string joined into one space-separated string
giant_sentence