In [4]:
import os
import re
from datetime import datetime
from io import BytesIO
from pathlib import Path

import openpyxl
import pandas as pd
from azure.storage.blob import BlobServiceClient

In [9]:
def get_weeks_and_comments(row):
    """Collect the columns of ``row`` whose value is 1, with their cell comments.

    The row label (``row.name``) is expected to look like ``"N°3"``: the
    characters ``N`` and ``°`` are stripped and the remainder is read as an
    integer. The matching worksheet row is that number plus one, looked up in
    the module-level ``sheet``. The value at position ``i`` of ``row``
    (0-based) is paired with the worksheet cell at index ``i + 1`` of that row.

    Returns a ``pd.Series`` with two entries, ``"weeks"`` (the matching column
    labels) and ``"comments"`` (the comment text of each matching cell, or
    ``""`` when the cell has no comment). Both are lists of equal length.
    """
    pool_row = int(row.name.strip("N°")) + 1
    matched_weeks = []
    matched_comments = []
    for position, (week, value) in enumerate(row.items(), start=1):
        if value == 1:
            matched_weeks.append(week)
            note = sheet[pool_row][position].comment
            matched_comments.append(note.text if note else "")
    return pd.Series({"weeks": matched_weeks, "comments": matched_comments})


# Fill colours (the strings returned by ``cell.fill.fgColor.rgb``) that mark a
# repair span, grouped by the repair-type code they produce.
_TYPE_I_FILLS = ("FFEDBFBB", "FFE88880")
_TYPE_P_FILLS = ("FFC5E0B4",)


def get_end_week(row):
    """Find the weeks where a coloured repair span ends in this row.

    The row label (``row.name``) is expected to look like ``"N°3"``; the
    characters ``N`` and ``°`` are stripped and the rest is read as an integer.
    The worksheet row is that number plus one, taken from the module-level
    ``sheet``.

    A span end is recorded at a cell when all of the following hold:
    its fill colour differs from the next cell's, its value is ``None``, and
    its fill colour is one of the known repair colours. The recorded week is
    the ``row`` label at the cell's worksheet index minus one.

    Returns a ``pd.Series`` with ``"weeks"`` (list of column labels) and
    ``"pool_repair_type"`` (``"I"`` for FFEDBFBB / FFE88880 fills, ``"P"`` for
    FFC5E0B4), both lists of equal length.

    NOTE(review): the scan inspects worksheet indices 2 .. len(row) - 2 as
    possible span ends, so a span reaching the final columns is never
    reported — confirm this is intended.
    """
    row_idx = int(row.name.strip("N°"))
    # Look the worksheet row and the labels up once instead of per iteration.
    sheet_row = sheet[row_idx + 1]
    week_labels = list(row.keys())
    repair_fills = _TYPE_I_FILLS + _TYPE_P_FILLS

    weeks = []
    pool_repair_types = []
    for col_idx in range(3, len(row)):
        prev_cell = sheet_row[col_idx - 1]
        prev_rgb = prev_cell.fill.fgColor.rgb
        span_ends_here = (
            prev_rgb != sheet_row[col_idx].fill.fgColor.rgb
            and prev_cell.value is None
            and prev_rgb in repair_fills
        )
        if not span_ends_here:
            continue
        # Worksheet index k corresponds to row position k - 1, hence the -2
        # to reach the label of the previous cell.
        weeks.append(week_labels[col_idx - 2])
        pool_repair_types.append("I" if prev_rgb in _TYPE_I_FILLS else "P")

    return pd.Series({"weeks": weeks, "pool_repair_type": pool_repair_types})


def idx_to_pool_number(df):
    """Move the index into a ``pool_number`` column with the "N°" prefix removed.

    The index is reset into a regular column; the column named ``"index"``
    (the name ``reset_index`` gives an unnamed index) is renamed to
    ``"pool_number"`` and the characters ``N`` and ``°`` are stripped from both
    ends of each value. The input frame is not modified.
    """
    flattened = df.reset_index().rename(columns={"index": "pool_number"})
    flattened["pool_number"] = flattened["pool_number"].str.strip("N°")
    return flattened


def extract_info(comment):
    """Pull the equipment number and serial tag out of a cell comment.

    Looks for ``Equipo: <digits>`` and ``NS: #<word>[-<word>]`` anywhere in
    ``comment``. Returns a tuple ``(equipo, ns)``; each element is the captured
    text, or ``None`` when its pattern is not found.
    """
    patterns = (r"Equipo:\s*(\d+)", r"NS:\s*(#\w*-?\w*)")
    found = [re.search(pattern, comment) for pattern in patterns]
    equipo, ns = [hit.group(1) if hit else None for hit in found]
    return equipo, ns


# Folder that is searched recursively for pool workbooks.
# NOTE(review): absolute, machine-specific path — point it at your own copy of the data.
POOL_FILES_DIR = Path("/home/cecilvega/Public/pool-files")


def _week_label_to_date(week_label):
    """Parse a week label such as "2023-W30" into the Monday of that week.

    NOTE(review): "%W" numbers weeks from the first Monday of the year, which
    is not always the ISO-8601 week number — confirm how the workbooks number
    their weeks.
    """
    return datetime.strptime(week_label + "-1", "%Y-W%W-%w")


files = list(POOL_FILES_DIR.rglob("*cl.xlsx"))
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No '*cl.xlsx' workbooks found under {POOL_FILES_DIR}")

frames = []
for file in files:
    wb = openpyxl.load_workbook(str(file), data_only=True)
    # `sheet` has to stay a module-level name: get_weeks_and_comments and
    # get_end_week read it as a global to reach cell comments and fill colours.
    sheet = wb.active

    # First worksheet row holds the headers; first column becomes the index.
    header = [cell.value for cell in sheet[1]]
    pool_df = pd.DataFrame(
        list(sheet.iter_rows(min_row=2, values_only=True)), columns=header
    )
    pool_df = pool_df.set_index(pool_df.columns[0])

    # One row per (pool, start week). Pools with no cell equal to 1 explode to
    # NaN and are dropped, since the comment/date parsing below needs strings.
    start_repair_df = (
        pool_df.apply(get_weeks_and_comments, axis=1)
        .explode(["weeks", "comments"])
        .dropna(subset=["weeks"])
        .rename(columns={"weeks": "repair_start_week"})
        .pipe(idx_to_pool_number)
    )

    # Extract information from comments
    start_repair_df[["equipo", "ns"]] = start_repair_df["comments"].apply(
        lambda x: pd.Series(extract_info(x))
    )
    start_repair_df = start_repair_df.drop(columns=["comments"]).assign(
        repair_start_date=pd.to_datetime(
            start_repair_df["repair_start_week"].map(_week_label_to_date)
        )
    )

    # One row per (pool, end week); pools without a detected span end are
    # dropped for the same reason as above.
    end_repair_df = (
        pool_df.apply(get_end_week, axis=1)
        .explode(["weeks", "pool_repair_type"])
        .dropna(subset=["weeks"])
        .rename(columns={"weeks": "repair_end_week"})
        .pipe(idx_to_pool_number)
    )
    # pd.to_datetime keeps the merge key datetime-typed even when no end weeks remain.
    end_repair_df = end_repair_df.assign(
        repair_end_date=pd.to_datetime(
            end_repair_df["repair_end_week"].map(_week_label_to_date)
        )
    )

    # Pair every start with the first end on or after it, within the same pool.
    # The component code is the last "-"-separated piece of the file stem.
    repairs_df = pd.merge_asof(
        start_repair_df.sort_values("repair_start_date"),
        end_repair_df.sort_values("repair_end_date"),
        by="pool_number",
        left_on="repair_start_date",
        right_on="repair_end_date",
        direction="forward",
    ).assign(component=file.stem.split("-")[-1])
    frames.append(repairs_df)

df = pd.concat(frames)

In [10]:
# Display the combined repairs table built by the loop above (one row per repair start).
df

Unnamed: 0,pool_number,repair_start_week,equipo,ns,repair_start_date,repair_end_week,pool_repair_type,repair_end_date,component
0,1,2023-W30,872,#178,2023-07-24,2023-W41,P,2023-10-09,cl
1,2,2023-W35,861,#RLTC707,2023-08-28,2023-W46,P,2023-11-13,cl
2,3,2023-W41,856,#101,2023-10-09,2023-W52,P,2023-12-25,cl
3,4,2023-W51,398,#LM017653,2023-12-18,2024-W10,P,2024-03-04,cl
4,5,2024-W2,883,#223,2024-01-08,2024-W13,P,2024-03-25,cl
5,6,2024-W2,883,#222,2024-01-08,2024-W13,P,2024-03-25,cl
6,7,2024-W8,856,#LM021235,2024-02-19,2024-W19,P,2024-05-06,cl
7,8,2024-W8,870,#180,2024-02-19,2024-W19,P,2024-05-06,cl
8,9,2024-W13,859,#LM017272,2024-03-25,2024-W32,I,2024-08-05,cl
9,10,2024-W13,869,#173,2024-03-25,2024-W24,P,2024-06-10,cl


In [13]:
# Scratch arithmetic: 110 divided by 7.
# NOTE(review): presumably a days-to-weeks conversion — confirm the intent or delete this cell.
110 / 7

15.714285714285714

In [8]:
# Show the workbook paths picked up by the rglob("*cl.xlsx") search.
files

[PosixPath('/home/cecilvega/Public/pool-files/pool-cc-cl.xlsx')]

In [1]:
# Single-workbook version of the steps performed inside the `for file in files`
# loop: load one workbook, derive repair start/end weeks per pool, and pair
# each start with the next end via merge_asof.
# NOTE(review): this cell defines neither `file` nor the helpers it calls
# (get_weeks_and_comments, get_end_week, idx_to_pool_number, extract_info).
# It only works after the cells that create them have run; on a fresh kernel
# it stops with "NameError: name 'file' is not defined".

# Open the workbook; data_only=True is passed to openpyxl.load_workbook.
excel_file = file.__str__()  # `file` is a Path left over from the loop cell
wb = openpyxl.load_workbook(excel_file, data_only=True)
# `sheet` is read as a global by get_weeks_and_comments and get_end_week.
sheet = wb.active

# Build a DataFrame from worksheet rows 2 onward; row 1 supplies the headers.
data = []
for row in sheet.iter_rows(min_row=2, values_only=True):
    data.append(row)

df = pd.DataFrame(data, columns=[cell.value for cell in sheet[1]])
# The first column becomes the index.
df.set_index(df.columns[0], inplace=True)

# One row per (pool, start week): columns whose value is 1, plus their comments.
start_repair_df = (
    df.apply(get_weeks_and_comments, axis=1)
    .explode(["weeks", "comments"])
    .rename(columns={"weeks": "repair_start_week"})
    .pipe(idx_to_pool_number)
)

# Split each comment into the "equipo" and "ns" columns.
start_repair_df[["equipo", "ns"]] = start_repair_df["comments"].apply(
    lambda x: pd.Series(extract_info(x))
)
# Week label + "-1" is parsed with "%Y-W%W-%w", i.e. weekday 1 (Monday) of that week.
start_repair_df = start_repair_df.drop(columns=["comments"]).assign(
    repair_start_date=start_repair_df["repair_start_week"].map(
        lambda x: datetime.strptime(x + "-1", "%Y-W%W-%w")
    )
)
# One row per (pool, end week) with the repair-type code from get_end_week.
end_repair_df = (
    df.apply(get_end_week, axis=1)
    .explode(["weeks", "pool_repair_type"])
    .rename(columns={"weeks": "repair_end_week"})
    .pipe(idx_to_pool_number)
)
end_repair_df = end_repair_df.assign(
    repair_end_date=end_repair_df["repair_end_week"].map(
        lambda x: datetime.strptime(x + "-1", "%Y-W%W-%w")
    )
)

# Pair each start with the first end on or after it within the same pool;
# `df` is rebound here from the raw sheet data to the merged result.
# The component code is the last "-"-separated piece of the file stem.
df = pd.merge_asof(
    start_repair_df.sort_values("repair_start_date"),
    end_repair_df.sort_values("repair_end_date"),
    by="pool_number",
    left_on="repair_start_date",
    right_on="repair_end_date",
    direction="forward",
).assign(component=file.stem.split("-")[-1])

NameError: name 'file' is not defined

In [None]:
# Display `df` as left by the most recently executed cell that assigned it.
df

In [None]:
"/home/cecilvega/Public"

In [3]:
# Azure Blob Storage connection settings.
account_url = "https://tecnologiakchqa.blob.core.windows.net"
# The SAS token is a credential and must not be stored in the notebook, so it
# is read from the environment. The token that used to be hard-coded here
# should be treated as leaked and revoked.
sas_token = os.environ.get("AZURE_STORAGE_SAS_TOKEN")
if not sas_token:
    raise RuntimeError(
        "Set the AZURE_STORAGE_SAS_TOKEN environment variable to a SAS token "
        "for the storage account before running this cell."
    )
# Blob-name prefix used when listing blobs, and the container to list from.
prefix = "FAENAS/ESCONDIDA/y=2023/m=09/MEL_Septiembre_2023/TK848"
container_name = "kch-ddmm"
blob_service_client = BlobServiceClient(account_url=account_url, credential=sas_token)

In [25]:
# Collect the names of every blob in the container whose name starts with `prefix`.
container_client = blob_service_client.get_container_client(container_name)
blob_list = [
    blob.name for blob in container_client.list_blobs(name_starts_with=prefix)
]

['FAENAS/ESCONDIDA/y=2023/m=09/MEL_Septiembre_2023/TK848/pool-files/.~lock.pool-cc-cms.xlsx#',
 'FAENAS/ESCONDIDA/y=2023/m=09/MEL_Septiembre_2023/TK848/pool-files/pool-cc-blower.xlsx',
 'FAENAS/ESCONDIDA/y=2023/m=09/MEL_Septiembre_2023/TK848/pool-files/pool-cc-cd.xlsx',
 'FAENAS/ESCONDIDA/y=2023/m=09/MEL_Septiembre_2023/TK848/pool-files/pool-cc-cl.xlsx',
 'FAENAS/ESCONDIDA/y=2023/m=09/MEL_Septiembre_2023/TK848/pool-files/pool-cc-cms.xlsx',
 'FAENAS/ESCONDIDA/y=2023/m=09/MEL_Septiembre_2023/TK848/pool-files/pool-cc-mp.xlsx',
 'FAENAS/ESCONDIDA/y=2023/m=09/MEL_Septiembre_2023/TK848/pool-files/pool-cc-mt.xlsx',
 'FAENAS/ESCONDIDA/y=2023/m=09/MEL_Septiembre_2023/TK848/pool-files/pool-cc-st.xlsx']

In [None]:
def process_excel_files(connection_string: str, container_name: str, folder_name: str) -> None:
    """Download every ``.xlsx`` blob under a prefix and print its A1 cell.

    Args:
        connection_string: Storage connection string passed to
            ``BlobServiceClient.from_connection_string``.
        container_name: Container whose blobs are listed.
        folder_name: Blob-name prefix; only blobs whose names start with it
            are listed.

    Blobs whose name does not end in ``".xlsx"`` are skipped. Each remaining
    blob is read fully into memory, opened with openpyxl (``data_only=True``),
    and the value of cell A1 on the active sheet is printed. Nothing is
    returned.
    """
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)

    for blob in container_client.list_blobs(name_starts_with=folder_name):
        if not blob.name.endswith(".xlsx"):
            continue
        print(f"Processing file: {blob.name}")

        # Download the blob content into memory.
        content = container_client.get_blob_client(blob.name).download_blob().readall()

        # openpyxl needs a file-like object, so wrap the bytes in BytesIO.
        workbook = openpyxl.load_workbook(BytesIO(content), data_only=True)
        try:
            active_sheet = workbook.active
            # Process the Excel file (example: print the value of cell A1)
            print(f"Value in A1: {active_sheet['A1'].value}")
        finally:
            # Close the workbook even if reading the sheet raises.
            workbook.close()