In [None]:
#!/usr/bin/env python3
# PySpark Promotion Pipeline (completely flat: no def, no env)

import os, sys, logging, smtplib
from datetime import datetime
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, count, when, trim, upper, to_date, lit
import pandas as pd

# ===================== LOGGING =====================
# Route every log record both to a timestamped file under LOG_DIR and to
# stdout (so it also shows up in the notebook cell output).
APP_NAME = "PromotionPipeline"
LOG_DIR = "logs"
os.makedirs(LOG_DIR, exist_ok=True)

# Apply strftime to the file name only and join it to LOG_DIR afterwards, so a
# '%' in the directory path can never be interpreted as a strftime directive.
log_filename = os.path.join(
    LOG_DIR, datetime.now().strftime("promotion_pipeline_%Y%m%d_%H%M%S.log")
)

# Drop handlers left over from a previous run of this cell. Each one is also
# closed: removeHandler() alone leaves the previous log file open, leaking one
# file handle per re-run.
for h in logging.root.handlers[:]:
    logging.root.removeHandler(h)
    h.close()
logging.basicConfig(
    level=logging.INFO,
    handlers=[logging.FileHandler(log_filename, encoding="utf-8"), logging.StreamHandler(sys.stdout)],
    format="%(asctime)s - %(levelname)s - %(message)s"
)
# Named logger used by every later cell; it propagates to the root handlers
# configured above.
logger = logging.getLogger(APP_NAME)

In [None]:
# Export the Excel workbook Promotion_data.xlsx to CSV
import os

# USERPROFILE is only defined on Windows; fall back to the generic home
# directory instead of raising KeyError on other platforms.
home_dir = os.environ.get("USERPROFILE") or os.path.expanduser("~")
xls_path = os.path.join(home_dir, "Documents", "BDA2", "data", "Promotion_data.xlsx")
out_dir  = os.path.join(home_dir, "Documents", "BDA2", "data", "csv")

print(xls_path)
print(out_dir)

os.makedirs(out_dir, exist_ok=True)
base_name = os.path.splitext(os.path.basename(xls_path))[0]

# Open the workbook in a `with` block so its file handle is released once all
# sheets have been exported.
with pd.ExcelFile(xls_path) as xls:
    for sheet_idx, sheet in enumerate(xls.sheet_names):
        df = pd.read_excel(xls, sheet_name=sheet)
        if sheet_idx == 0:
            # The first sheet is saved under the workbook's name
            # (Promotion_data.csv), which is the file the Spark load cell reads.
            out_file = os.path.join(out_dir, f"{base_name}.csv")
        else:
            # Additional sheets get their own file. Writing every sheet to
            # <base_name>.csv would silently overwrite the earlier ones.
            safe_sheet = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in str(sheet))
            out_file = os.path.join(out_dir, f"{base_name}_{safe_sheet}.csv")
        df.to_csv(out_file, index=False)
        print(f"Saved {out_file}")
    

In [None]:
# Export the Excel workbook Promotion_new_data.xlsx to CSV

# USERPROFILE is only defined on Windows; fall back to the generic home
# directory instead of raising KeyError on other platforms.
home_dir = os.environ.get("USERPROFILE") or os.path.expanduser("~")
xls_path = os.path.join(home_dir, "Documents", "BDA2", "data", "Promotion_new_data.xlsx")
out_dir  = os.path.join(home_dir, "Documents", "BDA2", "data", "csv")

print(xls_path)
print(out_dir)

os.makedirs(out_dir, exist_ok=True)
base_name = os.path.splitext(os.path.basename(xls_path))[0]

# Open the workbook in a `with` block so its file handle is released once all
# sheets have been exported.
with pd.ExcelFile(xls_path) as xls:
    for sheet_idx, sheet in enumerate(xls.sheet_names):
        df = pd.read_excel(xls, sheet_name=sheet)
        if sheet_idx == 0:
            # The first sheet is saved under the workbook's name
            # (Promotion_new_data.csv), which is the file the Spark load cell reads.
            out_file = os.path.join(out_dir, f"{base_name}.csv")
        else:
            # Additional sheets get their own file. Writing every sheet to
            # <base_name>.csv would silently overwrite the earlier ones.
            safe_sheet = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in str(sheet))
            out_file = os.path.join(out_dir, f"{base_name}_{safe_sheet}.csv")
        df.to_csv(out_file, index=False)
        print(f"Saved {out_file}")
    

In [None]:
# ===================== CREATE SPARK SESSION =====================
# Obtain the Spark session for this notebook.
# The builder is labelled with APP_NAME (the name shown in the Spark UI/logs);
# getOrCreate() hands back the already-active session when there is one and
# only starts a new session otherwise.
session_builder = SparkSession.builder.appName(APP_NAME)
spark = session_builder.getOrCreate()

# Record in the pipeline log that the session is available
logger.info("Spark session created.")


In [None]:
# ===================== LOAD CSV: OLD (each CSV -> separate DataFrame) =====================
# USERPROFILE is only defined on Windows; fall back to the generic home
# directory instead of raising KeyError on other platforms.
home_dir = os.environ.get("USERPROFILE") or os.path.expanduser("~")
csv_path = os.path.join(home_dir, "Documents", "BDA2", "data", "csv", "Promotion_data.csv")

print(csv_path)


logger.info(f"[OLD] Starting to load CSV: {csv_path}")

# Read the exported CSV into a Spark DataFrame.
df = (
    spark.read.format("csv")
    .option("header", "true")       # first row as header
    .option("inferSchema", "true")  # let Spark detect column types
    .load(csv_path)
)

# row_count triggers a Spark job (full read); col_count only inspects the schema.
row_count = df.count()
col_count = len(df.columns)

logger.info(f"[OLD] Loaded CSV {csv_path} -> rows={row_count}, cols={col_count}")



In [None]:
# ===================== LOAD CSV: NEW (each CSV -> separate DataFrame) =====================
# USERPROFILE is only defined on Windows; fall back to the generic home
# directory instead of raising KeyError on other platforms.
home_dir = os.environ.get("USERPROFILE") or os.path.expanduser("~")
csv_path_new = os.path.join(home_dir, "Documents", "BDA2", "data", "csv", "Promotion_new_data.csv")

logger.info(f"[NEW] Starting to load CSV: {csv_path_new}")

# Read the exported CSV into a Spark DataFrame.
df_new = (
    spark.read.format("csv")
    .option("header", "true")       # first row as header
    .option("inferSchema", "true")  # let Spark detect column types
    .load(csv_path_new)
)

# row_count_new triggers a Spark job (full read); col_count_new only inspects the schema.
row_count_new = df_new.count()
col_count_new = len(df_new.columns)

logger.info(f"[NEW] Loaded CSV {csv_path_new} -> rows={row_count_new}, cols={col_count_new}")


In [None]:
# Preview the old-data DataFrame: first 20 rows with long cell values
# truncated (show()'s defaults, spelled out explicitly).
df.show(n=20, truncate=True)


In [None]:
# ===================== DATA VALIDATION =====================
# Runs a series of read-only checks on `df` (the old-data Spark DataFrame) and
# reports each result through `logger`. Nothing is modified or filtered here.
# `when` is imported here too, so this cell does not depend on another cell's import.
from pyspark.sql.functions import col, isnan, count, lit, when
from pyspark.sql.functions import abs as spark_abs
logger.info("Starting data validation for Promotion_data.csv (Spark DataFrame)")

# 1. Null counts per column
# isnan() is only meaningful for float/double columns. Applying it to string
# columns forces an implicit cast to double, which fails under ANSI mode, and
# it is rejected outright for date/timestamp columns. All other types are
# therefore checked with isNull() only.
float_cols = {name for name, dtype in df.dtypes if dtype in ("float", "double")}
null_exprs = [
    count(
        when((col(c).isNull() | isnan(col(c))) if c in float_cols else col(c).isNull(), c)
    ).alias(c)
    for c in df.columns
]
null_counts = df.select(null_exprs).collect()[0].asDict()
for c, n in null_counts.items():
    if n > 0:
        logger.warning(f"[NULL] Column '{c}' has {n} nulls")
    else:
        logger.info(f"[NULL] Column '{c}' has no nulls")

# 2. Negative values check for numeric columns
numeric_cols = ["Price", "Discount", "Units", "Sales $", "Gross Margin $", "# Transactions that contained the product"]
for colname in numeric_cols:
    negatives = df.filter(col(colname) < 0).count()
    if negatives > 0:
        logger.error(f"[NEGATIVE] Column '{colname}' has {negatives} negative values")
    else:
        logger.info(f"[NEGATIVE] Column '{colname}' all values non-negative")

# 3. Discount between 0 and 1
invalid_discounts = df.filter((col("Discount") < 0) | (col("Discount") > 1)).count()
if invalid_discounts > 0:
    logger.error(f"[DISCOUNT] Found {invalid_discounts} invalid discount values")
else:
    logger.info("[DISCOUNT] All discount values between 0 and 1")

# 4. On Flyer? only Yes/No
# Note: rows where the column is NULL are not counted here (NULL is neither
# in nor out of the list); they are reported by the null check in step 1.
invalid_on_flyer = df.filter(~col("On Flyer?").isin("Yes", "No")).count()
if invalid_on_flyer > 0:
    logger.error(f"[ON FLYER] Found {invalid_on_flyer} invalid values in 'On Flyer?' column")
else:
    logger.info("[ON FLYER] All values are Yes/No")

# 5. Year range check
invalid_years = df.filter((col("Year") < 2000) | (col("Year") > 2030)).count()
if invalid_years > 0:
    logger.error(f"[YEAR] Found {invalid_years} invalid year values")
else:
    # The range separator was mis-encoded in this message; restored to an en dash.
    logger.info("[YEAR] All years are within 2000–2030")

# 6. Week number between 1 and 53
invalid_weeks = df.filter((col("week number") < 1) | (col("week number") > 53)).count()
if invalid_weeks > 0:
    logger.error(f"[WEEK] Found {invalid_weeks} invalid week numbers")
else:
    # The range separator was mis-encoded in this message; restored to an en dash.
    logger.info("[WEEK] All week numbers between 1–53")


# 7. Sales consistency check: Sales $ ≈ Units * Price * (1 - Discount)
# Rows are flagged when the absolute difference exceeds 0.01.
expected_sales = (col("Units") * col("Price") * (lit(1) - col("Discount")))
sales_mismatch = df.filter(spark_abs(col("Sales $") - expected_sales) > 1e-2).count()
if sales_mismatch > 0:
    logger.warning(f"[SALES] {sales_mismatch} rows where Sales $ != Units*Price*(1-Discount)")
else:
    logger.info("[SALES] All rows pass sales consistency check")

logger.info("Data validation finished for Promotion_data.csv (Spark DataFrame)")



In [None]:
# ===================== DATA TRANSFORM OPTION 1 USING SPARK =====================
# Builds `df_clean` from `df` with the DataFrame API: cleaned column names plus
# derived columns (flags, percentages, week start date, sales category).
# NOTE: importing `round` here shadows Python's built-in round() for the rest
# of the notebook; inside this cell `round` is the Spark column function.
from pyspark.sql.functions import (
    col, trim, upper, round, when,
    to_date, concat_ws, lit, expr
)

logger.info("Starting data transformations on Promotion_data.csv")

# 1. Clean column names (replace spaces with underscores)
# Only spaces are replaced, so "On Flyer?" becomes "On_Flyer?" ('?' is kept).
df_clean = df.toDF(*[c.strip().replace(" ", "_") for c in df.columns])
logger.info("Step 1: Column names cleaned (spaces replaced with underscores)")

# 2. Standardize text columns
df_clean = df_clean.withColumn("Product", trim(upper(col("Product"))))
df_clean = df_clean.withColumn("On_Flyer", when(col("On_Flyer?") == "Yes", lit(1)).otherwise(lit(0)))
logger.info("Step 2: Standardized text columns (Product uppercased, On_Flyer flag set)")

# 3. Create proper date from Year + week number
# The previous to_date(concat_ws("-W", ...), "yyyy-ww") could not work: the
# built string looks like "2020-W5", which the pattern "yyyy-ww" never matches,
# and Spark 3+ rejects week-based pattern letters for parsing altogether.
# Instead, compute the Monday of the given week arithmetically: the Monday of
# the week containing 4 January (week 1), plus (week_number - 1) * 7 days.
# NOTE(review): this assumes ISO-8601 week numbering — confirm against the data source.
df_clean = df_clean.withColumn("Year", col("Year").cast("int"))
df_clean = df_clean.withColumn("week_number", col("week_number").cast("int"))
df_clean = df_clean.withColumn(
    "Week_Start_Date",
    expr("date_add(trunc(make_date(Year, 1, 4), 'week'), (week_number - 1) * 7)")
)
logger.info("Step 3: Derived Week_Start_Date from Year and week_number")

# 4. Calculate normalized discount percentage
df_clean = df_clean.withColumn("Discount_Percent", round(col("Discount") * 100, 2))
logger.info("Step 4: Discount_Percent column created")

# 5. Calculate unit price after discount
df_clean = df_clean.withColumn("Final_Unit_Price", round(col("Price") * (1 - col("Discount")), 2))
logger.info("Step 5: Final_Unit_Price column created")

# 6. Calculate Gross Margin % relative to Sales
# Guard against zero sales so the division yields NULL instead of failing under
# ANSI mode; this mirrors NULLIF(`Sales_$`, 0) in the Spark SQL variant.
df_clean = df_clean.withColumn(
    "Gross_Margin_Percent",
    when(
        col("Sales_$") != 0,
        round((col("Gross_Margin_$") / col("Sales_$")) * 100, 2)
    )
)
logger.info("Step 6: Gross_Margin_Percent column created")

# 7. Flag transactions with unusually high discounts (>50%)
df_clean = df_clean.withColumn("High_Discount_Flag", when(col("Discount") > 0.5, lit(1)).otherwise(lit(0)))
logger.info("Step 7: High_Discount_Flag column created")

# 8. Categorize sales volume: >= 100 units High, >= 50 Medium, otherwise Low
df_clean = df_clean.withColumn(
    "Sales_Category",
    when(col("Units") >= 100, lit("High"))
    .when(col("Units") >= 50, lit("Medium"))
    .otherwise(lit("Low"))
)
logger.info("Step 8: Sales_Category column created (Low/Medium/High)")

logger.info("All data transformations completed successfully")



In [None]:
# ===================== DATA TRANSFORM OPTION 2 USING SPARK SQL =====================
# Applies one SQL transformation to both the old (`df`) and the new (`df_new`)
# data set, producing `df_transformed` and `df_transformed_new`.

# The transformation is identical for both data sets, so the SQL text is kept
# once and only the view name is substituted.
TRANSFORM_SQL = """
SELECT
    Year,
    week_number,
    UPPER(TRIM(Product)) AS Product,               -- normalize product names
    Price,
    Discount,
    ROUND(Discount * 100, 2) AS Discount_Percent,  -- discount in %
    ROUND(Price * (1 - Discount), 2) AS Final_Unit_Price,
    Units,
    `Sales_$`,
    `Gross_Margin_$`,
    ROUND((`Gross_Margin_$` / NULLIF(`Sales_$`,0)) * 100, 2) AS Gross_Margin_Percent,
    CASE WHEN On_Flyer = 'Yes' THEN 1 ELSE 0 END AS On_Flyer_Flag,
    CASE WHEN Discount > 0.5 THEN 1 ELSE 0 END AS High_Discount_Flag,
    CASE 
        WHEN Units >= 100 THEN 'High'
        WHEN Units >= 50 THEN 'Medium'
        ELSE 'Low'
    END AS Sales_Category
FROM {view_name}
"""

# ---------------------------- OLD DATA ----------------------------
# ==================== REGISTER TEMP VIEW ====================
# Clean column names (spaces -> underscores, '?' removed)
df_sql = df.toDF(*[c.strip().replace(" ", "_").replace("?", "") for c in df.columns])
df_sql.createOrReplaceTempView("promotion_data")
logger.info("Temporary SQL view 'promotion_data' created")

# ==================== TRANSFORMATION WITH SPARK SQL ====================
query = TRANSFORM_SQL.format(view_name="promotion_data")
df_transformed = spark.sql(query)
logger.info("Transformation query executed successfully")

# ==================== SHOW RESULTS ====================
df_transformed.show(20, truncate=False)
logger.info("Sample transformed rows displayed")

# ---------------------------- NEW DATA ----------------------------
# ==================== REGISTER TEMP VIEW ====================
# Clean column names (spaces -> underscores, '?' removed).
# The names are derived from df_new's own columns. Using df.columns here would
# relabel df_new positionally with the OLD file's headers, silently mislabelling
# data whenever the two files differ in column order or count.
df_sql = df_new.toDF(*[c.strip().replace(" ", "_").replace("?", "") for c in df_new.columns])
df_sql.createOrReplaceTempView("promotion_data_new")
logger.info("Temporary SQL view 'promotion_data_new' created")

# ==================== TRANSFORMATION WITH SPARK SQL ====================
query = TRANSFORM_SQL.format(view_name="promotion_data_new")
df_transformed_new = spark.sql(query)
logger.info("Transformation query executed successfully")

# ==================== SHOW RESULTS ====================
df_transformed_new.show(20, truncate=False)
logger.info("Sample transformed rows displayed")

In [None]:
# ==================== APPEND OLD + NEW ====================
# Stack the transformed old rows and the transformed new rows into a single
# DataFrame; unionByName lines the columns up by name rather than by position.
df_combined = df_transformed.unionByName(df_transformed_new)
logger.info("Combined DataFrame created by appending old and new data")

# ==================== SHOW RESULTS ====================
# Print the first 20 combined rows without truncating cell values.
df_combined.show(n=20, truncate=False)
logger.info("Sample combined rows displayed")

In [None]:
# ===================== JDBC WRITE =====================
# Writes `df_combined` to SQL Server over JDBC and records the outcome in
# `write_ok` (True on success, False on failure).
JDBC_URL    = "jdbc:sqlserver://localhost:1433;databaseName=datahub;encrypt=true;trustServerCertificate=true"
# Credentials can be supplied through the JDBC_USER / JDBC_PWD environment
# variables; the literals are only fallbacks that keep existing setups working.
# NOTE(review): hard-coded credentials should be removed from the notebook.
JDBC_USER   = os.environ.get("JDBC_USER", "sa")
JDBC_PWD    = os.environ.get("JDBC_PWD", "user1")
JDBC_DRIVER = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
JDBC_TABLE  = "dbo.PromotionTable"
JDBC_MODE   = "overwrite"   # replaces the existing contents of JDBC_TABLE

props = {"user": JDBC_USER, "password": JDBC_PWD, "driver": JDBC_DRIVER}
try:
    df_combined.write.jdbc(url=JDBC_URL, table=JDBC_TABLE, mode=JDBC_MODE, properties=props)
except Exception as e:
    logger.error(f"JDBC write failed: {e}", exc_info=True)
    write_ok = False
else:
    write_ok = True
    # Report the size of the DataFrame that was actually written. The earlier
    # message used `row_count`, which is the row count of the OLD CSV only.
    # Counting happens outside the try block so a failure here is not
    # mislabelled as a JDBC write failure.
    logger.info(f"Wrote {df_combined.count()} rows to {JDBC_TABLE}")

🛠 Step-by-step: Enable 2FA + Create App Password

Here’s what you do:

Enable 2-Step Verification (if you haven’t already)

Go to your Google Account > Security

Under “How you sign in to Google,” find 2-Step Verification and turn it on

Follow the prompts (enter phone number, etc.)

After this, your account requires a second factor to log in. (Sources: CCTV Camera World, Saleshandy)

Go to App Passwords page

In Google Account > Security > “App passwords”

Or navigate directly: https://myaccount.google.com/apppasswords

You may be asked to sign in again for security. (Sources: Reolink Support, Google Help)

Generate a new App Password

Under “Select app,” choose “Other (Custom name)”

Give it a label (e.g. PromotionPipelineEmail)

Click Generate

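Google shows a 16-character password, displayed in four groups of four characters; the spaces are only for readability and are not part of the password. Copy it. (Sources: Reolink Support, Saleshandy)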

Use the App Password in your SMTP config

In your Python / email-sending code, replace your normal Gmail password with this App Password

Example: if Google displays "abcd wxyz pqrs uv12", set MAIL_PWD = "abcdwxyzpqrsuv12" (the same 16 characters, entered without spaces)

Test sending email

Try sending a simple test email using SMTP with TLS

If it works, you're good. If it fails: check port, host, firewall, etc.

In [None]:
# ===================== EMAIL WITH LOG ATTACHED =====================
# Sends a status email over SMTP (STARTTLS) with the pipeline log file attached.
# Failures to attach or send are logged and do not raise.
import smtplib, os, logging, ssl
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication

# --- Config (edit these) ---
# MAIL_USER / MAIL_PWD / MAIL_TO can be supplied through environment variables
# of the same name; the literals are only fallbacks that keep existing setups
# working.
# NOTE(review): the hard-coded app password should be revoked and removed from
# the notebook.
MAIL_HOST = "smtp.gmail.com"         # Gmail: smtp.gmail.com | Office 365: smtp.office365.com
MAIL_PORT = 587
MAIL_USER = os.environ.get("MAIL_USER", "abc@gmail.com")        # sender address (must be the auth user on many providers)
MAIL_PWD  = os.environ.get("MAIL_PWD", "kfah osuo oxzp aide")   # app password / SMTP password (not your normal login)
MAIL_TO   = os.environ.get("MAIL_TO", "abc@gmail.com")          # comma-separated for multiple: "a@x.com,b@y.com"
SMTP_TIMEOUT_SECONDS = 30            # give up instead of hanging the notebook if the server is unreachable

SUBJECT   = "Promotion Pipeline Status"
BODY_TEXT = "The pipeline completed. See attached log."

# The JDBC cell records its outcome in `write_ok`. When that cell reported a
# failure, say so instead of announcing a completed pipeline. If `write_ok` is
# undefined (cell not run) or True, the original wording is kept.
if globals().get("write_ok") is False:
    SUBJECT   = "Promotion Pipeline Status: FAILED"
    BODY_TEXT = "The pipeline failed while writing to the database. See attached log."

# --- Log file to attach (uses your earlier variable 'log_filename') ---
# If you used a different variable name, set it here:
# log_filename = r"C:\path\to\promotion_pipeline_YYYYMMDD_HHMMSS.log"

logger.info("Preparing email with log attachment")

if not (MAIL_USER and MAIL_PWD and MAIL_TO):
    logger.warning("Email config missing (MAIL_USER/MAIL_PWD/MAIL_TO). Skipping email.")
else:
    msg = MIMEMultipart()
    msg["Subject"] = SUBJECT
    msg["From"] = MAIL_USER
    msg["To"] = MAIL_TO

    # body
    msg.attach(MIMEText(BODY_TEXT, "plain", _charset="utf-8"))

    # attach log if present
    if os.path.exists(log_filename):
        try:
            with open(log_filename, "rb") as f:
                part = MIMEApplication(f.read(), Name=os.path.basename(log_filename))
                part.add_header("Content-Disposition", f'attachment; filename="{os.path.basename(log_filename)}"')
                msg.attach(part)
            logger.info(f"Attached log file: {log_filename}")
        except Exception as e:
            logger.error(f"Failed to attach log file: {e}", exc_info=True)
    else:
        logger.warning(f"Log file not found, skipping attachment: {log_filename}")

    # send
    try:
        # Verify the server certificate during STARTTLS; calling starttls()
        # without a context does not validate the certificate or host name.
        tls_context = ssl.create_default_context()
        with smtplib.SMTP(MAIL_HOST, MAIL_PORT, timeout=SMTP_TIMEOUT_SECONDS) as server:
            server.starttls(context=tls_context)
            server.login(MAIL_USER, MAIL_PWD)
            server.send_message(msg)
        logger.info("Email sent successfully.")
    except Exception as e:
        logger.error(f"Failed to send email: {e}", exc_info=True)
