In [0]:
# Input (bronze) and output (silver) Delta locations for this enhancement step.
source = '/Volumes/workspace/default/my_volume/Sentiment_Project/enhancements/bronze_tweets'
new_silver_path = '/Volumes/workspace/default/my_volume/Sentiment_Project/enhancements/silver_tweets'

# Read the bronze Delta data that the rest of the notebook transforms.
bronze_df = spark.read.load(source, format='delta')

# Preview only the first 10 rows rather than rendering the whole table.
display(bronze_df.limit(10))

In [0]:
# Brand -> lowercase keywords whose presence in a tweet marks a mention of that brand.
# Keywords are matched as whole words against lowercased text by detect_brands.
#
# NOTE(review): "android" is listed under both "google" and "samsung", so a tweet
# that mentions Android is tagged with both brands -- confirm this is intended.
# NOTE(review): entries such as "appple", "googl" and "samsing" look like deliberate
# misspelling variants -- presumably kept to catch typos; verify before removing.
brand_keywords = {
    "apple": [
        "iphone", "ipad", "macbook", "apple", "macos", "ios", "applestore",
        "tim cook", "apple inc", "mac pro", "airpods", "appple", "mac",
    ],
    "google": [
        "google", "android", "pixel", "chrome", "youtube", "gmail", "googledrive",
        "google cloud", "googl", "android phone", "youtube video",
    ],
    "microsoft": [
        "microsoft", "windows", "surface", "xbox", "azure", "office365", "teams",
        "msft", "outlook", "word", "excel",
    ],
    "samsung": [
        "samsung", "galaxy", "note", "fold", "zflip", "android", "oneui",
        "samsung phone", "galaxy s", "galaxy note", "samsing",
    ],
    "amazon": [
        "amazon", "alexa", "kindle", "primevideo", "aws", "echo dot",
        "amazon prime", "amazon echo", "amazon com",
    ],
}

In [0]:
import re  # imported in this cell so the function does not rely on an import made in a later cell


def detect_brands(text, keywords_by_brand=None):
    """Return the brands whose keywords occur as whole words in ``text``.

    Args:
        text: The tweet text to scan, or ``None``.
        keywords_by_brand: Optional mapping of brand name to an iterable of
            keywords. When omitted, the notebook-level ``brand_keywords``
            mapping is used.

    Returns:
        A list of brand names, in the mapping's iteration order, with each
        brand appearing at most once. An empty list when ``text`` is ``None``
        or nothing matches.
    """
    if text is None:
        return []
    if keywords_by_brand is None:
        keywords_by_brand = brand_keywords

    text_lower = text.lower()
    detected_brands = []
    for brand, keywords in keywords_by_brand.items():
        for keyword in keywords:
            words = keyword.lower().split()
            if not words:
                # An empty keyword would compile to r'\b\b' and match almost any text.
                continue
            # Word-boundary anchors match whole words only ("mac" must not hit "machine").
            # Words of a multi-word keyword are joined with \s+ so that runs of spaces
            # (e.g. left behind when punctuation is replaced by spaces) still match.
            pattern = r'\b' + r'\s+'.join(re.escape(word) for word in words) + r'\b'
            if re.search(pattern, text_lower):
                detected_brands.append(brand)
                break  # one hit is enough; avoid listing the brand twice

    return detected_brands

In [0]:
# `udf` is imported explicitly rather than relying on the notebook environment
# to have already placed that name in scope.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, ArrayType

# Spark UDF wrapper around detect_brands; the declared return type is array<string>.
detect_brands_udf = udf(detect_brands, ArrayType(StringType()))

**Cleaning**

In [0]:
from pyspark.sql.functions import col, regexp_replace, lower, when, size
import re

# Lowercase the tweet text and replace every non-alphanumeric character with a space.
cleaned_text_expr = regexp_replace(lower(col('text')), '[^a-zA-Z0-9]', ' ')

# 1 when at least one brand was detected in the cleaned text, otherwise 0.
has_brand_expr = when(size(col('Detected_brands')) > 0, 1).otherwise(0)

enhanced_df = (
    bronze_df
    .withColumn('Cleaned_text', cleaned_text_expr)
    .withColumn('Detected_brands', detect_brands_udf(col('Cleaned_text')))
    .select(
        col('Cleaned_text'),
        col('Detected_brands'),
        # Age_group and Time_category are straight copies of the bronze columns
        # under new names.
        col('Age_of_User').alias('Age_group'),
        col('Time_of_Tweet').alias('Time_category'),
        has_brand_expr.alias('Has_Brand_mention'),
    )
)

In [0]:
# Show only the rows in which at least one brand was detected.
display(enhanced_df.where(col('Has_Brand_mention') > 0))

In [0]:
# Persist the enhanced data as Delta, replacing whatever is already at the silver path.
enhanced_df.write.save(new_silver_path, format='delta', mode='overwrite')