In [10]:
import os
# Value is discarded: only the last expression of a cell is displayed.
os.getcwd()
# Raw string so the backslash in the Windows path is taken literally;
# "\p" in a normal string literal is an invalid escape sequence and triggers
# a DeprecationWarning (SyntaxWarning on Python 3.12+).
os.chdir(r"H:\pyspark_advanced-coding_interview")
# Display the new working directory to confirm the change.
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

# Count the occurrences of a character in a string

# PySpark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

# Start (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("CharacterCount")
    .getOrCreate()
)

# Two nullable string columns: ID and Text
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in ("ID", "Text")]
)

# Sample rows as (ID, Text) pairs
data = [
    ("1", "hello world"),
    ("2", "python programming"),
    ("3", "data science with spark"),
    ("4", "hello again"),
    ("5", "the quick brown fox"),
]

# Build the DataFrame and print it without truncating the Text column
df = spark.createDataFrame(data, schema)
df.show(truncate=False)


+---+-----------------------+
|ID |Text                   |
+---+-----------------------+
|1  |hello world            |
|2  |python programming     |
|3  |data science with spark|
|4  |hello again            |
|5  |the quick brown fox    |
+---+-----------------------+



In [2]:
import re

from pyspark.sql.functions import length, regexp_replace

# Character to count
char_to_count = "o"

# regexp_replace interprets its pattern as a regular expression, so the
# character is escaped first. Without this, metacharacters such as ".", "*"
# or "(" would match the wrong text or raise a pattern-syntax error.
char_pattern = re.escape(char_to_count)

# Count occurrences by comparing lengths: deleting every occurrence of the
# character shortens the string by exactly the number of occurrences.
df_with_count = df.withColumn(
    "Count",
    length("Text") - length(regexp_replace("Text", char_pattern, "")),
)
df_with_count.show(truncate=False)


+---+-----------------------+-----+
|ID |Text                   |Count|
+---+-----------------------+-----+
|1  |hello world            |2    |
|2  |python programming     |2    |
|3  |data science with spark|0    |
|4  |hello again            |1    |
|5  |the quick brown fox    |2    |
+---+-----------------------+-----+



In [3]:
import re

from pyspark.sql.functions import size, split

# Count occurrences using split and size: splitting on the character yields
# one more piece than there are occurrences, hence the "- 1".
# split() treats its pattern as a regular expression, so the character is
# escaped to keep metacharacters such as "." or "|" literal.
df_with_split_count = df.withColumn(
    "Count",
    size(split("Text", re.escape(char_to_count))) - 1,
)
df_with_split_count.show(truncate=False)


+---+-----------------------+-----+
|ID |Text                   |Count|
+---+-----------------------+-----+
|1  |hello world            |2    |
|2  |python programming     |2    |
|3  |data science with spark|0    |
|4  |hello again            |1    |
|5  |the quick brown fox    |2    |
+---+-----------------------+-----+



# Spark SQL

In [4]:
# Register the DataFrame as a temporary view named "TextData" (replacing any
# existing view with that name) so it can be referenced from Spark SQL queries
df.createOrReplaceTempView("TextData")


In [5]:
# SQL version of the length-difference technique: removing every 'o' from Text
# and subtracting the shortened length from the original length gives the
# number of 'o' characters. The character is hard-coded in the query text;
# REGEXP_REPLACE treats it as a regular expression, so a metacharacter would
# need escaping.
sql_query = """
SELECT ID, Text, 
       LENGTH(Text) - LENGTH(REGEXP_REPLACE(Text, 'o', '')) AS Count
FROM TextData
"""

# Run the query against the TextData view and print the result without
# truncating the Text column
result_sql = spark.sql(sql_query)
result_sql.show(truncate=False)


+---+-----------------------+-----+
|ID |Text                   |Count|
+---+-----------------------+-----+
|1  |hello world            |2    |
|2  |python programming     |2    |
|3  |data science with spark|0    |
|4  |hello again            |1    |
|5  |the quick brown fox    |2    |
+---+-----------------------+-----+



In [6]:
# SQL version of the split/size technique: splitting Text on 'o' produces one
# more piece than there are 'o' characters, so SIZE(...) - 1 is the count.
# The character is hard-coded in the query text; SPLIT treats it as a regular
# expression, so a metacharacter would need escaping.
sql_query_split = """
SELECT ID, Text, 
       SIZE(SPLIT(Text, 'o')) - 1 AS Count
FROM TextData
"""

# Run the query against the TextData view and print the result without
# truncating the Text column
result_sql_split = spark.sql(sql_query_split)
result_sql_split.show(truncate=False)


+---+-----------------------+-----+
|ID |Text                   |Count|
+---+-----------------------+-----+
|1  |hello world            |2    |
|2  |python programming     |2    |
|3  |data science with spark|0    |
|4  |hello again            |1    |
|5  |the quick brown fox    |2    |
+---+-----------------------+-----+



# Python

In [7]:
# Sample sentences to scan
strings = [
    "hello world",
    "python programming",
    "data science with spark",
    "hello again",
    "the quick brown fox",
]

# The single character whose occurrences are tallied
char_to_count = "o"

# str.count gives the number of non-overlapping occurrences in each string
counts = list(map(lambda text: text.count(char_to_count), strings))
print(counts)  # Output: [2, 2, 0, 1, 2]


[2, 2, 0, 1, 2]


In [8]:
from collections import Counter

# Tally every character of each string with collections.Counter, then look up
# the one of interest (a missing key yields 0 rather than raising KeyError)
for s in strings:
    counter = Counter()
    counter.update(s)
    print(
        f"'{char_to_count}' appears {counter[char_to_count]} times in '{s}'"
    )


'o' appears 2 times in 'hello world'
'o' appears 2 times in 'python programming'
'o' appears 0 times in 'data science with spark'
'o' appears 1 times in 'hello again'
'o' appears 2 times in 'the quick brown fox'


In [9]:
# Manual tally with str.find(): keep searching from just past the previous hit
# until find() reports -1 (no further match)
for s in strings:
    count = 0
    search_from = 0
    while True:
        hit = s.find(char_to_count, search_from)
        if hit == -1:
            break
        count += 1
        search_from = hit + 1
    print(f"'{char_to_count}' appears {count} times in '{s}'")


'o' appears 2 times in 'hello world'
'o' appears 2 times in 'python programming'
'o' appears 0 times in 'data science with spark'
'o' appears 1 times in 'hello again'
'o' appears 2 times in 'the quick brown fox'
