<a href="https://colab.research.google.com/github/ankitarm/PySpark/blob/main/Chatgpt_Pyspark_questions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Question 1: Customer Transactions Analysis
Write a PySpark program to find the first transaction date and total transaction amount for each customer.


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date
# Start (or reuse) a local Spark session for this notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("CustomerTransactions")
    .getOrCreate()
)
from pyspark.sql import Row

# Raw sample records as (customer_id, transaction_date, amount) tuples.
_transaction_records = [
    (101, "2023-01-10", 150.0),
    (102, "2023-01-12", 200.0),
    (101, "2023-01-15", 100.0),
    (101, "2023-01-05", 50.0),
    (102, "2023-01-14", 300.0),
]

# Wrap every record in a Row so the column names travel with the values.
data = [
    Row(customer_id=customer, transaction_date=day, amount=value)
    for customer, day, value in _transaction_records
]

# Build the DataFrame; column names and types are inferred from the Rows.
transactions = spark.createDataFrame(data)

# Preview the input rows.
transactions.show()


+-----------+----------------+------+
|customer_id|transaction_date|amount|
+-----------+----------------+------+
|        101|      2023-01-10| 150.0|
|        102|      2023-01-12| 200.0|
|        101|      2023-01-15| 100.0|
|        101|      2023-01-05|  50.0|
|        102|      2023-01-14| 300.0|
+-----------+----------------+------+



In [12]:
from pyspark.sql.types import DateType

# Two ways to turn the string column into a date. Both calls only print a
# preview; `transactions` itself is not reassigned here.
#   1. to_date(column, "yyyy-MM-dd")
#   2. column.cast(DateType())
parsed_date = to_date(col("transaction_date"), "yyyy-MM-dd")
transactions.withColumn("transaction_date", parsed_date).show()

# Selecting "*" plus the cast column keeps the original column and appends the
# converted one, so this preview shows two columns named transaction_date.
casted_date = col("transaction_date").cast(DateType())
transactions.select("*", casted_date).show()


+-----------+----------------+------+
|customer_id|transaction_date|amount|
+-----------+----------------+------+
|        101|      2023-01-10| 150.0|
|        102|      2023-01-12| 200.0|
|        101|      2023-01-15| 100.0|
|        101|      2023-01-05|  50.0|
|        102|      2023-01-14| 300.0|
+-----------+----------------+------+

+-----------+----------------+------+----------------+
|customer_id|transaction_date|amount|transaction_date|
+-----------+----------------+------+----------------+
|        101|      2023-01-10| 150.0|      2023-01-10|
|        102|      2023-01-12| 200.0|      2023-01-12|
|        101|      2023-01-15| 100.0|      2023-01-15|
|        101|      2023-01-05|  50.0|      2023-01-05|
|        102|      2023-01-14| 300.0|      2023-01-14|
+-----------+----------------+------+----------------+



In [10]:
# Show the DataFrame summary (column names and types); transaction_date is
# still a string at this point because the previews above were not assigned.
display(transactions)

DataFrame[customer_id: bigint, transaction_date: string, amount: double]

In [14]:
# Persist the conversion: replace the string column with its DateType version.
date_column = col("transaction_date").cast(DateType())
transactions = transactions.withColumn("transaction_date", date_column)
transactions.show()

+-----------+----------------+------+
|customer_id|transaction_date|amount|
+-----------+----------------+------+
|        101|      2023-01-10| 150.0|
|        102|      2023-01-12| 200.0|
|        101|      2023-01-15| 100.0|
|        101|      2023-01-05|  50.0|
|        102|      2023-01-14| 300.0|
+-----------+----------------+------+



In [19]:
# Import the functions module under an alias rather than importing `sum` and
# `min` by name, which would shadow Python's built-ins in every later cell.
from pyspark.sql import functions as F

# Per customer: earliest transaction date and total transaction amount.
# The output column names are kept as "transaction_date" and "amount".
(
    transactions
    .groupBy(col("customer_id"))
    .agg(
        F.min(transactions.transaction_date).alias("transaction_date"),
        F.sum("amount").alias("amount"),
    )
    .show()
)

+-----------+----------------+------+
|customer_id|transaction_date|amount|
+-----------+----------------+------+
|        101|      2023-01-05| 300.0|
|        102|      2023-01-12| 500.0|
+-----------+----------------+------+



2. Question 2: Active User Streaks
For each user, find the longest streak of consecutive login days.



In [20]:
from pyspark.sql import Row
from pyspark.sql.functions import to_date, col

# Sample login data as (user_id, login_date) pairs.
_login_records = [
    (1, "2023-01-01"),
    (1, "2023-01-02"),
    (1, "2023-01-04"),
    (1, "2023-01-05"),
    (1, "2023-01-06"),
    (2, "2023-01-01"),
    (2, "2023-01-03"),
    (2, "2023-01-04"),
]

# Wrap each pair in a Row so the column names are carried with the values.
login_data = [Row(user_id=uid, login_date=day) for uid, day in _login_records]

# Build the DataFrame and preview it.
logins = spark.createDataFrame(login_data)
logins.show()

+-------+----------+
|user_id|login_date|
+-------+----------+
|      1|2023-01-01|
|      1|2023-01-02|
|      1|2023-01-04|
|      1|2023-01-05|
|      1|2023-01-06|
|      2|2023-01-01|
|      2|2023-01-03|
|      2|2023-01-04|
+-------+----------+



In [21]:
# Show the DataFrame summary (column names and types); login_date is a string
# here, before the cast in the next cell.
display(logins)

DataFrame[user_id: bigint, login_date: string]

In [23]:
from pyspark.sql.types import DateType
# Replace the string login_date column with its DateType version.
login_day = col("login_date").cast(DateType())
logins = logins.withColumn("login_date", login_day)
logins.show()

+-------+----------+
|user_id|login_date|
+-------+----------+
|      1|2023-01-01|
|      1|2023-01-02|
|      1|2023-01-04|
|      1|2023-01-05|
|      1|2023-01-06|
|      2|2023-01-01|
|      2|2023-01-03|
|      2|2023-01-04|
+-------+----------+



In [24]:
# Show the DataFrame summary again to confirm that login_date is now a date.
display(logins)

DataFrame[user_id: bigint, login_date: date]

We have to use row_number(), which is a window function, so import Window from pyspark.sql.window.


dense_rank() cannot be used here: rows with the same login date would share a rank, so the later count would over-count those entries.

In [26]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Keep a single row per user per day. row_number() gives every row its own
# number, so a repeated login on the same day would otherwise shift the
# numbering and split one run of consecutive days into two.
logins = logins.dropDuplicates(["user_id", "login_date"])

# Number each user's login days in chronological order.
window_row = Window.partitionBy(col("user_id")).orderBy(col("login_date"))

logins = logins.withColumn("row_num", row_number().over(window_row))
logins.show()


+-------+----------+-------+
|user_id|login_date|row_num|
+-------+----------+-------+
|      1|2023-01-01|      1|
|      1|2023-01-02|      2|
|      1|2023-01-04|      3|
|      1|2023-01-05|      4|
|      1|2023-01-06|      5|
|      2|2023-01-01|      1|
|      2|2023-01-03|      2|
|      2|2023-01-04|      3|
+-------+----------+-------+



date_diff() fails here because it requires both arguments to be dates, while row_num is an integer.

In [30]:
from pyspark.sql.functions import date_diff
from pyspark.errors import AnalysisException

# Intentional demonstration: date_diff() needs two DATE arguments, so passing
# the integer row_num is rejected when Spark analyses the expression.
# The error is caught and printed so the notebook still runs top to bottom.
# `logins` is left unchanged, just as it was when the uncaught exception
# aborted the assignment.
try:
    logins.withColumn("Difference", date_diff("login_date", col("row_num").cast("int")))
except AnalysisException as error:
    print(f"AnalysisException: {error}")

AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "date_diff(login_date, CAST(row_num AS INT))" due to data type mismatch: Parameter 2 requires the "DATE" type, however "CAST(row_num AS INT)" has the type "INT".;
'Project [user_id#227L, login_date#240, row_num#255, date_diff(login_date#240, cast(row_num#255 as int)) AS Difference#275]
+- Project [user_id#227L, login_date#240, row_num#255]
   +- Project [user_id#227L, login_date#240, row_num#255, row_num#255]
      +- Window [row_number() windowspecdefinition(user_id#227L, login_date#240 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_num#255], [user_id#227L], [login_date#240 ASC NULLS FIRST]
         +- Project [user_id#227L, login_date#240]
            +- Project [user_id#227L, cast(login_date#228 as date) AS login_date#240]
               +- LogicalRDD [user_id#227L, login_date#228], false


expr("...") lets you write SQL-like expressions directly on a DataFrame; here date_sub(login_date, row_num) subtracts row_num days from login_date.

In [32]:
from pyspark.sql.functions import date_diff, expr
# Shift every login date back by its row number. Dates that are consecutive
# for a user end up with the same shifted value, which labels their streak.
streak_anchor = expr("date_sub(login_date,row_num)")
logins = logins.withColumn("Difference", streak_anchor)
logins.show()

+-------+----------+-------+----------+
|user_id|login_date|row_num|Difference|
+-------+----------+-------+----------+
|      1|2023-01-01|      1|2022-12-31|
|      1|2023-01-02|      2|2022-12-31|
|      1|2023-01-04|      3|2023-01-01|
|      1|2023-01-05|      4|2023-01-01|
|      1|2023-01-06|      5|2023-01-01|
|      2|2023-01-01|      1|2022-12-31|
|      2|2023-01-03|      2|2023-01-01|
|      2|2023-01-04|      3|2023-01-01|
+-------+----------+-------+----------+



In [33]:
from pyspark.sql.functions import count
# Rows sharing (user_id, Difference) belong to the same run of consecutive
# days, so the number of rows in each group is that run's length.
streak_size = count("Difference").alias("Streak")
logins = logins.groupBy(col("user_id"), col("Difference")).agg(streak_size)
logins.show()

+-------+----------+------+
|user_id|Difference|Streak|
+-------+----------+------+
|      1|2022-12-31|     2|
|      1|2023-01-01|     3|
|      2|2022-12-31|     1|
|      2|2023-01-01|     2|
+-------+----------+------+



In [36]:
# Use the aliased functions module instead of importing `max` by name, which
# would shadow Python's built-in max() for the rest of the notebook.
from pyspark.sql import functions as F

# Longest streak per user. The result gets its own name instead of overwriting
# `logins`, so this cell can be re-run: `logins` keeps the Streak column that
# the aggregation reads.
max_streaks = logins.groupBy("user_id").agg(F.max("Streak").alias("Max_Streak"))
max_streaks.show()

+-------+----------+
|user_id|Max_Streak|
+-------+----------+
|      1|         3|
|      2|         2|
+-------+----------+

