In [5]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("transaction").getOrCreate()

cdf=spark.read.csv("customer.csv",header=True)

cdf.printSchema()
cdf.show()

root
 |-- id: string (nullable = true)
 |-- total_amount: string (nullable = true)

+---+------------+
| id|total_amount|
+---+------------+
|  1|        1000|
|  2|        2400|
|  3|        1200|
|  4|        5000|
|  5|        3000|
+---+------------+



In [6]:
tdf=spark.read.csv("transaction.csv",header=True)

tdf.printSchema()
tdf.show()

root
 |-- id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: string (nullable = true)

+---+------+------+
| id|  type|amount|
+---+------+------+
|  1|credit|   100|
|  1| debit|   200|
|  1|credit|    50|
|  2|credit|   200|
|  2| debit|    50|
|  3| debit|   100|
|  4|credit|   150|
+---+------+------+



In [9]:
from pyspark.sql.functions import when,col

tdf_neg=tdf.withColumn("new_amount",when( col("type")=='debit', col("amount")*-1).otherwise(col("amount")) )
tdf_neg.show()

+---+------+------+----------+
| id|  type|amount|new_amount|
+---+------+------+----------+
|  1|credit|   100|       100|
|  1| debit|   200|    -200.0|
|  1|credit|    50|        50|
|  2|credit|   200|       200|
|  2| debit|    50|     -50.0|
|  3| debit|   100|    -100.0|
|  4|credit|   150|       150|
+---+------+------+----------+



In [13]:
from pyspark.sql.functions import sum
tdf_group=tdf_neg.groupBy(col("id")).agg(sum("new_amount").alias("sum_amount")).orderBy(col("id").asc())
                                         
tdf_group.show()

+---+----------+
| id|sum_amount|
+---+----------+
|  1|     -50.0|
|  2|     150.0|
|  3|    -100.0|
|  4|     150.0|
+---+----------+



In [22]:
from pyspark.sql.functions import coalesce,lit
overall_trans=cdf.join(tdf_group,"id","left").select(cdf.id,(cdf.total_amount+coalesce(tdf_group.sum_amount,lit(0))).alias("total_amount"))
overall_trans.show()

+---+------------+
| id|total_amount|
+---+------------+
|  1|       950.0|
|  2|      2550.0|
|  3|      1100.0|
|  4|      5150.0|
|  5|      3000.0|
+---+------------+



In [43]:
# create udf for transaction data

from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

tdf_new=tdf.select(col("id"),col("type"),col("amount").cast("int").alias("amount"))
tdf_new.printSchema()
tdf_new.show()
# Define UDF to convert amount based on transaction type
x_udf = udf(lambda x, y: -1 * y if x == 'debit' else y, IntegerType())

# Apply the UDF
df_with_new = tdf_new.withColumn("new", x_udf(col("type"), col("amount")))

# Show the result
df_with_new.show()

root
 |-- id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: integer (nullable = true)

+---+------+------+
| id|  type|amount|
+---+------+------+
|  1|credit|   100|
|  1| debit|   200|
|  1|credit|    50|
|  2|credit|   200|
|  2| debit|    50|
|  3| debit|   100|
|  4|credit|   150|
+---+------+------+

+---+------+------+----+
| id|  type|amount| new|
+---+------+------+----+
|  1|credit|   100| 100|
|  1| debit|   200|-200|
|  1|credit|    50|  50|
|  2|credit|   200| 200|
|  2| debit|    50| -50|
|  3| debit|   100|-100|
|  4|credit|   150| 150|
+---+------+------+----+

