In [4]:
from pyspark.sql import SparkSession

# SparkSession oluştur
spark = SparkSession.builder \
    .appName("CSV_Reader") \
    .getOrCreate()

# CSV dosyasını oku, noktalı virgül ayırıcısını kullanarak
df = spark.read.option("header", "true") \
    .option("delimiter", ";") \
    .option("inferSchema", "true") \
    .csv("/home/jovyan/work/MarketSales.csv")

# Dataframe'in ilk birkaç satırını göster
df.show()



+-----+--------+--------------------+-------+---------------+------+-----+------------+-------+--------+----------------+------------------+---------+-----------------+--------+---------+----------+----------------+---------+--------------+---------------+---------------+------------------+---------------+---------------+------+
|   ID|ITEMCODE|            ITEMNAME|FICHENO|          DATE_|AMOUNT|PRICE|LINENETTOTAL|LINENET|BRANCHNR|          BRANCH|          SALESMAN|     CITY|           REGION|LATITUDE|LONGITUDE|CLIENTCODE|      CLIENTNAME|BRANDCODE|         BRAND| CATEGORY_NAME1| CATEGORY_NAME2|    CATEGORY_NAME3|      STARTDATE|        ENDDATE|GENDER|
+-----+--------+--------------------+-------+---------------+------+-----+------------+-------+--------+----------------+------------------+---------+-----------------+--------+---------+----------+----------------+---------+--------------+---------------+---------------+------------------+---------------+---------------+------+
|11738|

In [16]:
df.select("DATE_").show(5)

+---------------+
|          DATE_|
+---------------+
|7.01.2017 00:00|
|6.01.2017 00:00|
|3.01.2017 00:00|
|3.01.2017 00:00|
|5.01.2017 00:00|
+---------------+
only showing top 5 rows



In [17]:
from pyspark.sql.functions import year, month, sum, to_timestamp

# Tarihi doğru formata dönüştür ve yıl/ay analizini yap
sales_trend = df.withColumn("DATE_FORMATTED", to_timestamp("DATE_", "d.MM.yyyy HH:mm")) \
                .withColumn("year", year("DATE_FORMATTED")) \
                .withColumn("month", month("DATE_FORMATTED")) \
                .groupBy("year", "month") \
                .agg(sum("LINENETTOTAL").alias("total_sales")) \
                .orderBy("year", "month")

sales_trend.show()

+----+-----+-----------+
|year|month|total_sales|
+----+-----+-----------+
|NULL| NULL|       NULL|
|2017|    1|    88857.0|
|2017|    2|    91138.0|
|2017|    3|   112005.0|
+----+-----+-----------+



In [20]:
from pyspark.sql.functions import year, month, sum, to_timestamp, datediff, max, count

# Tarihi doğru formata dönüştür
df_formatted = df.withColumn("DATE_FORMATTED", to_timestamp("DATE_", "d.MM.yyyy HH:mm"))

# Yıl ve aya göre toplam satışları hesapla
sales_trend = df_formatted.withColumn("year", year("DATE_FORMATTED")) \
                          .withColumn("month", month("DATE_FORMATTED")) \
                          .groupBy("year", "month") \
                          .agg(sum("LINENETTOTAL").alias("total_sales")) \
                          .orderBy("year", "month")

sales_trend.show()

+----+-----+-----------+
|year|month|total_sales|
+----+-----+-----------+
|NULL| NULL|       NULL|
|2017|    1|    88857.0|
|2017|    2|    91138.0|
|2017|    3|   112005.0|
+----+-----+-----------+



In [7]:
from pyspark.sql.functions import year, month, sum, to_timestamp, datediff, max, count, current_date, lit


# Tarihi doğru formata dönüştür
df_formatted = df.withColumn("DATE_FORMATTED", to_timestamp("DATE_", "d.MM.yyyy HH:mm"))

# Yıl ve aya göre toplam satışları hesapla
sales_trend = df_formatted.withColumn("year", year("DATE_FORMATTED")) \
                          .withColumn("month", month("DATE_FORMATTED")) \
                          .groupBy("year", "month") \
                          .agg(sum("LINENETTOTAL").alias("total_sales")) \
                          .orderBy("year", "month")

sales_trend.show()

# current_date'i bir Spark Column olarak al
current_date_col = current_date()

# RFM hesaplamasında bu Column'u kullan
rfm = df_formatted.withColumn("recency", datediff(current_date_col, "DATE_FORMATTED"))

# Diğer işlemler...
current_date_value = df_formatted.agg(max("DATE_FORMATTED")).collect()[0][0]

rfm = df_formatted.groupBy("CLIENTCODE") \
    .agg(
        datediff(lit(current_date_value), max("DATE_FORMATTED")).alias("recency"),
        count("FICHENO").alias("frequency"),
        sum("LINENETTOTAL").alias("monetary")
    )

rfm.show()

+----+-----+-----------+
|year|month|total_sales|
+----+-----+-----------+
|NULL| NULL|       NULL|
|2017|    1|    88857.0|
|2017|    2|    91138.0|
|2017|    3|   112005.0|
+----+-----+-----------+

+----------+-------+---------+--------+
|CLIENTCODE|recency|frequency|monetary|
+----------+-------+---------+--------+
|   1093856|     27|       39|    14.0|
|    536646|     22|       22|    NULL|
|    869396|     11|       31|    13.0|
|    924386|      1|       47|     8.0|
|    983041|     11|       15|     8.0|
|     17506|     42|       14|    10.0|
|    195395|      4|       52|    NULL|
|    164951|     36|        7|     3.0|
|    379975|      4|       16|     9.0|
|   1055537|      8|        8|     7.0|
|   1080760|     54|        8|     1.0|
|   1005313|     57|       48|    11.0|
|   1052349|     14|       70|    14.0|
|    350582|     36|        4|    NULL|
|    327859|     61|        7|    NULL|
|   1028795|      1|       22|    18.0|
|   1068016|     46|       23|     7.0|