#### Get the second most recent activity or if there is only one activity then return that one


In [1]:
# Import and create SparkSession
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
#from pyspark.sql.types import StructField,IntegerType, StringType , DateType ,StructType, TimestampType
from pyspark.sql.functions import *
#from pyspark.sql.functions import count,sum,col, to_date
from pyspark.sql import Window

In [2]:
# Set configuration
my_conf = SparkConf()
my_conf.set("spark.app.name", "My application 1")
my_conf.set("spark.master", "local[*]")
my_conf.set("spark.logConf", "false")  # Disable Spark's internal logging configuration
my_conf.set("spark.driver.log.level", "ERROR")  # Set the log level for the driver

# Create spark session
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()


In [3]:
# Read csv to dataframe
df = spark.read \
    .format("csv") \
    .option("header", True) \
    .option("path", r"C:\Users\ajith\Practice Python\Pyspark_coding\Dataset\2.txt") \
    .load()

In [4]:
df.show()
df.printSchema()

+--------+--------+----------+----------+
|username|activity| startDate|   endDate|
+--------+--------+----------+----------+
|   Alice|  Travel|12-02-2020|20-02-2020|
|   Alice| Dancing|21-02-2020|23-02-2020|
|   Alice|  Travel|24-02-2020|28-02-2020|
|     Bob|  Travel|11-02-2020|18-02-2020|
+--------+--------+----------+----------+

root
 |-- username: string (nullable = true)
 |-- activity: string (nullable = true)
 |-- startDate: string (nullable = true)
 |-- endDate: string (nullable = true)



In [5]:
df1 = df.withColumn("startDate",to_date(col("startDate"),'dd-MM-yyyy')) \
        .withColumn("endDate",to_date(col("startDate"),'dd-MM-yyyy')) 

In [6]:
df1.show()
df1.printSchema()

+--------+--------+----------+----------+
|username|activity| startDate|   endDate|
+--------+--------+----------+----------+
|   Alice|  Travel|2020-02-12|2020-02-12|
|   Alice| Dancing|2020-02-21|2020-02-21|
|   Alice|  Travel|2020-02-24|2020-02-24|
|     Bob|  Travel|2020-02-11|2020-02-11|
+--------+--------+----------+----------+

root
 |-- username: string (nullable = true)
 |-- activity: string (nullable = true)
 |-- startDate: date (nullable = true)
 |-- endDate: date (nullable = true)



In [7]:
window_for_count = Window.partitionBy("username")
window_for_rank = Window.partitionBy("username").orderBy("startDate")

In [8]:
df2 = df1.withColumn("total_activity",count("username").over(window_for_count)) \
         .withColumn("rn",rank().over(window_for_rank))


In [9]:
df3 = df2.filter((col("total_activity")==1) | (col("rn")==2)).select("username","activity","startDate","endDate")

#df3 = df2.filter((df2.total_activity==1) | (df2.rn==2))


df3.show()


+--------+--------+----------+----------+
|username|activity| startDate|   endDate|
+--------+--------+----------+----------+
|   Alice| Dancing|2020-02-21|2020-02-21|
|     Bob|  Travel|2020-02-11|2020-02-11|
+--------+--------+----------+----------+

