<a href="https://colab.research.google.com/github/candidlpd/pyspark-coding-interview/blob/master/Find%20Age%20from%20Birth%20Date.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=048a93c6f87f976598523db983cd2ce670dcad7d862ce7be241b53a92ee73c4d
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DateType

# Start (or reuse) a Spark session that runs in local mode
spark = (
    SparkSession.builder
    .master("local")
    .appName("WeekdaysBetweenDates")
    .getOrCreate()
)

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from datetime import datetime

# Obtain the Spark session; getOrCreate() hands back an already-running
# session when one exists instead of starting a second one
spark = (
    SparkSession.builder
    .appName("SparkSQLExample")
    .getOrCreate()
)

# Explicit schema: two nullable string name columns and a nullable date column
schema = (
    StructType()
    .add("FirstName", StringType(), nullable=True)
    .add("LastName", StringType(), nullable=True)
    .add("BirthDate", DateType(), nullable=True)
)

# Sample rows as (first name, last name, birth date). Birth dates are written
# as ISO strings and parsed into datetime.date objects to match the DateType column.
data = [
    (first_name, last_name, datetime.strptime(birth_iso, "%Y-%m-%d").date())
    for first_name, last_name, birth_iso in [
        ("Guy", "Gilbert", "1981-11-12"),
        ("Kevin", "Brown", "1985-02-01"),
        ("Roberto", "Tamburello", "1974-06-12"),
        ("Rob", "Walters", "1974-07-23"),
        ("Thierry", "D'Hers", "1959-02-26"),
        ("David", "Bradley", "1974-10-17"),
        ("JoLynn", "Dobney", "1955-08-16"),
        ("Ruth", "Ellerbrock", "1956-01-03"),
        ("Gail", "Erickson", "1952-04-27"),
    ]
]

# Build the Spark DataFrame from the rows, typed by the explicit schema
df = spark.createDataFrame(data=data, schema=schema)

# Print the rows as a text table
df.show()


+---------+----------+----------+
|FirstName|  LastName| BirthDate|
+---------+----------+----------+
|      Guy|   Gilbert|1981-11-12|
|    Kevin|     Brown|1985-02-01|
|  Roberto|Tamburello|1974-06-12|
|      Rob|   Walters|1974-07-23|
|  Thierry|    D'Hers|1959-02-26|
|    David|   Bradley|1974-10-17|
|   JoLynn|    Dobney|1955-08-16|
|     Ruth|Ellerbrock|1956-01-03|
|     Gail|  Erickson|1952-04-27|
+---------+----------+----------+



In [8]:
# Expose the DataFrame to Spark SQL as the temporary view "People"
df.createOrReplaceTempView("People")

# Read every row back through the SQL interface to confirm the view is queryable
all_people = spark.sql("SELECT * FROM People")
all_people.show()



+---------+----------+----------+
|FirstName|  LastName| BirthDate|
+---------+----------+----------+
|      Guy|   Gilbert|1981-11-12|
|    Kevin|     Brown|1985-02-01|
|  Roberto|Tamburello|1974-06-12|
|      Rob|   Walters|1974-07-23|
|  Thierry|    D'Hers|1959-02-26|
|    David|   Bradley|1974-10-17|
|   JoLynn|    Dobney|1955-08-16|
|     Ruth|Ellerbrock|1956-01-03|
|     Gail|  Erickson|1952-04-27|
+---------+----------+----------+



In [9]:
# Age in completed years, computed in Spark SQL.
# Dividing a day count by 365 ignores leap days, so the age would roll over
# up to about two weeks before the real birthday. MONTHS_BETWEEN follows the
# calendar (it is a whole number when both dates share a day of month), so
# flooring months / 12 only increments the age on the birthday itself.
spark.sql("""
select FirstName, LastName, BirthDate,
FLOOR(MONTHS_BETWEEN(CURRENT_DATE(), BirthDate) / 12) as age
from people
""").show()

+---------+----------+----------+---+
|FirstName|  LastName| BirthDate|age|
+---------+----------+----------+---+
|      Guy|   Gilbert|1981-11-12| 42|
|    Kevin|     Brown|1985-02-01| 39|
|  Roberto|Tamburello|1974-06-12| 50|
|      Rob|   Walters|1974-07-23| 50|
|  Thierry|    D'Hers|1959-02-26| 65|
|    David|   Bradley|1974-10-17| 50|
|   JoLynn|    Dobney|1955-08-16| 69|
|     Ruth|Ellerbrock|1956-01-03| 68|
|     Gail|  Erickson|1952-04-27| 72|
+---------+----------+----------+---+



In [14]:
from pyspark.sql import functions as F

# Age in completed years, computed with the DataFrame API.
# months_between follows the calendar, unlike a day count divided by 365,
# which ignores leap days and reports the next age shortly before the birthday.
months_since_birth = F.months_between(F.current_date(), F.col("BirthDate"))
df_with_age = df.withColumn("age", F.floor(months_since_birth / 12))

df_with_age.show()

+---------+----------+----------+---+
|FirstName|  LastName| BirthDate|age|
+---------+----------+----------+---+
|      Guy|   Gilbert|1981-11-12| 42|
|    Kevin|     Brown|1985-02-01| 39|
|  Roberto|Tamburello|1974-06-12| 50|
|      Rob|   Walters|1974-07-23| 50|
|  Thierry|    D'Hers|1959-02-26| 65|
|    David|   Bradley|1974-10-17| 50|
|   JoLynn|    Dobney|1955-08-16| 69|
|     Ruth|Ellerbrock|1956-01-03| 68|
|     Gail|  Erickson|1952-04-27| 72|
+---------+----------+----------+---+

