In [None]:
# Uzman olduğumdan değil ama okuduğum kaynaktan anladıklarımı buraya yazıyorum.
# Kaynak: https://sparkbyexamples.com/pyspark-tutorial/#rdd

"""
    Nedir bu DataFrame? 
    
    Kaynak, Spark'taki DataFrame'in karşılığı olarak R/Python'daki DataFrame'leri gösteriyor; ancak
    Spark DataFrame'i onlardan daha zengin işlevlere sahip.
    
    Python ile uğraştıysanız Pandas DataFrame'ini duymuşsunuzdur. PySpark DataFrame'i de Pandas
    DataFrame'ine benzer; aradaki fark, PySpark DataFrame'inin sahip olduğu verilerin
    bir cluster'daki farklı makinelerde depolanmasıdır. [Cluster: birbirine bağlanıp tek bir sistem
    gibi birlikte çalışan birden fazla makinenin oluşturduğu yapı.]
    
    PySpark Pandastan daha mı hızlı?
    
    Pandas tek bir node (yani bilgisayar) üzerinde çalışır. Spark ise işi birden fazla makineye
    dağıtabildiği için, özellikle tek makinenin belleğine sığmayan büyük verilerde genellikle daha hızlıdır.
    
    Peki ya tek makine üzerinde çalışıyorsak hangisi daha hızlıdır?
"""


In [3]:
from pyspark.sql import SparkSession

# Obtain a SparkSession through the builder's getOrCreate() call.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [11]:
""" 
    Kısaca DataFrame oluşturma örneği:
    
    Kaynağa göre en kolay yol, DataFrame'i bir Python listesinden oluşturmak. Farklı kaynaklardan da
    oluşturulabilir: JSON, CSV, Text, başka DataFrame'ler, Kafka vb.
"""

# Column names, listed in the same order as the fields of each tuple in `data`.
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

# One tuple per row.
data = [
    ("James", "", "Smith", "1991-04-01", "M", 3000),
    ("Michael", "Rose", "", "2000-05-19", "M", 4000),
    ("Robert", "", "Williams", "1978-09-05", "M", 4000),
    ("Maria", "Anne", "Jones", "1967-12-01", "F", 4000),
    ("Jen", "Mary", "Brown", "1980-02-17", "F", -1),
]

# Only column names are given as the schema, so the column types are inferred
# from the values in `data`.
df = spark.createDataFrame(data, schema=columns)

In [13]:
# A DataFrame is organised into named columns. printSchema() prints the schema as a
# tree: one line per column showing its name, its data type and whether it is nullable.
# For example, `firstname` is a column name; the individual first names are its values.
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [14]:
# Print the DataFrame as a text table. Called without arguments, show() displays at
# most the first 20 rows.
df.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [4]:
# DataFrame filter

# First Generate DataFrame

from pyspark.sql.types import StructType, StructField 
from pyspark.sql.types import StringType, IntegerType, ArrayType
# Sample rows: (name parts, known languages, state, gender).
data = [
    (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M"),
    (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F"),
    (("Julia", "", "Williams"), ["CSharp", "VB"], "OH", "F"),
    (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M"),
    (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M"),
    (("Mike", "Mary", "Williams"), ["Python", "VB"], "OH", "M"),
]

# The `name` column is a nested struct made of three nullable string fields.
name_struct = StructType([
    StructField('firstname', StringType(), True),
    StructField('middlename', StringType(), True),
    StructField('lastname', StringType(), True),
])

# Explicit schema: a struct column, an array-of-strings column and two string columns.
schema = StructType([
    StructField('name', name_struct, True),
    StructField('languages', ArrayType(StringType()), True),
    StructField('state', StringType(), True),
    StructField('gender', StringType(), True),
])

df = spark.createDataFrame(data, schema)
df.printSchema()
# truncate=False prints every cell in full instead of shortening long values.
df.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Anna, Rose, }        |[Spark, Java, C++]|NY   |F     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Maria, Anne, Jones}  |[CSharp, VB]      |NY   |M     |
|{Jen, Mary, Brown}    |[CSharp, VB]      |NY   |M     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



In [23]:
# filter() with a condition built from a column accessed as an attribute (df.state).
# Author's note: this is the form that is easiest to remember.
is_ohio = df.state == 'OH'
df.filter(is_ohio).show(truncate=False)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



In [19]:
# Using SQL col() function. Bu şekilde filter yapmak aklımda kalmaz

from pyspark.sql.functions import col

# Build the condition from a column looked up by name with col(), then print the
# matching rows without truncating cell values.
ohio_rows = df.filter(col("state") == "OH")
ohio_rows.show(truncate=False)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



In [24]:
# filter() also accepts a SQL expression written as a string (the author finds this
# form convenient too).
#   - equality:   `==`, or a single `=` as in SQL
#   - inequality: `!=` or `<>`
sql_conditions = (
    "gender == 'M'",
    'gender = "F"',
    "gender != 'M'",
    "gender <> 'M'",
)

# Run each expression in turn and print the rows it keeps.
for condition in sql_conditions:
    df.filter(condition).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+

+-------------------+------------------+-----+------+
|               name|         languages|state|gender|
+-------------------+------------------+-----+------+
|     {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Julia, , Williams}|      [CSharp, VB]|   OH|     F|
+-------------------+------------------+-----+------+

+-------------------+------------------+-----+------+
|               name|         languages|state|gender|
+-------------------+------------------+-----+------+
|     {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Julia, , William

In [29]:
# Filtering on several conditions at once.

# Column-expression form. Each comparison must be wrapped in parentheses because
# Python's `&` binds more tightly than `==`.
df.filter((df.state == "OH") & (df.gender == "M")) \
    .show(truncate=False)

# SQL-expression form: the same filter written as a single string.
df.filter("state = 'OH' and gender = 'M'").show(truncate=False)

print("\n\n")

# Dynamic values: when the values come from Python variables, use the
# Column-expression form rather than pasting them into a SQL string. The variables
# are then passed as literal values, so a quote character inside a value cannot
# break or alter the filter expression.
state = 'NY'
gender = 'M'

df.filter((df.state == state) & (df.gender == gender)).show(truncate=False)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+




+--------------------+------------+-----+------+
|name                |languages   |state|gender|
+--------------------+------------+-----+------+
|{Maria, Anne, Jones}|[CSharp, VB]|NY   |M     |
|{Jen, Mary, Brown}  |[CSharp, VB]|NY   |M     |
+--------------------+------------+-----+------+



In [39]:
# Filtering on a field nested inside the `name` struct column (the source tutorial
# does not filter on `name` or `languages`, so the author tries it here).
first_name = df['name']['firstname']

# Select only the nested field.
df.select(first_name).show()

# Column-expression form.
df.filter(first_name == 'James').show()

# SQL-expression form: the nested field is addressed with dot notation.
df.filter("name.firstname = 'Maria'").show()

# Author's note: a similar filter on the `languages` column was attempted but did not work.

+--------------+
|name.firstname|
+--------------+
|         James|
|          Anna|
|         Julia|
|         Maria|
|           Jen|
|          Mike|
+--------------+

+----------------+------------------+-----+------+
|            name|         languages|state|gender|
+----------------+------------------+-----+------+
|{James, , Smith}|[Java, Scala, C++]|   OH|     M|
+----------------+------------------+-----+------+

+--------------------+------------+-----+------+
|                name|   languages|state|gender|
+--------------------+------------+-----+------+
|{Maria, Anne, Jones}|[CSharp, VB]|   NY|     M|
+--------------------+------------+-----+------+



In [5]:
# isin() plays the role of SQL's IN operator.

li = ["OH", "CA", "DE"]
state_in_li = df.state.isin(li)

# Rows whose state is one of the listed values.
df.filter(state_in_li).show()

li2 = ['James', 'Julia', 'Maria']

# The same test applied to a field nested inside the `name` struct.
df.filter(df.name.firstname.isin(li2)).show()

# NOT IN, written two ways: negating the condition with `~`, or comparing it to False.
df.filter(~state_in_li).show()
df.filter(state_in_li == False).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria,

In [49]:
# String predicates on a column: startswith, endswith and contains.
state_col = df.state

# States beginning with "N".
df.filter(state_col.startswith('N')).show()

# States ending with "H".
df.filter(state_col.endswith('H')).show()

# States with an "H" anywhere in the value.
df.filter(state_col.contains('H')).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia

In [10]:
# like and rlike (regex like)

data2 = [
    (2, "Michael Rose"),
    (3, "Robert Williams"),
    (4, "Rames Rose"),
    (5, "Rames rose"),
]

df2 = spark.createDataFrame(data=data2, schema=["id", "name"])

# like(): SQL LIKE pattern, where `%` matches any run of characters. The comparison
# is case-sensitive, so only the lower-case "Rames rose" row matches.
df2.filter(df2.name.like("%rose%")).show()

# rlike(): regular-expression match. `(?i)` switches on case-insensitive matching
# and `rose$` requires the value to end with "rose". The pattern is searched for
# anywhere in the value, so no leading `^`/`.*` is needed (the earlier `^*` applied
# a quantifier to the start anchor, which was not the intent).
df2.filter(df2.name.rlike("(?i)rose$")).show()

+---+----------+
| id|      name|
+---+----------+
|  5|Rames rose|
+---+----------+

+---+------------+
| id|        name|
+---+------------+
|  2|Michael Rose|
|  4|  Rames Rose|
|  5|  Rames rose|
+---+------------+



In [11]:
# Array column. Yukarıda languages ile ilgili bir arama yapamamıştım.

from pyspark.sql.functions import array_contains


# Keep only the rows whose `languages` array contains the value "Java".
knows_java = array_contains(df.languages, "Java")
df.filter(knows_java).show(truncate=False)

+----------------+------------------+-----+------+
|name            |languages         |state|gender|
+----------------+------------------+-----+------+
|{James, , Smith}|[Java, Scala, C++]|OH   |M     |
|{Anna, Rose, }  |[Spark, Java, C++]|NY   |F     |
+----------------+------------------+-----+------+

