In [2]:
from pyspark.sql.functions import lit
# lit() takes a constant (literal) value and returns a Column object, so the constant can be used wherever a column expression is expected (e.g. to add a constant column)
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

# Create (or reuse) the Spark session that the rest of the notebook's cells use.
spark = (
    SparkSession.builder
    .appName('COLUMNCLASS')
    .getOrCreate()
)

In [3]:
# Two (name, age) tuples. No column names are supplied, so the resulting
# DataFrame gets the positional names _1 and _2.
data = [
    ("James", 23),
    ("Ann", 40),
]
df1 = spark.createDataFrame(data)

# Inspect the inferred schema, then the rows.
df1.printSchema()
df1.show()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)

+-----+---+
|   _1| _2|
+-----+---+
|James| 23|
|  Ann| 40|
+-----+---+



### converting default dataframe to Dataframe with column names

In [5]:
# Build the same rows again and let toDF() assign readable column names
# in place of the positional defaults.
unnamed_df = spark.createDataFrame(data)
df2 = unnamed_df.toDF("name", "age")

df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)

+-----+---+
| name|age|
+-----+---+
|James| 23|
|  Ann| 40|
+-----+---+



### Accessing columns

In [10]:
from pyspark.sql.functions import col

# Column access through the DataFrame itself: plain name, then a
# backtick-quoted name.
df2.select(df2["age"]).show()
df2.select(df2["`name`"]).show()

# Column access through the standalone col() function. The recorded output of
# this cell contains four tables (age, name, age, name), so the col("name")
# select is included to keep the code and its output in step.
df2.select(col("age")).show()
df2.select(col("name")).show()

+---+
|age|
+---+
| 23|
| 40|
+---+

+-----+
| name|
+-----+
|James|
|  Ann|
+-----+

+---+
|age|
+---+
| 23|
| 40|
+---+

+-----+
| name|
+-----+
|James|
|  Ann|
+-----+



### creating data with row class

In [11]:
from pyspark.sql import Row
# Each person is a Row whose "prop" field is itself a Row; Spark turns the
# inner Row into a struct column with fields hair and eye.
james = Row(name="James", prop=Row(hair="black", eye="blue"))
ann = Row(name="Ann", prop=Row(hair="grey", eye="black"))
data = [james, ann]

dataframe = spark.createDataFrame(data)
dataframe.printSchema()
dataframe.show()

root
 |-- name: string (nullable = true)
 |-- prop: struct (nullable = true)
 |    |-- hair: string (nullable = true)
 |    |-- eye: string (nullable = true)

+-----+-------------+
| name|         prop|
+-----+-------------+
|James|{black, blue}|
|  Ann|{grey, black}|
+-----+-------------+



### accessing StructType columns

In [13]:
# Different ways to reach into the "prop" struct. Each selector is shown in
# its own table, in this order.
struct_selectors = [
    dataframe.prop.hair,     # attribute access; output header is "prop.hair"
    dataframe["prop.hair"],  # dotted path via indexing; output header is "hair"
    col("prop.hair"),        # dotted path via col(); output header is "hair"
    col("prop.*"),           # expands every field of the struct into columns
    col("prop"),             # the struct column as a whole
]
for selector in struct_selectors:
    dataframe.select(selector).show()

+---------+
|prop.hair|
+---------+
|    black|
|     grey|
+---------+

+-----+
| hair|
+-----+
|black|
| grey|
+-----+

+-----+
| hair|
+-----+
|black|
| grey|
+-----+

+-----+-----+
| hair|  eye|
+-----+-----+
|black| blue|
| grey|black|
+-----+-----+

+-------------+
|         prop|
+-------------+
|{black, blue}|
|{grey, black}|
+-------------+



In [15]:
# Small all-numeric frame used by the operator, alias and ordering examples.
data = [
    (100, 2, 1),
    (400, 3, 4),
    (300, 4, 4),
]
df3 = spark.createDataFrame(data).toDF("col1", "col2", "col3")
df3.show()

+----+----+----+
|col1|col2|col3|
+----+----+----+
| 100|   2|   1|
| 400|   3|   4|
| 300|   4|   4|
+----+----+----+



### column operators

In [16]:
# Python operators applied to Column objects build new Column expressions:
# addition, subtraction, then a comparison that yields booleans.
operator_expressions = (
    df3.col1 + df3.col2,
    df3.col1 - df3.col2,
    df3.col2 > df3.col3,
)
for expression in operator_expressions:
    df3.select(expression).show()

+-------------+
|(col1 + col2)|
+-------------+
|          102|
|          403|
|          304|
+-------------+

+-------------+
|(col1 - col2)|
+-------------+
|           98|
|          397|
|          296|
+-------------+

+-------------+
|(col2 > col3)|
+-------------+
|         true|
|        false|
|        false|
+-------------+



### alias() & name() - both are for changing the column name

In [24]:

# name() and alias() both relabel a column; col3 is passed through unchanged.
# Note: "10multiplies" is only a label for col1 -- no multiplication happens.
relabelled_columns = [
    col("col2").name("newcol"),
    col("col3"),
    col("col1").alias("10multiplies"),
]
df4 = df3.select(*relabelled_columns)
df4.show()

+------+----+------------+
|newcol|col3|10multiplies|
+------+----+------------+
|     2|   1|         100|
|     3|   4|         400|
|     4|   4|         300|
+------+----+------------+



### orderBy()

In [25]:
# Order the rows by the relabelled col1 values (ascending in the output).
df4_ordered = df4.orderBy(col("10multiplies"))
df4_ordered.show()

+------+----+------------+
|newcol|col3|10multiplies|
+------+----+------------+
|     2|   1|         100|
|     4|   4|         300|
|     3|   4|         400|
+------+----+------------+



In [26]:
from pyspark.sql.types import StringType, IntegerType

# People frame used by the remaining Column-method examples. Tom's age is
# deliberately None so the null-handling methods have something to show.
columns = ["name", "age", "salary"]
data = [
    ("John Smith", 30, 1),
    ("Jane", 25, 2),
    ("Bob Smith", 35, 25),
    ("Tom", None, 3),
]

dataframe1 = spark.createDataFrame(data, columns)
dataframe1.show()

+----------+----+------+
|      name| age|salary|
+----------+----+------+
|John Smith|  30|     1|
|      Jane|  25|     2|
| Bob Smith|  35|    25|
|       Tom|NULL|     3|
+----------+----+------+



In [27]:
# Replace "age" with an integer-typed version of itself (it was inferred as
# long); the values, including the NULL, are unchanged.
age_as_int = col("age").cast(IntegerType())
dataframe1 = dataframe1.withColumn("age", age_as_int)

dataframe1.show()
dataframe1.printSchema()

+----------+----+------+
|      name| age|salary|
+----------+----+------+
|John Smith|  30|     1|
|      Jane|  25|     2|
| Bob Smith|  35|    25|
|       Tom|NULL|     3|
+----------+----+------+

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: long (nullable = true)



### contains() - returns a boolean indicating whether the column's string value contains the given substring


In [22]:
# Flag each row whose name includes the substring "Smith".
has_smith = col("name").contains("Smith")
df_contains = dataframe1.withColumn("contains", has_smith)
df_contains.show()

+----------+----+------+--------+
|      name| age|salary|contains|
+----------+----+------+--------+
|John Smith|  30|     1|    true|
|      Jane|  25|     2|   false|
| Bob Smith|  35|    25|    true|
|       Tom|NULL|     3|   false|
+----------+----+------+--------+



### eqNullSafe() - null-safe equality test: returns a boolean telling whether the column value equals the given value, and yields true/false (never NULL) even when one side is NULL


In [23]:
# Null-safe equality check of age against 25. The column label now states the
# value actually compared (it previously said 30 while comparing with 25).
# With eqNullSafe the row whose age is NULL gets false rather than NULL.
age_is_25 = col("age").eqNullSafe(25)
df_comparison = dataframe1.withColumn("is_age_equal_to_25", age_is_25)
df_comparison.show()

+----------+----+------+------------------+
|      name| age|salary|is_age_equal_to_30|
+----------+----+------+------------------+
|John Smith|  30|     1|             false|
|      Jane|  25|     2|              true|
| Bob Smith|  35|    25|             false|
|       Tom|NULL|     3|             false|
+----------+----+------+------------------+



### cast / astype - for changing datatype of the column

In [28]:
# Same cast pattern as for age: salary goes from long to integer.
salary_as_int = col("salary").cast(IntegerType())
dataframe1 = dataframe1.withColumn("salary", salary_as_int)
dataframe1.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)



### bitwiseAND

In [29]:
# Bit-by-bit AND of age and salary; a NULL age gives a NULL result.
age_and_salary = col("age").bitwiseAND(col("salary"))
dataframe3 = dataframe1.withColumn("bitwise AND", age_and_salary)
dataframe3.show()

+----------+----+------+-----------+
|      name| age|salary|bitwise AND|
+----------+----+------+-----------+
|John Smith|  30|     1|          0|
|      Jane|  25|     2|          0|
| Bob Smith|  35|    25|          1|
|       Tom|NULL|     3|       NULL|
+----------+----+------+-----------+



### startswith() - gives a boolean telling whether the column value starts with the given value

In [31]:
# True only for names that begin with "John".
begins_with_john = col("name").startswith("John")
dataframe_startswith = dataframe1.withColumn("startswith", begins_with_john)
dataframe_startswith.show()

+----------+----+------+----------+
|      name| age|salary|startswith|
+----------+----+------+----------+
|John Smith|  30|     1|      true|
|      Jane|  25|     2|     false|
| Bob Smith|  35|    25|     false|
|       Tom|NULL|     3|     false|
+----------+----+------+----------+



### endswith() - gives boolean if the column value ends with the given value

In [32]:
# True only for names whose final characters are "ith".
ends_with_ith = col("name").endswith("ith")
dataframe_endswith = dataframe1.withColumn("endswith", ends_with_ith)
dataframe_endswith.show()

+----------+----+------+--------+
|      name| age|salary|endswith|
+----------+----+------+--------+
|John Smith|  30|     1|    true|
|      Jane|  25|     2|   false|
| Bob Smith|  35|    25|    true|
|       Tom|NULL|     3|   false|
+----------+----+------+--------+



### desc() and asc() 

In [35]:
# Descending sort on age; in the recorded output the NULL age row comes last.
dataframe1.orderBy(col("age").desc()).show()

# Ascending sort on salary. asc() is written out so that this cell actually
# demonstrates the method named in its heading; the resulting order is the
# same as orderBy's default.
dataframe1.orderBy(col("salary").asc()).show()

+----------+----+------+
|      name| age|salary|
+----------+----+------+
| Bob Smith|  35|    25|
|John Smith|  30|     1|
|      Jane|  25|     2|
|       Tom|NULL|     3|
+----------+----+------+

+----------+----+------+
|      name| age|salary|
+----------+----+------+
|John Smith|  30|     1|
|      Jane|  25|     2|
|       Tom|NULL|     3|
| Bob Smith|  35|    25|
+----------+----+------+



### name()

In [36]:
# name() only relabels the age column. Despite the label "sorted_age", no
# sorting is applied: the rows keep their original order.
age_relabelled = col("age").name("sorted_age")
dataframe2 = dataframe1.select("name", age_relabelled, "salary")

dataframe2.show()
dataframe2.printSchema()

+----------+----------+------+
|      name|sorted_age|salary|
+----------+----------+------+
|John Smith|        30|     1|
|      Jane|        25|     2|
| Bob Smith|        35|    25|
|       Tom|      NULL|     3|
+----------+----------+------+

root
 |-- name: string (nullable = true)
 |-- sorted_age: integer (nullable = true)
 |-- salary: integer (nullable = true)



###  ascending order showing null values in the starting rows

In [37]:
# Ascending by age, with the NULL age row placed before all non-null rows.
age_nulls_first = col("age").asc_nulls_first()
dataframe1.orderBy(age_nulls_first).show()

+----------+----+------+
|      name| age|salary|
+----------+----+------+
|       Tom|NULL|     3|
|      Jane|  25|     2|
|John Smith|  30|     1|
| Bob Smith|  35|    25|
+----------+----+------+



###  ascending order showing null values in the last rows

In [38]:
# Ascending by age, with the NULL age row placed after all non-null rows.
age_nulls_last = col("age").asc_nulls_last()
dataframe1.orderBy(age_nulls_last).show()

# Separate demonstration in the same cell: keep name and expose age as a
# string-typed column called "age_str".
age_as_text = dataframe1["age"].cast(StringType()).alias("age_str")
dataframe_new = dataframe1.select("name", age_as_text)
dataframe_new.printSchema()


+----------+----+------+
|      name| age|salary|
+----------+----+------+
|      Jane|  25|     2|
|John Smith|  30|     1|
| Bob Smith|  35|    25|
|       Tom|NULL|     3|
+----------+----+------+

root
 |-- name: string (nullable = true)
 |-- age_str: string (nullable = true)



### between(lowerBound, upperBound) - creates boolean values; both bounds are inclusive

In [43]:
# Show the input first, then keep only rows whose sorted_age lies in the
# 30..40 range (30 itself is kept; the NULL row is dropped by the filter).
dataframe2.show()

age_in_range = dataframe2["sorted_age"].between(30, 40)
df_filtered = dataframe2.filter(age_in_range)
df_filtered.show()

+----------+----------+------+
|      name|sorted_age|salary|
+----------+----------+------+
|John Smith|        30|     1|
|      Jane|        25|     2|
| Bob Smith|        35|    25|
|       Tom|      NULL|     3|
+----------+----------+------+

+----------+----------+------+
|      name|sorted_age|salary|
+----------+----------+------+
|John Smith|        30|     1|
| Bob Smith|        35|    25|
+----------+----------+------+



In [41]:
# Attach the between() result as a boolean column instead of filtering on it.
# between() includes both bounds (the age-30 row is true), so the column label
# uses >= and <= rather than the strict > and < it previously claimed.
# A NULL age yields NULL, not false.
age_in_range = col("sorted_age").between(30, 40)
df_with_boolean = dataframe2.withColumn("is >=30 and <=40", age_in_range)
df_with_boolean.show()

+----------+----------+------+--------------+
|      name|sorted_age|salary|is >30 and <40|
+----------+----------+------+--------------+
|John Smith|        30|     1|          true|
|      Jane|        25|     2|         false|
| Bob Smith|        35|    25|          true|
|       Tom|      NULL|     3|          NULL|
+----------+----------+------+--------------+



In [44]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, when, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Each record pairs a name with an (age, gender) tuple that maps onto the
# nested "info" struct declared below.
data = [
    ("John", (30, "Male")),
    ("Jane", (25, "Female")),
    ("Bob", (35, "Male")),
]

# Explicit schema: the inner struct is declared first, then embedded as the
# "info" field of the outer struct.
info_struct = StructType([
    StructField('age', IntegerType(), True),
    StructField('gender', StringType(), True),
])
columns = StructType([
    StructField('name', StringType(), nullable=True),
    StructField('info', info_struct),
])

dataframe_new = spark.createDataFrame(data, columns)
dataframe_new.show()

+----+------------+
|name|        info|
+----+------------+
|John|  {30, Male}|
|Jane|{25, Female}|
| Bob|  {35, Male}|
+----+------------+



### getField(column_name) - getting column from the nested columns 

In [46]:
# Pull the "age" field out of the info struct into its own top-level column.
age_from_info = col("info").getField("age")
df_with_age = dataframe_new.withColumn("age", age_from_info)
df_with_age.show()

+----+------------+---+
|name|        info|age|
+----+------------+---+
|John|  {30, Male}| 30|
|Jane|{25, Female}| 25|
| Bob|  {35, Male}| 35|
+----+------------+---+



In [47]:
# Each person carries a list of ages, producing an array column. Jane's list
# is shorter than the others on purpose.
columns = ["name", "ages"]
data = [
    ("John", [30, 40, 50]),
    ("Jane", [25, 35]),
    ("Bob", [35, 45, 55]),
]

df = spark.createDataFrame(data, columns)
df.show()

+----+------------+
|name|        ages|
+----+------------+
|John|[30, 40, 50]|
|Jane|    [25, 35]|
| Bob|[35, 45, 55]|
+----+------------+



### getItem

In [48]:
# Take the element at position 2 of each ages array; arrays with fewer than
# three elements give NULL.
# NOTE(review): positions are zero-based here (index 2 returns 50 from
# [30, 40, 50]), so this is the THIRD element even though the column is called
# "second_age". The name is kept because later cells reference it.
element_at_index_2 = col("ages").getItem(2)
df_with_second_age = df.withColumn("second_age", element_at_index_2)
df_with_second_age.show()

+----+------------+----------+
|name|        ages|second_age|
+----+------------+----------+
|John|[30, 40, 50]|        50|
|Jane|    [25, 35]|      NULL|
| Bob|[35, 45, 55]|        55|
+----+------------+----------+



### isNull()

In [49]:
# Keep only the rows where second_age is NULL.
second_age_missing = df_with_second_age.second_age.isNull()
df_with_second_age.filter(second_age_missing).show()

+----+--------+----------+
|name|    ages|second_age|
+----+--------+----------+
|Jane|[25, 35]|      NULL|
+----+--------+----------+



### isNotNull()

In [50]:
# Keep only the rows where second_age has a value.
second_age_present = df_with_second_age.second_age.isNotNull()
df_with_second_age.filter(second_age_present).show()

+----+------------+----------+
|name|        ages|second_age|
+----+------------+----------+
|John|[30, 40, 50]|        50|
| Bob|[35, 45, 55]|        55|
+----+------------+----------+



In [51]:
# People with occupations, used by the isin / like / rlike / when / drop
# examples. "Civil Engineer" and "Engineer" differ so that exact and pattern
# matching give different results.
columns = ["name", "age", "occupation"]
data = [
    ("John", 30, "Civil Engineer"),
    ("Jane", 25, "Teacher"),
    ("Bob", 35, "Doctor"),
    ("Alice", 40, "Engineer"),
]

df = spark.createDataFrame(data, columns)
df.show()

+-----+---+--------------+
| name|age|    occupation|
+-----+---+--------------+
| John| 30|Civil Engineer|
| Jane| 25|       Teacher|
|  Bob| 35|        Doctor|
|Alice| 40|      Engineer|
+-----+---+--------------+



### isin()

In [52]:
# isin() matches whole values only, so "Civil Engineer" is not selected.
wanted_occupations = ["Engineer", "Doctor"]
occupation_wanted = col("occupation").isin(wanted_occupations)
df_filtered = df.filter(occupation_wanted)
df_filtered.show()

+-----+---+----------+
| name|age|occupation|
+-----+---+----------+
|  Bob| 35|    Doctor|
|Alice| 40|  Engineer|
+-----+---+----------+



### like()

In [54]:
# SQL LIKE pattern: % matches any run of characters, so any occupation that
# contains "Engineer" is kept.
mentions_engineer = col("occupation").like("%Engineer%")
df_filtered_ = df.filter(mentions_engineer)
df_filtered_.show()

+-----+---+--------------+
| name|age|    occupation|
+-----+---+--------------+
| John| 30|Civil Engineer|
|Alice| 40|      Engineer|
+-----+---+--------------+



### rlike() - similar to like(), but the pattern is a regular expression rather than a SQL LIKE wildcard pattern

In [55]:
# Regular-expression match: the | alternation keeps rows whose occupation
# matches either "Doctor" or "Teacher".
doctor_or_teacher = col("occupation").rlike("Doctor|Teacher")
df_filtered = df.filter(doctor_or_teacher)
df_filtered.show()

+----+---+----------+
|name|age|occupation|
+----+---+----------+
|Jane| 25|   Teacher|
| Bob| 35|    Doctor|
+----+---+----------+



### when 

In [57]:
# Conditional column: the when() branches are tried in order, and otherwise()
# supplies the value for rows that match none of them.
age_bucket = (
    when(col("age") < 30, "Young")
    .when(col("age") >= 30, "Adult")
    .otherwise("Unknown")
)
df_with_category = df.withColumn("age_category", age_bucket)

print("DataFrame with age category:")
df_with_category.show()

DataFrame with age category:
+-----+---+--------------+------------+
| name|age|    occupation|age_category|
+-----+---+--------------+------------+
| John| 30|Civil Engineer|       Adult|
| Jane| 25|       Teacher|       Young|
|  Bob| 35|        Doctor|       Adult|
|Alice| 40|      Engineer|       Adult|
+-----+---+--------------+------------+



### drop()

In [58]:
# drop() returns a new DataFrame without the named column; df is not modified.
column_to_remove = "age"
df_without_age = df.drop(column_to_remove)
df_without_age.show()

+-----+--------------+
| name|    occupation|
+-----+--------------+
| John|Civil Engineer|
| Jane|       Teacher|
|  Bob|        Doctor|
|Alice|      Engineer|
+-----+--------------+



In [59]:
# Single-row frame whose only column "a" is a struct with fields b=1 and c=2.
nested_row = Row(a=Row(b=1, c=2))
df = spark.createDataFrame([nested_row])
df.show()

+------+
|     a|
+------+
|{1, 2}|
+------+



In [60]:
# withField() produces a copy of struct "a" with field b set to the literal 3;
# the struct column is then replaced and the updated field selected.
struct_with_new_b = df['a'].withField('b', lit(3))
df.withColumn('a', struct_with_new_b).select('a.b').show()

+---+
|  b|
+---+
|  3|
+---+

