In [1]:
import findspark

In [2]:
# Tell findspark which Spark installation to use before any pyspark import runs.
findspark.init(spark_home="/opt/manual/spark/")

In [3]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

In [4]:
# Create (or reuse) the session for this notebook, running Spark locally
# with the "local[2]" master.
spark = (
    SparkSession.builder
    .appName("Columns Expressions")
    .master("local[2]")
    .getOrCreate()
)

In [5]:
# Load the gzipped hotel-reviews CSV: the first row supplies the column names
# and column types are inferred from the data.
# The codec option key is spelled "compression"; a misspelled key is not
# recognized by the reader.
# NOTE(review): Spark documents `compression` as a write-side CSV option and
# appears to pick the gzip codec from the ".gz" extension when reading —
# confirm whether this option is needed here at all.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("compression", "gzip")
    .csv("file:///home/train/datasets/Hotel_Reviews.csv.gz")
)

In [6]:
# Derive a typed view of the raw frame:
#   * Tags        -> split on "," and cast to array<string>
#   * Review_Date -> parsed as a date using the "M/d/yyyy" pattern
df2 = (
    df
    .withColumn(
        "Tags",
        F.split(F.col("Tags"), ",").cast(ArrayType(StringType())),
    )
    .withColumn(
        "Review_Date",
        F.to_date("Review_Date", "M/d/yyyy"),
    )
)

In [7]:
# Print the column names and types of the transformed frame to confirm that
# Tags is now an array and Review_Date is a date.
df2.printSchema()

root
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: date (nullable = true)
 |-- Average_Score: double (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Negative_Review: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Positive_Review: string (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: double (nullable = true)
 |-- Tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)



In [8]:
# Show the raw score next to the same score multiplied by 10, where the
# multiplication is written as a SQL expression string.
df2.select(
    "Reviewer_Score",
    F.expr("Reviewer_Score * 10"),
).show(n=3)

+--------------+---------------------+
|Reviewer_Score|(Reviewer_Score * 10)|
+--------------+---------------------+
|           2.9|                 29.0|
|           7.5|                 75.0|
|           7.1|                 71.0|
+--------------+---------------------+
only showing top 3 rows



In [9]:
# Comparing a Column object with a literal produces a boolean column.
score_above_five = F.col("Reviewer_Score") > 5
df2.select(score_above_five).show(n=5)

+--------------------+
|(Reviewer_Score > 5)|
+--------------------+
|               false|
|                true|
|                true|
|               false|
|                true|
+--------------------+
only showing top 5 rows



In [10]:
# Add a boolean Is_Positive flag (score above 5.0, written as a SQL
# expression) and display it beside the score it was derived from.
(
    df2
    .withColumn("Is_Positive", F.expr("Reviewer_Score > 5.0"))
    .select("Reviewer_Score", "Is_Positive")
    .show(n=5)
)

+--------------+-----------+
|Reviewer_Score|Is_Positive|
+--------------+-----------+
|           2.9|      false|
|           7.5|       true|
|           7.1|       true|
|           3.8|      false|
|           6.7|       true|
+--------------+-----------+
only showing top 5 rows



In [11]:
# Column reference style 1: plain column-name string.
df2.select("Reviewer_Score").show(n=4)

+--------------+
|Reviewer_Score|
+--------------+
|           2.9|
|           7.5|
|           7.1|
|           3.8|
+--------------+
only showing top 4 rows



In [13]:
# Column reference style 2: Column object built with F.col().
df2.select(F.col("Reviewer_Score")).show(n=4)

+--------------+
|Reviewer_Score|
+--------------+
|           2.9|
|           7.5|
|           7.1|
|           3.8|
+--------------+
only showing top 4 rows



In [14]:
# Column reference style 3: item access on the DataFrame itself.
df2.select(df2["Reviewer_Score"]).show(n=4)

+--------------+
|Reviewer_Score|
+--------------+
|           2.9|
|           7.5|
|           7.1|
|           3.8|
+--------------+
only showing top 4 rows



In [15]:
# Column reference style 4: attribute access on the DataFrame itself.
df2.select(df2.Reviewer_Score).show(n=4)

+--------------+
|Reviewer_Score|
+--------------+
|           2.9|
|           7.5|
|           7.1|
|           3.8|
+--------------+
only showing top 4 rows



In [17]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()