In [1]:
# Initialise findspark against the Spark installation directory; this cell runs
# ahead of the pyspark imports in the next cell.
import findspark

findspark.init(spark_home="/opt/manual/spark/")

In [2]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

In [4]:
# Create (or reuse) a SparkSession that runs locally with two threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("df to disk as csv")
    .getOrCreate()
)

In [5]:
# Load the gzipped hotel-reviews CSV from the local filesystem.
# - header:      the first row supplies the column names
# - inferSchema: column types are inferred from the data
# - compression: the key must be spelled "compression"; a misspelled option key
#                is accepted silently by Spark and has no effect.
# NOTE(review): for reads the codec is presumably picked from the ".gz" file
# extension, so this option may be unnecessary here — confirm and drop if so.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("compression", "gzip")
    .csv("file:///home/train/datasets/Hotel_Reviews.csv.gz")
)

In [6]:
# Derive the typed frame:
# - Tags: split the comma-separated string into an array of strings
# - Review_Date: parse the text using the pattern M/d/yyyy into a date
df2 = (
    df
    .withColumn("Tags", F.split("Tags", ",").cast(ArrayType(StringType())))
    .withColumn("Review_Date", F.to_date(F.col("Review_Date"), "M/d/yyyy"))
)

In [7]:
# Print the column names and types of the transformed frame.
df2.printSchema()

root
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: date (nullable = true)
 |-- Average_Score: double (nullable = true)
 |-- Hotel_Name: string (nullable = true)
 |-- Reviewer_Nationality: string (nullable = true)
 |-- Negative_Review: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Positive_Review: string (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: double (nullable = true)
 |-- Tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)



In [8]:
from pyspark.sql.types import StringType

In [9]:
# Write the frame to a local directory as CSV with a header row, replacing any
# previous output. Tags is cast back to a plain string first (the CSV data
# source does not accept array columns).
(
    df2
    .withColumn("Tags", F.col("Tags").cast("string"))
    .write
    .csv(
        "file:///home/train/my_pyspark/hotel_reviews_csv",
        mode="overwrite",
        header=True,
    )
)

In [10]:
# List the files in the CSV output directory.
! ls -l ~/my_pyspark/hotel_reviews_csv

total 233424
-rw-r--r--. 1 train train 239023746 Aug 11 22:21 part-00000-acf55f3e-8004-4683-9ee5-aeb762dcbbab-c000.csv
-rw-r--r--. 1 train train         0 Aug 11 22:21 _SUCCESS


In [13]:
# Read the CSV output directory back in, taking column names from the header
# row and letting Spark infer the column types.
df3 = spark.read.csv(
    "file:///home/train/my_pyspark/hotel_reviews_csv",
    header=True,
    inferSchema=True,
)

In [14]:
# Show three rows of the re-read data as a pandas DataFrame.
df3.limit(3).toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry that i made this post available ...,397,1403,Only the park outside of the hotel was beautiful,11,7,2.9,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.3605759,4.9159683
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great l...,105,7,7.5,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.3605759,4.9159683
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam N...,194,2017-07-31,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficult...,42,1403,Location was good and staff were ok It is cute...,21,9,7.1,"[[' Leisure trip ', ' Family with young child...",3 days,52.3605759,4.9159683


In [15]:
# Stop the SparkSession at the end of the notebook.
spark.stop()