<h2>Topics Covered</h2>
PySpark DataFrames
<br>Reading CSV Datasets with 1st row as header
<br>Checking the Data Types of Columns (Schema)
<br>Selecting One or More Columns to display
<br>PySpark describe() summary statistics, similar to Pandas
<br>Adding Columns
<br>Dropping Columns
<br>Renaming Columns

In [2]:
from pyspark.sql import SparkSession as SS

In [3]:
# Create the SparkSession used by the rest of the notebook
# (getOrCreate() returns the already-active session if there is one).
spark = (
    SS.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook renders the SparkSession object as this cell's output
spark

In [6]:
# Load the semicolon-delimited CSV. The 'header' reader option makes the first
# row the column names, and inferSchema asks Spark to infer each column's type.
df_spark = (
    spark.read
    .option('header', 'true')
    .csv('Customers_Details.csv', sep=';', inferSchema=True)
)

In [7]:
# Print the loaded rows as a text table (long string values are shown truncated with '...')
df_spark.show()

+-----------+------------+--------------------+
|Customer_ID|Customer_Age|      Customer_Email|
+-----------+------------+--------------------+
|          1|          30|maria_cramer87@gm...|
|          2|          41| j_steal@hotmail.com|
|          3|          50|georg_pipps@outlo...|
|          6|          25|andrea_nelson@gma...|
+-----------+------------+--------------------+



In [8]:
# Checking Schema: prints each column's name, data type and nullability
df_spark.printSchema()

root
 |-- Customer_ID: integer (nullable = true)
 |-- Customer_Age: integer (nullable = true)
 |-- Customer_Email: string (nullable = true)



In [9]:
# Read the same CSV again, this time passing the header and schema-inference
# settings as keyword arguments of csv() rather than through .option().
df_spark = spark.read.csv(
    'Customers_Details.csv',
    sep=';',
    header=True,
    inferSchema=True,
)
df_spark.show()

+-----------+------------+--------------------+
|Customer_ID|Customer_Age|      Customer_Email|
+-----------+------------+--------------------+
|          1|          30|maria_cramer87@gm...|
|          2|          41| j_steal@hotmail.com|
|          3|          50|georg_pipps@outlo...|
|          6|          25|andrea_nelson@gma...|
+-----------+------------+--------------------+



In [10]:
# Checking Schema of the DataFrame re-read with header=True and inferSchema=True
df_spark.printSchema()

root
 |-- Customer_ID: integer (nullable = true)
 |-- Customer_Age: integer (nullable = true)
 |-- Customer_Email: string (nullable = true)



In [11]:
# Inspect the Python type of the object returned by spark.read.csv
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [12]:
# List the column names, then fetch the first 3 rows as a list of Row objects
print(df_spark.columns)
df_spark.head(3)

['Customer_ID', 'Customer_Age', 'Customer_Email']


[Row(Customer_ID=1, Customer_Age=30, Customer_Email='maria_cramer87@gmail.com'),
 Row(Customer_ID=2, Customer_Age=41, Customer_Email='j_steal@hotmail.com'),
 Row(Customer_ID=3, Customer_Age=50, Customer_Email='georg_pipps@outlook.com')]

In [13]:
# Selecting a single column: select() yields a DataFrame containing only 'Customer_ID'
df_spark.select('Customer_ID').show()

+-----------+
|Customer_ID|
+-----------+
|          1|
|          2|
|          3|
|          6|
+-----------+



In [14]:
# Selecting multiple columns
df_spark.select('Customer_ID', 'Customer_Age').show()

+-----------+------------+
|Customer_ID|Customer_Age|
+-----------+------------+
|          1|          30|
|          2|          41|
|          3|          50|
|          6|          25|
+-----------+------------+



In [15]:
# List of (column name, type string) pairs, one per column
df_spark.dtypes

[('Customer_ID', 'int'), ('Customer_Age', 'int'), ('Customer_Email', 'string')]

In [16]:
# Summary statistics (count, mean, stddev, min, max) for every column;
# the string column has no mean/stddev, so those cells are NULL.
df_spark.describe().show()

+-------+-----------------+------------------+--------------------+
|summary|      Customer_ID|      Customer_Age|      Customer_Email|
+-------+-----------------+------------------+--------------------+
|  count|                4|                 4|                   4|
|   mean|              3.0|              36.5|                NULL|
| stddev|2.160246899469287|11.210114480533493|                NULL|
|    min|                1|                25|andrea_nelson@gma...|
|    max|                6|                50|maria_cramer87@gm...|
+-------+-----------------+------------------+--------------------+



In [17]:
# Adding columns: derive a new 'Address' column as Customer_ID + 5000
df_spark = df_spark.withColumn(
    'Address',
    df_spark['Customer_ID'] + 5000,
)

In [18]:
# Display the DataFrame, which now includes the 'Address' column
df_spark.show() 

+-----------+------------+--------------------+-------+
|Customer_ID|Customer_Age|      Customer_Email|Address|
+-----------+------------+--------------------+-------+
|          1|          30|maria_cramer87@gm...|   5001|
|          2|          41| j_steal@hotmail.com|   5002|
|          3|          50|georg_pipps@outlo...|   5003|
|          6|          25|andrea_nelson@gma...|   5006|
+-----------+------------+--------------------+-------+



In [19]:
# Dropping columns: drop() returns a DataFrame without 'Address'; reassign to keep the result
df_spark = df_spark.drop('Address')

In [20]:
# Display the DataFrame after 'Address' has been dropped
df_spark.show()

+-----------+------------+--------------------+
|Customer_ID|Customer_Age|      Customer_Email|
+-----------+------------+--------------------+
|          1|          30|maria_cramer87@gm...|
|          2|          41| j_steal@hotmail.com|
|          3|          50|georg_pipps@outlo...|
|          6|          25|andrea_nelson@gma...|
+-----------+------------+--------------------+



In [21]:
# Rename a column: 'Customer_ID' -> 'ID'. The renamed DataFrame is only displayed here;
# it is not assigned back, so df_spark itself keeps the original column name.
df_spark.withColumnRenamed('Customer_ID','ID').show()

+---+------------+--------------------+
| ID|Customer_Age|      Customer_Email|
+---+------------+--------------------+
|  1|          30|maria_cramer87@gm...|
|  2|          41| j_steal@hotmail.com|
|  3|          50|georg_pipps@outlo...|
|  6|          25|andrea_nelson@gma...|
+---+------------+--------------------+

