In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('PySpark Dataframes').getOrCreate()

22/12/20 00:47:20 WARN Utils: Your hostname, Abhinavs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.5 instead (on interface en0)
22/12/20 00:47:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/20 00:47:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/12/20 00:47:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
spark

In [4]:
# read the dataset
df_pyspark = spark.read.option('header', 'true').csv('test_data-1.csv')

# check the schema
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [5]:
# read the dataset by inferring the datatypes

df_pyspark = spark.read.option('header', 'true').csv('test_data-1.csv', inferSchema=True)

df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [6]:
# best way of reading dataset

df_pyspark = spark.read.csv('test_data-1.csv', header=True, inferSchema=True)

df_pyspark.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
| John| 35|        12|
| Jane| 34|        11|
|Stacy| 28|         8|
|Jimmy| 25|         5|
+-----+---+----------+



In [7]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [8]:
# get top 3 rows

df_pyspark.head(3)

[Row(Name='John', Age=35, Experience=12),
 Row(Name='Jane', Age=34, Experience=11),
 Row(Name='Stacy', Age=28, Experience=8)]

In [9]:
# select a particular column

df_pyspark.select('Name').show()

+-----+
| Name|
+-----+
| John|
| Jane|
|Stacy|
|Jimmy|
+-----+



In [10]:
# select multiple columns

df_pyspark.select(['Name', 'Experience']).show()

+-----+----------+
| Name|Experience|
+-----+----------+
| John|        12|
| Jane|        11|
|Stacy|         8|
|Jimmy|         5|
+-----+----------+



In [11]:
df_pyspark['Name']

Column<'Name'>

In [12]:
# check datatypes

df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [13]:
# describe the dataframe

df_pyspark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [14]:
# adding columns in the dataframe

df_pyspark = df_pyspark.withColumn('Experience After 2 Years', df_pyspark['Experience'] + 2)

df_pyspark.show()

+-----+---+----------+------------------------+
| Name|Age|Experience|Experience After 2 Years|
+-----+---+----------+------------------------+
| John| 35|        12|                      14|
| Jane| 34|        11|                      13|
|Stacy| 28|         8|                      10|
|Jimmy| 25|         5|                       7|
+-----+---+----------+------------------------+



In [15]:
# drop the columns

df_pyspark = df_pyspark.drop('Experience After 2 Years')

df_pyspark.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
| John| 35|        12|
| Jane| 34|        11|
|Stacy| 28|         8|
|Jimmy| 25|         5|
+-----+---+----------+



In [16]:
# rename the columns

df_pyspark = df_pyspark.withColumnRenamed('Name', 'New Name')

df_pyspark.show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|    John| 35|        12|
|    Jane| 34|        11|
|   Stacy| 28|         8|
|   Jimmy| 25|         5|
+--------+---+----------+

