## Read a CSV file into a Spark DataFrame

In [1]:
from pyspark.sql import SparkSession

In [3]:
# Reuse the active Spark session if one already exists; otherwise start a new one under this application name.
spark = (
    SparkSession.builder
    .appName("Read Dataframe FROM CSV")
    .getOrCreate()
)

In [17]:
# Load the customers CSV. The first line supplies the column names, and
# Spark infers each column's type from the data instead of reading everything as strings.
df_1 = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/content/first_100_customers.csv")
)

In [18]:
# Print the column names and the types Spark inferred from the CSV (inferSchema was enabled on load).
df_1.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- is_active: boolean (nullable = true)



In [8]:
# Preview the first 10 rows to sanity-check the parsed values.
df_1.show(10)

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          0|Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3|Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4|Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
|          5|Customer_5|Hyderabad|  Karnataka|  India|       2023-07-28|    false|
|          6|Customer_6|     Pune|      Delhi|  India|       2023-08-29|    false|
|          7|Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     true|
|          8|Customer_8|     Pune|  Karnataka|  India|       2023-06-22|     true|
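|          9|Customer_9|   Mumbai|  Telangana|  India|       2023-01-05|     true|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 10 rows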

### Count the number of columns and rows

In [10]:
# Number of columns: df_1.columns is a plain Python list of the column names.
len(df_1.columns)

7

In [11]:
# Number of rows in the DataFrame. The CSV header line was consumed as
# column names on load (header=true), so it is not counted here.
df_1.count()

99

### Filter

In [29]:
# Column-expression filter: keep only rows whose is_active flag is true, then preview four of them.
df_1.filter(df_1["is_active"] == True).show(4)

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          7|Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     true|
|          8|Customer_8|     Pune|  Karnataka|  India|       2023-06-22|     true|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 4 rows



In [26]:
# Count how many customers have the is_active flag set to true.
df_1.filter(df_1["is_active"] == True).count()

53

In [28]:
# Filter on the boolean flag first, then project only the two columns of interest.
# The column is spelled exactly as in the schema ("is_active"): a differently-cased
# name such as "is_Active" resolves only while spark.sql.caseSensitive is false
# (the default) and is echoed verbatim in the output header.
df_1.filter(df_1.is_active == True).select("name", "is_active").show(4)

+----------+---------+
|      name|is_Active|
+----------+---------+
|Customer_1|     true|
|Customer_2|     true|
|Customer_7|     true|
|Customer_8|     true|
+----------+---------+
only showing top 4 rows



#### Second method: filter with a SQL expression string

In [30]:
# Same filter as above, written as a SQL expression string that Spark parses,
# instead of a Python column expression.
df_1.filter('is_active == True').show(5)

+-----------+----------+---------+-----------+-------+-----------------+---------+
|customer_id|      name|     city|      state|country|registration_date|is_active|
+-----------+----------+---------+-----------+-------+-----------------+---------+
|          1|Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2|Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          7|Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     true|
|          8|Customer_8|     Pune|  Karnataka|  India|       2023-06-22|     true|
|          9|Customer_9|   Mumbai|  Telangana|  India|       2023-01-05|     true|
+-----------+----------+---------+-----------+-------+-----------------+---------+
only showing top 5 rows



### Select Different Columns

In [31]:
# Project the DataFrame down to three columns; the result is a new DataFrame and df_1 is left unchanged.
selected_columns = df_1.select(
    "customer_id",
    "name",
    "city",
)

In [32]:
# Evaluating a DataFrame without calling show() only displays its column names
# and data types, not the rows themselves.
selected_columns

DataFrame[customer_id: int, name: string, city: string]

In [33]:
# Call show() to actually display rows; here, the first 5 rows of the projected columns.
selected_columns.show(5)

+-----------+----------+---------+
|customer_id|      name|     city|
+-----------+----------+---------+
|          0|Customer_0|     Pune|
|          1|Customer_1|Bangalore|
|          2|Customer_2|Hyderabad|
|          3|Customer_3|Bangalore|
|          4|Customer_4|Ahmedabad|
+-----------+----------+---------+
only showing top 5 rows

