### PySpark is an open-source framework that comes pre-installed in Google Colab. Using PySpark there is very easy: no `!pip install` command is needed to install it. Just import the `pyspark` library.

In [8]:
# Importing PySpark
import pyspark

In [9]:
# Connect Google Colab to Google Drive so files stored in Drive can be read
# through the local path '/content/drive'.
from google.colab import drive
# Mount Drive at '/content/drive'; when it is already mounted, Colab only
# prints a notice suggesting force_remount=True.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# pandas is used here only to load the same CSV for comparison with Spark
import pandas as pd

# Load 'PySpark_test1.csv' from the mounted Drive into a pandas DataFrame
df_pd = pd.read_csv(
    '/content/drive/MyDrive/Datasets/PySpark_test1.csv'
)

# Print the type explicitly: only the last expression of a cell is displayed
print(type(df_pd))

# Preview the first rows of the pandas DataFrame
df_pd.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Name,age,Experience,Salary
0,Krish,31,10,30000
1,Sudhanshu,30,8,25000
2,Sunny,29,4,20000
3,Paul,24,3,20000
4,Harsha,21,1,15000


In [11]:
# Importing a Spark Session from PySpark sql module
from pyspark.sql import SparkSession

In [14]:
# Build a SparkSession named 'Practise', or reuse the existing one if a
# session is already running in this kernel
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [15]:
# Display the SparkSession object (a bare last expression is rendered as the cell output)
spark

In [16]:
# Read the 'PySpark_test1.csv' file through the SparkSession's reader,
# without passing any reader options
df_pyspark = spark.read.csv('/content/drive/MyDrive/Datasets/PySpark_test1.csv')

In [17]:
# See the entire data set
df_pyspark.show() # Columns get the default names _c0, _c1, _c2 and _c3, and the CSV header line appears as the first data row

+---------+---+----------+------+
|      _c0|_c1|       _c2|   _c3|
+---------+---+----------+------+
|     Name|age|Experience|Salary|
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [20]:
# Re-read the CSV and tell Spark that its first line holds the column names,
# so the default _c0, _c1, ... names are replaced by the actual header
df_pyspark = spark.read.csv(
    '/content/drive/MyDrive/Datasets/PySpark_test1.csv',
    header=True,
)

In [21]:
# Display the data set again; the columns now carry the names from the CSV header
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [23]:
# Check the type of df_pyspark: it is a PySpark DataFrame, not a pandas DataFrame
print(type(df_pyspark))

<class 'pyspark.sql.dataframe.DataFrame'>


In [24]:
# Get the first 3 rows of df_pyspark; head(3) returns a list of Row objects,
# and every value (including age, Experience and Salary) is a string here
df_pyspark.head(3)

[Row(Name='Krish', age='31', Experience='10', Salary='30000'),
 Row(Name='Sudhanshu', age='30', Experience='8', Salary='25000'),
 Row(Name='Sunny', age='29', Experience='4', Salary='20000')]

In [25]:
# Print the schema of df_pyspark (the 'PySpark_test1.csv' data set): column names,
# data types and nullability. Every column is reported as string here,
# including the numeric-looking ones.
df_pyspark.printSchema() # Comparable to pandas' df.info()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)

