# **PySpark Create Session**

# **Installation & Import**

In [1]:
# install pyspark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# import pyspark
import pyspark

In [3]:
# check the installed pyspark version
pyspark.__version__

'3.3.2'

# **Start Spark Session**

In [4]:
# import the SparkSession class; the session itself is created in the following cells
from pyspark.sql import SparkSession

In [5]:
# create a spark session using the builder's default settings
spark = SparkSession.builder.getOrCreate()

In [6]:
# alternatively, set the master URL and the application name on the builder
# NOTE(review): a session already exists from the previous cell, so getOrCreate()
# presumably returns that one and master/appName may not take effect —
# verify with spark.sparkContext.appName
spark = SparkSession.builder.master('local[*]').appName('YBIFoundation').getOrCreate()

In [7]:
# same builder chain as the one-line version, with one call per line
# (wrapped in parentheses so no backslash continuations are needed)
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('YBIFoundation')
    .getOrCreate()
)

In [8]:
# display the session object assigned above
spark

# **Import Data**

In [9]:
# load the Google Colab sample csv with the reader's default options
sample = spark.read.csv('/content/sample_data/california_housing_train.csv')

In [10]:
# display the DataFrame; at this point only the column names and types are printed,
# and every column is a string named _c0 ... _c8
sample

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string]

In [11]:
# by default the csv reader does not treat the first row as a header,
# so the column names show up as the first data row
sample.show()

+-----------+---------+------------------+-----------+--------------+-----------+-----------+-------------+------------------+
|        _c0|      _c1|               _c2|        _c3|           _c4|        _c5|        _c6|          _c7|               _c8|
+-----------+---------+------------------+-----------+--------------+-----------+-----------+-------------+------------------+
|  longitude| latitude|housing_median_age|total_rooms|total_bedrooms| population| households|median_income|median_house_value|
|-114.310000|34.190000|         15.000000|5612.000000|   1283.000000|1015.000000| 472.000000|     1.493600|      66900.000000|
|-114.470000|34.400000|         19.000000|7650.000000|   1901.000000|1129.000000| 463.000000|     1.820000|      80100.000000|
|-114.560000|33.690000|         17.000000| 720.000000|    174.000000| 333.000000| 117.000000|     1.650900|      85700.000000|
|-114.570000|33.640000|         14.000000|1501.000000|    337.000000| 515.000000| 226.000000|     3.191700|    

In [12]:
# re-read the sample data, taking column names from the first row
# and letting spark infer the column types
sample = spark.read.csv(
    '/content/sample_data/california_housing_train.csv',
    header=True,
    inferSchema=True,
)

In [13]:
# the header row now supplies the column names and the values are shown as numbers
sample.show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -114.31|   34.19|              15.0|     5612.0|        1283.0|    1015.0|     472.0|       1.4936|           66900.0|
|  -114.47|    34.4|              19.0|     7650.0|        1901.0|    1129.0|     463.0|         1.82|           80100.0|
|  -114.56|   33.69|              17.0|      720.0|         174.0|     333.0|     117.0|       1.6509|           85700.0|
|  -114.57|   33.64|              14.0|     1501.0|         337.0|     515.0|     226.0|       3.1917|           73400.0|
|  -114.57|   33.57|              20.0|     1454.0|         326.0|     624.0|     262.0|        1.925|           65500.0|
|  -114.58|   33.63|    

In [14]:
# print the first two records vertically, one field per line
sample.show(n=2, vertical=True)

-RECORD 0---------------------
 longitude          | -114.31 
 latitude           | 34.19   
 housing_median_age | 15.0    
 total_rooms        | 5612.0  
 total_bedrooms     | 1283.0  
 population         | 1015.0  
 households         | 472.0   
 median_income      | 1.4936  
 median_house_value | 66900.0 
-RECORD 1---------------------
 longitude          | -114.47 
 latitude           | 34.4    
 housing_median_age | 19.0    
 total_rooms        | 7650.0  
 total_bedrooms     | 1901.0  
 population         | 1129.0  
 households         | 463.0   
 median_income      | 1.82    
 median_house_value | 80100.0 
only showing top 2 rows



In [15]:
# turn on eager evaluation so that a bare DataFrame expression renders its rows
# instead of only the column names and types
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
# with the setting enabled, evaluating the DataFrame displays its contents
sample

longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0
-114.58,33.63,29.0,1387.0,236.0,671.0,239.0,3.3438,74000.0
-114.58,33.61,25.0,2907.0,680.0,1841.0,633.0,2.6768,82400.0
-114.59,34.83,41.0,812.0,168.0,375.0,158.0,1.7083,48500.0
-114.59,33.61,34.0,4789.0,1175.0,3134.0,1056.0,2.1782,58400.0
-114.6,34.83,46.0,1497.0,309.0,787.0,271.0,2.1908,48100.0


# **Stop Spark Session**

In [16]:
# stop the spark session now that the notebook is finished
spark.stop()