<a href="https://colab.research.google.com/github/augustine-uba1/databricks_machineLearning/blob/main/PySPark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PySpark Introduction

In [None]:
## Install pyspark
!pip install pyspark

**Import pyspark**

In [3]:
import pyspark

In [9]:
## SparkSession (from pyspark.sql) is the entry point for working with Spark SQL in PySpark.
## The builder chain below sets the application name to 'Sample'; getOrCreate()
## returns the SparkSession that already exists, or creates a new one if there is none.

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('Sample')
    .getOrCreate()
)

In [10]:
## show the SparkSession object as the cell output
spark

In [22]:
## Load users1.csv into a PySpark DataFrame, using the first line of the file as the header.
## Other read formats are available (csv, parquet, table, ...) - see the documentation.
pyspark_df = (
    spark.read
    .option('header', 'true')
    .csv('users1.csv')
)

In [23]:
## print the rows of the DataFrame as a text table
pyspark_df.show()

+--------+---+----------+
|Username|Age|experience|
+--------+---+----------+
|     Sam| 54|        15|
|  Justin| 31|        12|
|    Phil| 20|         5|
|  Roland| 25|         7|
+--------+---+----------+



In [37]:
## confirm that the object is a pyspark.sql DataFrame
type(pyspark_df)

pyspark.sql.dataframe.DataFrame

In [38]:
## list (column name, type) pairs
## NOTE: the saved output of this cell shows 'int' for Age and experience because the cell
## was executed after the inferSchema read further down (execution count 38 vs 33).
## Run top to bottom, the header-only read above yields 'string' for every column.
pyspark_df.dtypes

[('Username', 'string'), ('Age', 'int'), ('experience', 'int')]

In [25]:
## print the schema as a tree; with the header-only read every column comes back as a string
pyspark_df.printSchema()

root
 |-- Username: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- experience: string (nullable = true)



In [26]:
## head(5) returns a list of Row objects; only 4 come back here because the file has 4 data rows
pyspark_df.head(5)

[Row(Username='Sam', Age='54', experience='15'),
 Row(Username='Justin', Age='31', experience='12'),
 Row(Username='Phil', Age='20', experience='5'),
 Row(Username='Roland', Age='25', experience='7')]

# Data Wrangling - Dataframe operations
*   viewing datatype schema
*   returning specific columns and column indexing
*   adding, dropping and renaming columns



In [27]:
## check the datatype schema: at this point every column is still typed as string
pyspark_df.printSchema()

root
 |-- Username: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- experience: string (nullable = true)



In [33]:
## Re-read the file, this time letting Spark infer each column's data type
## (inferSchema=True) instead of leaving every column as a string.
pyspark_df = spark.read.csv(
    'users1.csv',
    header=True,
    inferSchema=True,
)
pyspark_df.show()

+--------+---+----------+
|Username|Age|experience|
+--------+---+----------+
|     Sam| 54|        15|
|  Justin| 31|        12|
|    Phil| 20|         5|
|  Roland| 25|         7|
+--------+---+----------+



In [29]:
## check the datatype schema again: Age and experience are now integers
pyspark_df.printSchema()

root
 |-- Username: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [35]:
## Pick several columns by name; select() returns a new DataFrame holding only those columns.
pyspark_df.select('Username', 'experience').show()

+--------+----------+
|Username|experience|
+--------+----------+
|     Sam|        15|
|  Justin|        12|
|    Phil|         5|
|  Roland|         7|
+--------+----------+



In [47]:
## Indexing a column: df['name'] returns the Column object for that column,
## which can then be aliased and passed to select().
## (The original used pyspark_df['experience',][0]: the trailing comma made the subscript
## a tuple, which built an intermediate one-column DataFrame that [0] then indexed by
## position. The direct lookup below yields the same column without that detour.)
indexed_col = pyspark_df.select(pyspark_df['experience'].alias("indexed_col"))
indexed_col.show()

+-----------+
|indexed_col|
+-----------+
|         15|
|         12|
|          5|
|          7|
+-----------+



In [48]:
## describing (summarising) values in a dataframe: count, mean, stddev, min and max per column
pyspark_df.describe().show()

+-------+--------+------------------+-----------------+
|summary|Username|               Age|       experience|
+-------+--------+------------------+-----------------+
|  count|       4|                 4|                4|
|   mean|    null|              32.5|             9.75|
| stddev|    null|15.022205785658333|4.573474244670748|
|    min|  Justin|                20|                5|
|    max|     Sam|                54|               15|
+-------+--------+------------------+-----------------+



In [52]:
## Add a derived column: each row's 'experience' value plus 10.
## withColumn() returns a new DataFrame, so the result is assigned back to pyspark_df.
pyspark_df = pyspark_df.withColumn(
    'experience after 10 years',
    pyspark_df['experience'] + 10,
)
pyspark_df.show()

+--------+---+----------+-------------------------+
|Username|Age|experience|experience after 10 years|
+--------+---+----------+-------------------------+
|     Sam| 54|        15|                       25|
|  Justin| 31|        12|                       22|
|    Phil| 20|         5|                       15|
|  Roland| 25|         7|                       17|
+--------+---+----------+-------------------------+



In [54]:
## dropping a column: drop() returns a DataFrame without it, assigned back to pyspark_df
pyspark_df = pyspark_df.drop('experience after 10 years')
pyspark_df.show()

+--------+---+----------+
|Username|Age|experience|
+--------+---+----------+
|     Sam| 54|        15|
|  Justin| 31|        12|
|    Phil| 20|         5|
|  Roland| 25|         7|
+--------+---+----------+



In [55]:
## Rename a column.
## The renamed DataFrame is only displayed here; it is not assigned back,
## so pyspark_df itself still has the column named 'experience'.
pyspark_df.withColumnRenamed('experience', 'working years').show()

+--------+---+-------------+
|Username|Age|working years|
+--------+---+-------------+
|     Sam| 54|           15|
|  Justin| 31|           12|
|    Phil| 20|            5|
|  Roland| 25|            7|
+--------+---+-------------+

