Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Python with PySpark

## Installing PySpark and importing data

In [None]:
!pip install pyspark

In [31]:
import pyspark

In [32]:
from pyspark.sql import SparkSession

In [33]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [34]:
# Display the session object to confirm it was created.
spark

In [35]:
import pandas as pd
from google.colab import drive

# Mount Google Drive so files stored there are reachable under /content/drive.
drive.mount('/content/drive')

# Load the CSV with pandas.
test1 = pd.read_csv("/content/drive/My Drive/test1.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
# Preview the first rows of the pandas DataFrame instead of dumping the whole frame;
# for this small file the preview shows every row.
test1.head()

Unnamed: 0,Name,Age,Employment status
0,Alekhya,25,no
1,Jaswanth,31,yes
2,Kishore,55,yes
3,Gayatri,45,yes


In [37]:
# Read the CSV with Spark using the default options. No header option is set here,
# so the first line of the file is loaded as a data row and the columns get the
# generated names _c0, _c1, _c2.
df_pyspark = spark.read.csv("/content/drive/My Drive/test1.csv")

In [38]:
# Print the DataFrame as a text table to inspect how the file was parsed.
df_pyspark.show()

+--------+---+-----------------+
|     _c0|_c1|              _c2|
+--------+---+-----------------+
|    Name|Age|Employment status|
| Alekhya| 25|               no|
|Jaswanth| 31|              yes|
| Kishore| 55|              yes|
| Gayatri| 45|              yes|
+--------+---+-----------------+



In [39]:
# The header row (Name, Age, ...) was read as data above. Setting the 'header'
# option to 'true' makes Spark use the first line of the file as the column names.
df_pyspark = spark.read.option('header','true').csv("/content/drive/My Drive/test1.csv")

In [40]:
# Confirm that the first line of the file is now used for the column names.
df_pyspark.show()

+--------+---+-----------------+
|    Name|Age|Employment status|
+--------+---+-----------------+
| Alekhya| 25|               no|
|Jaswanth| 31|              yes|
| Kishore| 55|              yes|
| Gayatri| 45|              yes|
+--------+---+-----------------+



In [42]:
# Check the data type of each column.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Employment status: string (nullable = true)



In [44]:
# Every column, including the numeric Age column, was read as a string above.
# Passing inferSchema=True makes Spark infer the column types while reading the file.
df_pyspark = spark.read.option('header','true').csv("/content/drive/My Drive/test1.csv",inferSchema = True)


In [45]:
# Check the data type of each column again; Age is now reported as an integer.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Employment status: string (nullable = true)



In [85]:
# The header and inferSchema settings can be passed together as keyword arguments
# to a single csv() call instead of chaining option().
df_pyspark1 = spark.read.csv(
    "/content/drive/My Drive/test1.csv",
    header=True,
    inferSchema=True,
)

In [86]:
# A bare DataFrame expression displays only the column names and types, not the rows.
df_pyspark1

DataFrame[Name: string, Age: int, Employment status: string]

In [87]:
#to check data type od each column 
df_pyspark1.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Employment status: string (nullable = true)



In [88]:
# Print the rows of the DataFrame as a text table.
df_pyspark1.show()

+--------+---+-----------------+
|    Name|Age|Employment status|
+--------+---+-----------------+
| Alekhya| 25|               no|
|Jaswanth| 31|              yes|
| Kishore| 55|              yes|
| Gayatri| 45|              yes|
+--------+---+-----------------+



In [89]:
# The object returned by spark.read.csv is a PySpark DataFrame, not a pandas one.
type(df_pyspark1)

pyspark.sql.dataframe.DataFrame

In [90]:
# List the column names.
df_pyspark1.columns

['Name', 'Age', 'Employment status']

In [91]:
# head(3) returns the first three rows as a list of Row objects rather than printing a table.
df_pyspark1.head(3)

[Row(Name='Alekhya', Age=25, Employment status='no'),
 Row(Name='Jaswanth', Age=31, Employment status='yes'),
 Row(Name='Kishore', Age=55, Employment status='yes')]

## Data Handling


*   Selecting data from a DataFrame
*   Dropping a column from a DataFrame
*   Creating a new column
*   Renaming a column












In [92]:
# Selecting a single column from the DataFrame; show() prints the selection as a table.
df_pyspark1.select("Name").show()

+--------+
|    Name|
+--------+
| Alekhya|
|Jaswanth|
| Kishore|
| Gayatri|
+--------+



In [93]:
# Select several columns at once by passing each column name as its own argument.
df_pyspark1.select("Name", "Age").show()

+--------+---+
|    Name|Age|
+--------+---+
| Alekhya| 25|
|Jaswanth| 31|
| Kishore| 55|
| Gayatri| 45|
+--------+---+



In [94]:

# Indexing a DataFrame by column name gives a Column object, not the column's values.
type(df_pyspark1['Name'])

pyspark.sql.column.Column

In [95]:
# Check the type of each column through dtypes, which lists (column name, type) pairs.
df_pyspark1.dtypes

[('Name', 'string'), ('Age', 'int'), ('Employment status', 'string')]

In [96]:
# describe() computes summary statistics (count, mean, stddev, min, max) for each
# column; mean and stddev come out as null for the string columns.
df_pyspark1.describe().show()

+-------+-------+------------------+-----------------+
|summary|   Name|               Age|Employment status|
+-------+-------+------------------+-----------------+
|  count|      4|                 4|                4|
|   mean|   null|              39.0|             null|
| stddev|   null|13.564659966250536|             null|
|    min|Alekhya|                25|               no|
|    max|Kishore|                55|              yes|
+-------+-------+------------------+-----------------+



In [97]:
# Creating a new column: withColumn adds 'Age after 2 years', computed as Age + 2.
# The result is assigned back to df_pyspark1, so later cells see the extra column.
df_pyspark1 = df_pyspark1.withColumn('Age after 2 years',df_pyspark1['Age'] + 2)

In [98]:
# Show the DataFrame with the newly added column.
df_pyspark1.show()

+--------+---+-----------------+-----------------+
|    Name|Age|Employment status|Age after 2 years|
+--------+---+-----------------+-----------------+
| Alekhya| 25|               no|               27|
|Jaswanth| 31|              yes|               33|
| Kishore| 55|              yes|               57|
| Gayatri| 45|              yes|               47|
+--------+---+-----------------+-----------------+



In [99]:
# Dropping a column: remove 'Age after 2 years' and assign the result back to df_pyspark1.
df_pyspark1 = df_pyspark1.drop('Age after 2 years')

In [100]:
# Confirm that the column has been removed.
df_pyspark1.show()

+--------+---+-----------------+
|    Name|Age|Employment status|
+--------+---+-----------------+
| Alekhya| 25|               no|
|Jaswanth| 31|              yes|
| Kishore| 55|              yes|
| Gayatri| 45|              yes|
+--------+---+-----------------+



In [101]:
# Renaming a column: 'Name' becomes 'Employee_name'.
# NOTE(review): df_pyspark1 is reassigned here, so earlier cells that refer to the
# 'Name' column (e.g. select("Name")) would presumably fail if re-run after this
# cell — consider assigning the result to a new variable name.
df_pyspark1 = df_pyspark1.withColumnRenamed('Name','Employee_name')

In [102]:
# Show the DataFrame with the renamed column.
df_pyspark1.show()

+-------------+---+-----------------+
|Employee_name|Age|Employment status|
+-------------+---+-----------------+
|      Alekhya| 25|               no|
|     Jaswanth| 31|              yes|
|      Kishore| 55|              yes|
|      Gayatri| 45|              yes|
+-------------+---+-----------------+

