In [1]:
# Create (or reuse) the SparkSession that the rest of the notebook works with.

from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [2]:
# Display the SparkSession object created in the previous cell to confirm it exists.
spark

In [3]:
# Load test.csv through the reader's option() API: treat the first line as the
# header and let Spark infer the column types.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('test.csv')
)

In [4]:
# Print the column names and the types inferred while reading test.csv.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [5]:
# Load the same file again, this time passing the reader options as keyword
# arguments to csv() instead of chaining option().
df_pyspark = spark.read.csv(
    path='test.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Preview the loaded rows (show()'s default limits spelled out explicitly).
df_pyspark.show(n=20, truncate=True)

+---------+------------+---+----------+------+
|     name|  department|age|experience|salary|
+---------+------------+---+----------+------+
|  Bhaskar|data science| 27|        10|  2000|
|    krish|         IOT| 31|        12| 35000|
|Sudhanshu|    Big data| 29|        24| 46999|
|    krish|    Big data| 21|         1| 50000|
|Sudhanshu|    Big data| 23|         2|346768|
|Sudhanshu|         IOT| 44|         3| 60000|
|    krish|data science| 33|         4|    45|
|   Mahesh|data science| 34|        10| 38000|
+---------+------------+---+----------+------+



In [7]:
# Print the schema of the frame re-read with keyword arguments.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [8]:
# Check the Python type of the object returned by spark.read.csv().
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [9]:
# List the column names of the DataFrame.
df_pyspark.columns

['name', 'department', 'age', 'experience', 'salary']

In [10]:
# Fetch the first three rows to the driver as a list of Row objects.
df_pyspark.take(3)

[Row(name='Bhaskar', department='data science', age=27, experience=10, salary=2000),
 Row(name='krish', department='IOT', age=31, experience=12, salary=35000),
 Row(name='Sudhanshu', department='Big data', age=29, experience=24, salary=46999)]

In [11]:
# Project a single column; the result is still a DataFrame, so show() works on it.
df_pyspark.select(df_pyspark['name']).show()

+---------+
|     name|
+---------+
|  Bhaskar|
|    krish|
|Sudhanshu|
|    krish|
|Sudhanshu|
|Sudhanshu|
|    krish|
|   Mahesh|
+---------+



In [12]:
# Check the type of the object that select() returns for a single column.
type(df_pyspark.select('name'))

pyspark.sql.dataframe.DataFrame

In [13]:
# Project several columns at once (names passed as separate arguments).
df_pyspark.select('name', 'age').show()

+---------+---+
|     name|age|
+---------+---+
|  Bhaskar| 27|
|    krish| 31|
|Sudhanshu| 29|
|    krish| 21|
|Sudhanshu| 23|
|Sudhanshu| 44|
|    krish| 33|
|   Mahesh| 34|
+---------+---+



In [14]:
# List each column together with its data type as (name, type) pairs.
df_pyspark.dtypes

[('name', 'string'),
 ('department', 'string'),
 ('age', 'int'),
 ('experience', 'int'),
 ('salary', 'int')]

In [15]:
# Summary statistics for every column; describe() returns a DataFrame,
# so it has to be rendered with show().
(
    df_pyspark
    .describe()
    .show()
)

+-------+-------+------------+-----------------+-----------------+------------------+
|summary|   name|  department|              age|       experience|            salary|
+-------+-------+------------+-----------------+-----------------+------------------+
|  count|      8|           8|                8|                8|                 8|
|   mean|   NULL|        NULL|            30.25|             8.25|           72351.5|
| stddev|   NULL|        NULL|7.186296483088988|7.611082145698563|112980.42651716269|
|    min|Bhaskar|    Big data|               21|                1|                45|
|    max|  krish|data science|               44|               24|            346768|
+-------+-------+------------+-----------------+-----------------+------------------+



In [16]:
# Derive a new column from 'experience' (experience + 2) and rebind the name
# to the widened frame.
df_pyspark = df_pyspark.withColumn(
    'Exp after 2 years',
    df_pyspark['experience'] + 2,
)

In [17]:
# Confirm that the 'Exp after 2 years' column is now part of the frame.
df_pyspark.show(n=20, truncate=True)

+---------+------------+---+----------+------+-----------------+
|     name|  department|age|experience|salary|Exp after 2 years|
+---------+------------+---+----------+------+-----------------+
|  Bhaskar|data science| 27|        10|  2000|               12|
|    krish|         IOT| 31|        12| 35000|               14|
|Sudhanshu|    Big data| 29|        24| 46999|               26|
|    krish|    Big data| 21|         1| 50000|                3|
|Sudhanshu|    Big data| 23|         2|346768|                4|
|Sudhanshu|         IOT| 44|         3| 60000|                5|
|    krish|data science| 33|         4|    45|                6|
|   Mahesh|data science| 34|        10| 38000|               12|
+---------+------------+---+----------+------+-----------------+



In [18]:
# Drop the derived column again and rebind the name to the narrower frame.
df_pyspark = df_pyspark.drop(
    'Exp after 2 years',
)

In [19]:
# Confirm that 'Exp after 2 years' has been removed.
df_pyspark.show(n=20, truncate=True)

+---------+------------+---+----------+------+
|     name|  department|age|experience|salary|
+---------+------------+---+----------+------+
|  Bhaskar|data science| 27|        10|  2000|
|    krish|         IOT| 31|        12| 35000|
|Sudhanshu|    Big data| 29|        24| 46999|
|    krish|    Big data| 21|         1| 50000|
|Sudhanshu|    Big data| 23|         2|346768|
|Sudhanshu|         IOT| 44|         3| 60000|
|    krish|data science| 33|         4|    45|
|   Mahesh|data science| 34|        10| 38000|
+---------+------------+---+----------+------+



In [20]:
# Rename 'name' to 'new_name'; every other column is left as it is.
df_pyspark = df_pyspark.withColumnRenamed(
    existing='name',
    new='new_name',
)

In [21]:
# Confirm the rename: the first column should now be headed 'new_name'.
df_pyspark.show(n=20, truncate=True)

+---------+------------+---+----------+------+
| new_name|  department|age|experience|salary|
+---------+------------+---+----------+------+
|  Bhaskar|data science| 27|        10|  2000|
|    krish|         IOT| 31|        12| 35000|
|Sudhanshu|    Big data| 29|        24| 46999|
|    krish|    Big data| 21|         1| 50000|
|Sudhanshu|    Big data| 23|         2|346768|
|Sudhanshu|         IOT| 44|         3| 60000|
|    krish|data science| 33|         4|    45|
|   Mahesh|data science| 34|        10| 38000|
+---------+------------+---+----------+------+



### PySpark video 3


In [22]:
# Obtain the SparkSession for this section of the notebook.
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('practice')
    .getOrCreate()
)

In [23]:
# Re-read test.csv from disk so this section starts from the original columns
# (the earlier cells renamed 'name' on the previous frame).
df_pyspark = spark.read.csv(
    path='test.csv',
    header=True,
    inferSchema=True,
)

In [24]:
# Preview the freshly loaded frame.
df_pyspark.show(n=20, truncate=True)

+---------+------------+---+----------+------+
|     name|  department|age|experience|salary|
+---------+------------+---+----------+------+
|  Bhaskar|data science| 27|        10|  2000|
|    krish|         IOT| 31|        12| 35000|
|Sudhanshu|    Big data| 29|        24| 46999|
|    krish|    Big data| 21|         1| 50000|
|Sudhanshu|    Big data| 23|         2|346768|
|Sudhanshu|         IOT| 44|         3| 60000|
|    krish|data science| 33|         4|    45|
|   Mahesh|data science| 34|        10| 38000|
+---------+------------+---+----------+------+



In [25]:
# Drop every row that contains at least one null (how='any' is the default).
# The result is only displayed; df_pyspark itself is not reassigned.
df_pyspark.dropna(how='any').show()

+---------+------------+---+----------+------+
|     name|  department|age|experience|salary|
+---------+------------+---+----------+------+
|  Bhaskar|data science| 27|        10|  2000|
|    krish|         IOT| 31|        12| 35000|
|Sudhanshu|    Big data| 29|        24| 46999|
|    krish|    Big data| 21|         1| 50000|
|Sudhanshu|    Big data| 23|         2|346768|
|Sudhanshu|         IOT| 44|         3| 60000|
|    krish|data science| 33|         4|    45|
|   Mahesh|data science| 34|        10| 38000|
+---------+------------+---+----------+------+



In [26]:
# Keep only rows that have at least 2 non-null values.
df_pyspark.dropna(thresh=2).show()

+---------+------------+---+----------+------+
|     name|  department|age|experience|salary|
+---------+------------+---+----------+------+
|  Bhaskar|data science| 27|        10|  2000|
|    krish|         IOT| 31|        12| 35000|
|Sudhanshu|    Big data| 29|        24| 46999|
|    krish|    Big data| 21|         1| 50000|
|Sudhanshu|    Big data| 23|         2|346768|
|Sudhanshu|         IOT| 44|         3| 60000|
|    krish|data science| 33|         4|    45|
|   Mahesh|data science| 34|        10| 38000|
+---------+------------+---+----------+------+



In [27]:
# Drop rows only when 'experience' is null; nulls in other columns are ignored.
df_pyspark.dropna(subset=['experience']).show()

+---------+------------+---+----------+------+
|     name|  department|age|experience|salary|
+---------+------------+---+----------+------+
|  Bhaskar|data science| 27|        10|  2000|
|    krish|         IOT| 31|        12| 35000|
|Sudhanshu|    Big data| 29|        24| 46999|
|    krish|    Big data| 21|         1| 50000|
|Sudhanshu|    Big data| 23|         2|346768|
|Sudhanshu|         IOT| 44|         3| 60000|
|    krish|data science| 33|         4|    45|
|   Mahesh|data science| 34|        10| 38000|
+---------+------------+---+----------+------+



In [28]:
# Missing value imputation
# 'age' and 'experience' were inferred as integer columns (see the printSchema
# output earlier in the notebook). na.fill() ignores any subset column whose
# type does not match the fill value, so a string placeholder such as
# 'Missing Values' would silently leave nulls in these columns untouched.
# Use an integer fill value so the imputation actually takes effect; 0 is a
# placeholder here -- pick whatever value suits the analysis.
df_pyspark.na.fill(0, subset=['age', 'experience']).show()

+---------+------------+---+----------+------+
|     name|  department|age|experience|salary|
+---------+------------+---+----------+------+
|  Bhaskar|data science| 27|        10|  2000|
|    krish|         IOT| 31|        12| 35000|
|Sudhanshu|    Big data| 29|        24| 46999|
|    krish|    Big data| 21|         1| 50000|
|Sudhanshu|    Big data| 23|         2|346768|
|Sudhanshu|         IOT| 44|         3| 60000|
|    krish|data science| 33|         4|    45|
|   Mahesh|data science| 34|        10| 38000|
+---------+------------+---+----------+------+



In [29]:
# The na.drop()/na.fill() results above were only displayed, never assigned
# back, so this shows the frame exactly as it was loaded.
df_pyspark.show(n=20, truncate=True)

+---------+------------+---+----------+------+
|     name|  department|age|experience|salary|
+---------+------------+---+----------+------+
|  Bhaskar|data science| 27|        10|  2000|
|    krish|         IOT| 31|        12| 35000|
|Sudhanshu|    Big data| 29|        24| 46999|
|    krish|    Big data| 21|         1| 50000|
|Sudhanshu|    Big data| 23|         2|346768|
|Sudhanshu|         IOT| 44|         3| 60000|
|    krish|data science| 33|         4|    45|
|   Mahesh|data science| 34|        10| 38000|
+---------+------------+---+----------+------+



#### Filter
##### Combining conditions: `&` (and), `|` (or), `~` (not)

In [30]:
# Keep rows whose salary is at most 20000 (column-expression form of the filter).
df_pyspark.filter(df_pyspark['salary'] <= 20000).show()

+-------+------------+---+----------+------+
|   name|  department|age|experience|salary|
+-------+------------+---+----------+------+
|Bhaskar|data science| 27|        10|  2000|
|  krish|data science| 33|         4|    45|
+-------+------------+---+----------+------+



In [31]:
# Same salary filter, then keep only the 'name' and 'age' columns.
(
    df_pyspark
    .where("salary <= 20000")
    .select('name', 'age')
    .show()
)

+-------+---+
|   name|age|
+-------+---+
|Bhaskar| 27|
|  krish| 33|
+-------+---+



In [32]:
# Combine two conditions with & (logical AND). Each comparison needs its own
# parentheses because & binds more tightly than >= and >.
df_pyspark.where(
    (df_pyspark.salary >= 20000) & (df_pyspark.age > 20)
).show()

+---------+------------+---+----------+------+
|     name|  department|age|experience|salary|
+---------+------------+---+----------+------+
|    krish|         IOT| 31|        12| 35000|
|Sudhanshu|    Big data| 29|        24| 46999|
|    krish|    Big data| 21|         1| 50000|
|Sudhanshu|    Big data| 23|         2|346768|
|Sudhanshu|         IOT| 44|         3| 60000|
|   Mahesh|data science| 34|        10| 38000|
+---------+------------+---+----------+------+



In [33]:
# Obtain the SparkSession for the groupBy / aggregation section.
# getOrCreate() hands back the session that is already running when there is
# one, so the app name is kept consistent with the first cell of the notebook
# (the previous value here was a misspelling).
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Practice').getOrCreate()

In [34]:
# Load test.csv for the aggregation examples (header row, inferred types).
df_pyspark = spark.read.csv(
    path='test.csv',
    header=True,
    inferSchema=True,
)

In [35]:
# Preview the data that the aggregations below operate on.
df_pyspark.show(n=20, truncate=True)

+---------+------------+---+----------+------+
|     name|  department|age|experience|salary|
+---------+------------+---+----------+------+
|  Bhaskar|data science| 27|        10|  2000|
|    krish|         IOT| 31|        12| 35000|
|Sudhanshu|    Big data| 29|        24| 46999|
|    krish|    Big data| 21|         1| 50000|
|Sudhanshu|    Big data| 23|         2|346768|
|Sudhanshu|         IOT| 44|         3| 60000|
|    krish|data science| 33|         4|    45|
|   Mahesh|data science| 34|        10| 38000|
+---------+------------+---+----------+------+



In [36]:
# Print the schema of the frame loaded for the aggregation examples.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [37]:
# Total salary per name (a sum, not a max) -- compare the totals to see
# which name has the highest combined salary.

(
    df_pyspark
    .groupBy('name')
    .sum('salary')
    .show()
)

+---------+-----------+
|     name|sum(salary)|
+---------+-----------+
|Sudhanshu|     453767|
|  Bhaskar|       2000|
|    krish|      85045|
|   Mahesh|      38000|
+---------+-----------+



In [38]:
# Total salary per department (a sum, not a max) -- compare the totals to see
# which department has the highest combined salary.

(
    df_pyspark
    .groupBy('department')
    .sum('salary')
    .show()
)

+------------+-----------+
|  department|sum(salary)|
+------------+-----------+
|         IOT|      95000|
|data science|      40045|
|    Big data|     443767|
+------------+-----------+



In [39]:
# Group by department and count how many rows fall into each one.
(
    df_pyspark
    .groupBy('department')
    .count()
    .show()
)

+------------+-----+
|  department|count|
+------------+-----+
|         IOT|    2|
|data science|    3|
|    Big data|    3|
+------------+-----+



In [40]:
# Aggregate over the whole frame (no grouping keys): grand total of 'salary'.
df_pyspark.groupBy().agg({'salary': 'sum'}).show()

+-----------+
|sum(salary)|
+-----------+
|     578812|
+-----------+



In [41]:
# Per-department salary total, expressed with the dict form of agg()
# ({column: aggregate function}).
(
    df_pyspark
    .groupBy('department')
    .agg({'salary': 'sum'})
    .show()
)

+------------+-----------+
|  department|sum(salary)|
+------------+-----------+
|         IOT|      95000|
|data science|      40045|
|    Big data|     443767|
+------------+-----------+



In [42]:
# Per-name salary total (the same aggregation as at the start of this section).
(
    df_pyspark
    .groupBy('name')
    .sum('salary')
    .show()
)

+---------+-----------+
|     name|sum(salary)|
+---------+-----------+
|Sudhanshu|     453767|
|  Bhaskar|       2000|
|    krish|      85045|
|   Mahesh|      38000|
+---------+-----------+

