- Create a spark session

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Dataframe_session_name').getOrCreate()

In [5]:
spark

- Read a CSV file

In [6]:
df = spark.read.csv('BasicDataframe.csv')
df

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string]

In [7]:
df.show()

+-------+---+------+----------+----------+
|    _c0|_c1|   _c2|       _c3|       _c4|
+-------+---+------+----------+----------+
|   Name|Age|Salary|Department|Experience|
|  Pawan| 26| 50000| Furniture|         4|
|Chandar| 55| 70000| Furniture|        20|
| Sanjay| 22| 35000|Operations|         1|
|  Anand| 25| 20000| Furniture|         1|
+-------+---+------+----------+----------+



## Dataframes

- Making top row as header

In [8]:
df = spark.read.option('header','true').csv('BasicDataframe.csv')
df.show()

+-------+---+------+----------+----------+
|   Name|Age|Salary|Department|Experience|
+-------+---+------+----------+----------+
|  Pawan| 26| 50000| Furniture|         4|
|Chandar| 55| 70000| Furniture|        20|
| Sanjay| 22| 35000|Operations|         1|
|  Anand| 25| 20000| Furniture|         1|
+-------+---+------+----------+----------+



In [13]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- Experience: integer (nullable = true)



In [9]:
from pyspark.sql.functions import col
df = df.withColumn('Salary', col('Salary').cast('integer'))
df = df.withColumn('Age', col('Age').cast('integer'))
df = df.withColumn('Experience', col('Experience').cast('integer'))
df.show()

+-------+---+------+----------+----------+
|   Name|Age|Salary|Department|Experience|
+-------+---+------+----------+----------+
|  Pawan| 26| 50000| Furniture|         4|
|Chandar| 55| 70000| Furniture|        20|
| Sanjay| 22| 35000|Operations|         1|
|  Anand| 25| 20000| Furniture|         1|
+-------+---+------+----------+----------+



In [14]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- Experience: integer (nullable = true)



In [17]:
df = spark.read.csv(header=True, path='BasicDataframe.csv', inferSchema=True)
df.show()

+-------+---+------+----------+----------+
|   Name|Age|Salary|Department|Experience|
+-------+---+------+----------+----------+
|  Pawan| 26| 50000| Furniture|       4.0|
|Chandar| 55| 70000| Furniture|      20.0|
| Sanjay| 22| 35000|Operations|       1.0|
|  Anand| 25| 20000| Furniture|       1.0|
+-------+---+------+----------+----------+



In [18]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- Experience: double (nullable = true)



In [15]:
type(df)

pyspark.sql.dataframe.DataFrame

In [None]:
df.head()

Row(Name='Pawan', Age=26, Salary=50000, Department='Furniture', Experience=4)

In [None]:
df.head(3)

[Row(Name='Pawan', Age=26, Salary=50000, Department='Furniture', Experience=4),
 Row(Name='Chandar', Age=55, Salary=70000, Department='Furniture', Experience=20),
 Row(Name='Sanjay', Age=22, Salary=35000, Department='Operations', Experience=1)]

In [19]:
df.tail(2)

[Row(Name='Sanjay', Age=22, Salary=35000, Department='Operations', Experience=1.0),
 Row(Name='Anand', Age=25, Salary=20000, Department='Furniture', Experience=1.0)]

In [20]:
df.isEmpty()

False

In [21]:
df.explain()

== Physical Plan ==
FileScan csv [Name#169,Age#170,Salary#171,Department#172,Experience#173] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/d:/DEVELOPMENT/Big Data/Pyspark/BasicDataframe.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Name:string,Age:int,Salary:int,Department:string,Experience:double>




In [24]:
df.distinct().count()

4

In [None]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- Experience: integer (nullable = true)



In [None]:
type(df)

pyspark.sql.dataframe.DataFrame

In [None]:
df.columns

['Name', 'Age', 'Salary', 'Department', 'Experience']

In [None]:
print(df.count())

4


- Selection

In [None]:
df.select('Name').show()  # dataframe type

+-------+
|   Name|
+-------+
|  Pawan|
|Chandar|
| Sanjay|
|  Anand|
+-------+



In [None]:
df.select('Name').where(df['Salary']>50000).show()

+-------+
|   Name|
+-------+
|Chandar|
+-------+



In [None]:
df.select('Name').where(df['Salary']>50000).count()

1

In [None]:
df.select(['Name', 'Age']).show()  # dataframe type

+-------+---+
|   Name|Age|
+-------+---+
|  Pawan| 26|
|Chandar| 55|
| Sanjay| 22|
|  Anand| 25|
+-------+---+



In [None]:
df.dtypes

[('Name', 'string'),
 ('Age', 'int'),
 ('Salary', 'int'),
 ('Department', 'string'),
 ('Experience', 'int')]

In [None]:
df.describe()

DataFrame[summary: string, Name: string, Age: string, Salary: string, Department: string, Experience: string]

In [None]:
df.describe().show()

+-------+------+------------------+-----------------+----------+---------------+
|summary|  Name|               Age|           Salary|Department|     Experience|
+-------+------+------------------+-----------------+----------+---------------+
|  count|     4|                 4|                4|         4|              4|
|   mean|  NULL|              32.0|          43750.0|      NULL|            6.5|
| stddev|  NULL|15.427248620541512|21360.00936329383|      NULL|9.1104335791443|
|    min| Anand|                22|            20000| Furniture|              1|
|    max|Sanjay|                55|            70000|Operations|             20|
+-------+------+------------------+-----------------+----------+---------------+



- CRUD columns

In [None]:
df = df.withColumn('5 years later', df['Age']+5)
df.show()

+-------+---+------+----------+----------+-------------+
|   Name|Age|Salary|Department|Experience|5 years later|
+-------+---+------+----------+----------+-------------+
|  Pawan| 26| 50000| Furniture|         4|           31|
|Chandar| 55| 70000| Furniture|        20|           60|
| Sanjay| 22| 35000|Operations|         1|           27|
|  Anand| 25| 20000| Furniture|         1|           30|
+-------+---+------+----------+----------+-------------+



In [None]:
df = df.withColumnRenamed('5 years later', 'Age+5')
df.show()

+-------+---+------+----------+----------+-----+
|   Name|Age|Salary|Department|Experience|Age+5|
+-------+---+------+----------+----------+-----+
|  Pawan| 26| 50000| Furniture|         4|   31|
|Chandar| 55| 70000| Furniture|        20|   60|
| Sanjay| 22| 35000|Operations|         1|   27|
|  Anand| 25| 20000| Furniture|         1|   30|
+-------+---+------+----------+----------+-----+



In [None]:
df = df.drop('Age+5')
df.show()

+-------+---+------+----------+----------+
|   Name|Age|Salary|Department|Experience|
+-------+---+------+----------+----------+
|  Pawan| 26| 50000| Furniture|         4|
|Chandar| 55| 70000| Furniture|        20|
| Sanjay| 22| 35000|Operations|         1|
|  Anand| 25| 20000| Furniture|         1|
+-------+---+------+----------+----------+



## Filter

In [None]:
df.show()

+-------+---+------+----------+----------+
|   Name|Age|Salary|Department|Experience|
+-------+---+------+----------+----------+
|  Pawan| 26| 50000| Furniture|         4|
|Chandar| 55| 70000| Furniture|        20|
| Sanjay| 22| 35000|Operations|         1|
|  Anand| 25| 20000| Furniture|         1|
+-------+---+------+----------+----------+



In [None]:
df.filter('Salary<=40000').show()

+------+---+------+----------+----------+
|  Name|Age|Salary|Department|Experience|
+------+---+------+----------+----------+
|Sanjay| 22| 35000|Operations|         1|
| Anand| 25| 20000| Furniture|         1|
+------+---+------+----------+----------+



In [None]:
df.filter('Salary<=40000').select(['Name','Age']).show()

+------+---+
|  Name|Age|
+------+---+
|Sanjay| 22|
| Anand| 25|
+------+---+



In [None]:
df.filter(df['Salary']<=40000).select(['Name','Age']).show()

+------+---+
|  Name|Age|
+------+---+
|Sanjay| 22|
| Anand| 25|
+------+---+



- AND

In [None]:
df.filter((df['Salary']>40000) & (df['Salary']<60000)).select(['Name','Age']).show()

+-----+---+
| Name|Age|
+-----+---+
|Pawan| 26|
+-----+---+



- NOT

In [None]:
df.filter(~((df['Salary']>40000) & (df['Salary']<60000))).select(['Name','Age']).show()

+-------+---+
|   Name|Age|
+-------+---+
|Chandar| 55|
| Sanjay| 22|
|  Anand| 25|
+-------+---+



- OR

In [None]:
df.filter((df['Salary']<=40000) | (df['Salary']>60000)).select(['Name','Age']).show()

+-------+---+
|   Name|Age|
+-------+---+
|Chandar| 55|
| Sanjay| 22|
|  Anand| 25|
+-------+---+



## Group by and Agg.

In [None]:
df.show()

+-------+---+------+----------+----------+
|   Name|Age|Salary|Department|Experience|
+-------+---+------+----------+----------+
|  Pawan| 26| 50000| Furniture|         4|
|Chandar| 55| 70000| Furniture|        20|
| Sanjay| 22| 35000|Operations|         1|
|  Anand| 25| 20000| Furniture|         1|
+-------+---+------+----------+----------+



In [None]:
type(df.groupBy('Department'))

pyspark.sql.group.GroupedData

In [None]:
df.groupBy('Department').count().show()

+----------+-----+
|Department|count|
+----------+-----+
| Furniture|    3|
|Operations|    1|
+----------+-----+



In [None]:
df.groupBy('Department', 'Experience').count().show()

+----------+----------+-----+
|Department|Experience|count|
+----------+----------+-----+
| Furniture|        20|    1|
| Furniture|         1|    1|
|Operations|         1|    1|
| Furniture|         4|    1|
+----------+----------+-----+



In [None]:
df.groupBy('Department').min('Salary').show()

+----------+-----------+
|Department|min(Salary)|
+----------+-----------+
| Furniture|      20000|
|Operations|      35000|
+----------+-----------+



In [None]:
df.groupBy('Department').mean().show()

+----------+------------------+------------------+-----------------+
|Department|          avg(Age)|       avg(Salary)|  avg(Experience)|
+----------+------------------+------------------+-----------------+
| Furniture|35.333333333333336|46666.666666666664|8.333333333333334|
|Operations|              22.0|           35000.0|              1.0|
+----------+------------------+------------------+-----------------+



In [None]:
df.groupBy('Department').max('Salary').show()

+----------+-----------+
|Department|max(Salary)|
+----------+-----------+
| Furniture|      70000|
|Operations|      35000|
+----------+-----------+



In [None]:
df.groupBy('Department').sum('Salary').show()

+----------+-----------+
|Department|sum(Salary)|
+----------+-----------+
| Furniture|     140000|
|Operations|      35000|
+----------+-----------+



In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
conf = SparkConf().setAppName('Map_and_filter').setMaster("local[*]")
spark = SparkSession(conf = conf)

n = [1,2,3,4,5]
rdd = spark.parallelize(n)

sq_rdd = rdd.map(lambda x: x * x)

filtered_rdd = sq_rdd.filter(lambda x: x > 16)

res = filtered_rdd.collect()
print(res)

spark.stop()

AttributeError: 'SparkSession' object has no attribute 'parallelize'