In [22]:
# Locate the local Spark installation so the pyspark import below resolves.
import findspark
findspark.init()
from pyspark.sql import SparkSession

# Reuse an existing session if one is running, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName('DF examples')
    .getOrCreate()
)

columns = ["Seqno", "Quote"]
quotes = [
    "Be the change that you wish to see in the world",
    "Everyone thinks of changing the world, but no one thinks of changing himself.",
    "The purpose of our lives is to be happy",
    "Be Cool",
]
# Pair every quote with its 1-based sequence number, kept as a string.
data = [(str(seqno), quote) for seqno, quote in enumerate(quotes, start=1)]

# Two string columns: Seqno and Quote.
df = spark.createDataFrame(data=data, schema=columns)
df.printSchema()
df.show()


root
 |-- Seqno: string (nullable = true)
 |-- Quote: string (nullable = true)

+-----+--------------------+
|Seqno|               Quote|
+-----+--------------------+
|    1|Be the change tha...|
|    2|Everyone thinks o...|
|    3|The purpose of ou...|
|    4|             Be Cool|
+-----+--------------------+



In [23]:
# All rows, with every column value printed in full (no truncation).
df.show(truncate=False)

# Only the first 2 rows, still with full column values.
df.show(n=2, truncate=False)

# First 2 rows, with each column value cut off at 25 characters.
df.show(n=2, truncate=25)


+-----+-----------------------------------------------------------------------------+
|Seqno|Quote                                                                        |
+-----+-----------------------------------------------------------------------------+
|1    |Be the change that you wish to see in the world                              |
|2    |Everyone thinks of changing the world, but no one thinks of changing himself.|
|3    |The purpose of our lives is to be happy                                      |
|4    |Be Cool                                                                      |
+-----+-----------------------------------------------------------------------------+

+-----+-----------------------------------------------------------------------------+
|Seqno|Quote                                                                        |
+-----+-----------------------------------------------------------------------------+
|1    |Be the change that you wish to see in the worl

In [24]:
# Print the first 3 records one field per line (vertical layout),
# truncating each value to 25 characters.
vertical_opts = {"n": 3, "truncate": 25, "vertical": True}
df.show(**vertical_opts)

-RECORD 0--------------------------
 Seqno | 1                         
 Quote | Be the change that you... 
-RECORD 1--------------------------
 Seqno | 2                         
 Quote | Everyone thinks of cha... 
-RECORD 2--------------------------
 Seqno | 3                         
 Quote | The purpose of our liv... 
only showing top 3 rows



In [26]:
# Total number of rows in the DataFrame.
rows = df.count()
print(f"DataFrame Rows count : {rows}")

# Number of rows that remain once exact duplicate rows are dropped.
# Stored under its own name so `rows` keeps meaning "total row count".
distinct_rows = df.distinct().count()
print(f"DataFrame Rows Distinct count : {distinct_rows}")

# Number of columns, taken from the list of column names.
cols = len(df.columns)
print(f"DataFrame Columns count : {cols}")

# Same column count, taken from the (name, type) pairs in df.dtypes.
# Deliberately not stored as `col`: this notebook later imports `col` from
# pyspark.sql.functions and calls it, so an int bound to that name would
# break those calls if cells were re-run out of order.
dtype_cols = len(df.dtypes)
print(f"DataFrame Column count: {dtype_cols}")



DataFrame Rows count : 4
DataFrame Rows Distinct count : 4
DataFrame Columns count : 2
DataFrame Column count: 2


In [27]:

# Counting with the count() aggregate from pyspark.sql.functions
from pyspark.sql.functions import count

quote_count = count(df["Quote"])
seqno_count = count(df["Seqno"])

# Single aggregate: number of values in Quote.
df.select(quote_count).show()

# Two aggregates side by side: Seqno first, then Quote.
df.select(seqno_count, quote_count).show()


+------------+
|count(Quote)|
+------------+
|           4|
+------------+

+------------+------------+
|count(Seqno)|count(Quote)|
+------------+------------+
|           4|           4|
+------------+------------+



In [28]:

# Counting via agg(): map each column name to the aggregate to apply.
count_spec = dict.fromkeys(['Seqno', 'Quote'], 'count')
df.agg(count_spec).show()

# Counting via groupBy().count(): one output row per Seqno value.
rows_per_seqno = df.groupBy("Seqno").count()
rows_per_seqno.show()


+------------+------------+
|count(Quote)|count(Seqno)|
+------------+------------+
|           4|           4|
+------------+------------+

+-----+-----+
|Seqno|count|
+-----+-----+
|    1|    1|
|    2|    1|
|    3|    1|
|    4|    1|
+-----+-----+



In [31]:

# Counting with Spark SQL
# Register the DataFrame as a temporary view so it can be queried by name.
df.createOrReplaceTempView("EMP")

count_queries = (
    "SELECT Count(*) FROM EMP",                       # total rows
    "SELECT COUNT(distinct Seqno) FROM EMP",          # distinct Seqno values
    "SELECT Seqno,COUNT(*) FROM EMP GROUP BY Seqno",  # rows per Seqno
)
for query in count_queries:
    spark.sql(query).show()

+--------+
|count(1)|
+--------+
|       4|
+--------+

+---------------------+
|count(DISTINCT Seqno)|
+---------------------+
|                    4|
+---------------------+

+-----+--------+
|Seqno|count(1)|
+-----+--------+
|    1|       1|
|    2|       1|
|    3|       1|
|    4|       1|
+-----+--------+



In [39]:
# Distinct rows and their count
# Employee records as (Name, Dept, Salary); ("James", "Sales", 3000) appears twice.
data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]
columns = ["Name", "Dept", "Salary"]
df = spark.createDataFrame(data=data, schema=columns)

# Rows with exact duplicates removed.
distinct_df = df.distinct()
distinct_df.show()
print(f"Distinct Count: {distinct_df.count()}")


+-------+---------+------+
|   Name|     Dept|Salary|
+-------+---------+------+
|  James|    Sales|  3000|
|Michael|    Sales|  4600|
| Robert|    Sales|  4100|
|  Maria|  Finance|  3000|
|  Scott|  Finance|  3300|
|    Jen|  Finance|  3900|
|   Jeff|Marketing|  3000|
|  Kumar|Marketing|  2000|
|   Saif|    Sales|  4100|
+-------+---------+------+

Distinct Count: 9


In [40]:

# Using countDistinct()
from pyspark.sql.functions import countDistinct

# Count the distinct (Dept, Salary) combinations.
df2 = df.select(countDistinct("Dept", "Salary"))
df2.show()

# The aggregate result is read from the first column of the first collected row.
dept_salary_distinct = df2.collect()[0][0]
print(f"Distinct Count of Department & Salary: {dept_salary_distinct}")

+----------------------------+
|count(DISTINCT Dept, Salary)|
+----------------------------+
|                           8|
+----------------------------+

Distinct Count of Department & Salary: 8


In [43]:
# Filtering rows whose Dept is NOT in a list of values, five equivalent ways
from pyspark.sql.functions import col

listValues = ["Sales", "Finance"]

# 1. Negated isin() on attribute-style column access.
not_listed_attr = ~df.Dept.isin(listValues)
df.filter(not_listed_attr).show()

# 2. Negated isin() on a column built with col().
not_listed_col = ~col("Dept").isin(listValues)
df.filter(not_listed_col).show()

# 3. SQL expression string using the NOT IN operator.
df.filter("Dept not in ('Sales','Finance')").show()

# 4. Comparing the isin() result to False with ==.
#    The comparison is applied to a Column expression, so `== False` is intentional here.
isin_is_false = df.Dept.isin(listValues) == False
df.filter(isin_is_false).show()

# 5. Spark SQL NOT IN against a temporary view of the DataFrame.
df.createOrReplaceTempView("TAB")
not_in_query = (
    "SELECT * FROM TAB WHERE "
    "Dept NOT IN ('Sales','Finance')"
)
spark.sql(not_in_query).show()

+-----+---------+------+
| Name|     Dept|Salary|
+-----+---------+------+
| Jeff|Marketing|  3000|
|Kumar|Marketing|  2000|
+-----+---------+------+

+-----+---------+------+
| Name|     Dept|Salary|
+-----+---------+------+
| Jeff|Marketing|  3000|
|Kumar|Marketing|  2000|
+-----+---------+------+

+-----+---------+------+
| Name|     Dept|Salary|
+-----+---------+------+
| Jeff|Marketing|  3000|
|Kumar|Marketing|  2000|
+-----+---------+------+

+-----+---------+------+
| Name|     Dept|Salary|
+-----+---------+------+
| Jeff|Marketing|  3000|
|Kumar|Marketing|  2000|
+-----+---------+------+

+-----+---------+------+
| Name|     Dept|Salary|
+-----+---------+------+
| Jeff|Marketing|  3000|
|Kumar|Marketing|  2000|
+-----+---------+------+

