# PySpark DataFrames

- Filter operations
- Combining and comparing conditions: `&`, `|`, `==`
- Negating a condition: `~`

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build a Spark session for this notebook under the application name 'dataframe';
# getOrCreate() hands back an existing session instead of starting a second one.
session_builder = SparkSession.builder.appName('dataframe')
spark = session_builder.getOrCreate()

In [1]:
import pandas as pd
import numpy as np

# Build a small synthetic employee table (30 rows) and save it as the CSV file
# that the PySpark cells below load.
SEED = 42               # fixed seed so a fresh kernel regenerates the same rows
CSV_PATH = 'test1.csv'  # single source of truth for the output file name

rng = np.random.default_rng(SEED)

# Five base values repeated six times each -> 30 rows per column.
names = ['John', 'Alice', 'Bob', 'Charlie', 'David'] * 6
n_rows = len(names)

data = {
    'Name': names,
    'Age': rng.integers(20, 40, n_rows),           # upper bound is exclusive: 20..39
    'Salary': rng.integers(40000, 80000, n_rows),  # upper bound is exclusive: 40000..79999
    'City': ['New York', 'San Francisco', 'Los Angeles', 'Chicago', 'Boston'] * 6,
    'Department': ['HR', 'Engineering', 'Sales', 'Marketing', 'Finance'] * 6,
}

df = pd.DataFrame(data)

# Display the DataFrame
print("DataFrame:")
print(df)

# Write the DataFrame to a CSV file (no index column, so Spark sees only the data columns)
df.to_csv(CSV_PATH, index=False)

# Report the path that was actually written, not a hard-coded name that can drift.
print(f"\nDataFrame has been written to '{CSV_PATH}'")


DataFrame:
       Name  Age  Salary           City   Department
0      John   30   72901       New York           HR
1     Alice   31   72817  San Francisco  Engineering
2       Bob   32   64682    Los Angeles        Sales
3   Charlie   20   62500        Chicago    Marketing
4     David   27   56232         Boston      Finance
5      John   31   43894       New York           HR
6     Alice   37   44318  San Francisco  Engineering
7       Bob   32   41379    Los Angeles        Sales
8   Charlie   31   63345        Chicago    Marketing
9     David   26   58870         Boston      Finance
10     John   38   79745       New York           HR
11    Alice   29   63187  San Francisco  Engineering
12      Bob   36   67289    Los Angeles        Sales
13  Charlie   33   42526        Chicago    Marketing
14    David   22   79208         Boston      Finance
15     John   26   69270       New York           HR
16    Alice   25   43699  San Francisco  Engineering
17      Bob   36   72481    Los Ang

In [4]:
# Load the CSV into a Spark DataFrame: header=True takes the column names from
# the first line, and inferSchema=True asks Spark to infer the column types.
df_pyspark = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Preview only the first five rows.
df_pyspark.show(n=5)

+-------+---+------+-------------+-----------+
|   Name|Age|Salary|         City| Department|
+-------+---+------+-------------+-----------+
|   John| 30| 72901|     New York|         HR|
|  Alice| 31| 72817|San Francisco|Engineering|
|    Bob| 32| 64682|  Los Angeles|      Sales|
|Charlie| 20| 62500|      Chicago|  Marketing|
|  David| 27| 56232|       Boston|    Finance|
+-------+---+------+-------------+-----------+
only showing top 5 rows



## Filter Operations.

In [9]:
### Salary of people less than or equal to 50000.
# Here the predicate is passed to filter() as a SQL expression string.
low_salary_rows = df_pyspark.filter("Salary<=50000")
low_salary_rows.show()

+-------+---+------+-------------+-----------+
|   Name|Age|Salary|         City| Department|
+-------+---+------+-------------+-----------+
|   John| 31| 43894|     New York|         HR|
|  Alice| 37| 44318|San Francisco|Engineering|
|    Bob| 32| 41379|  Los Angeles|      Sales|
|Charlie| 33| 42526|      Chicago|  Marketing|
|  Alice| 25| 43699|San Francisco|Engineering|
|  David| 31| 48601|       Boston|    Finance|
|Charlie| 24| 48547|      Chicago|  Marketing|
|  David| 28| 41422|       Boston|    Finance|
|Charlie| 38| 47926|      Chicago|  Marketing|
+-------+---+------+-------------+-----------+



In [10]:
# Same Salary <= 50000 filter, then keep only the Name and Age columns.
low_salary_people = df_pyspark.filter("Salary<=50000")
name_and_age = low_salary_people.select(['Name','Age'])
name_and_age.show()

+-------+---+
|   Name|Age|
+-------+---+
|   John| 31|
|  Alice| 37|
|    Bob| 32|
|Charlie| 33|
|  Alice| 25|
|  David| 31|
|Charlie| 24|
|  David| 28|
|Charlie| 38|
+-------+---+



In [12]:
# Same Salary <= 50000 filter, written as a column expression
# rather than a SQL string.
salary_at_most_50k = df_pyspark['Salary'] <= 50000
df_pyspark.filter(salary_at_most_50k).show()

+-------+---+------+-------------+-----------+
|   Name|Age|Salary|         City| Department|
+-------+---+------+-------------+-----------+
|   John| 31| 43894|     New York|         HR|
|  Alice| 37| 44318|San Francisco|Engineering|
|    Bob| 32| 41379|  Los Angeles|      Sales|
|Charlie| 33| 42526|      Chicago|  Marketing|
|  Alice| 25| 43699|San Francisco|Engineering|
|  David| 31| 48601|       Boston|    Finance|
|Charlie| 24| 48547|      Chicago|  Marketing|
|  David| 28| 41422|       Boston|    Finance|
|Charlie| 38| 47926|      Chicago|  Marketing|
+-------+---+------+-------------+-----------+



In [18]:
# AND (&): keep rows whose Salary is between 50000 and 60000, both ends included.
# The two comparisons are built separately because & binds tighter than <= / >=,
# so an inline form needs each comparison wrapped in parentheses.
salary_col = df_pyspark['Salary']
at_most_60k = salary_col <= 60000
at_least_50k = salary_col >= 50000
df_pyspark.filter(at_most_60k & at_least_50k).show()

+-----+---+------+-------------+-----------+
| Name|Age|Salary|         City| Department|
+-----+---+------+-------------+-----------+
|David| 27| 56232|       Boston|    Finance|
|David| 26| 58870|       Boston|    Finance|
| John| 38| 55512|     New York|         HR|
|Alice| 24| 58956|San Francisco|Engineering|
| John| 33| 59229|     New York|         HR|
|David| 20| 50269|       Boston|    Finance|
+-----+---+------+-------------+-----------+



In [20]:
# OR (|): keep rows that satisfy at least one of the two comparisons.
# NOTE(review): with these bounds every non-null Salary is either <= 60000 or
# >= 50000, so this filter returns every such row. It demonstrates the |
# syntax only; confirm whether different bounds were intended.
salary_column = df_pyspark['Salary']
upto_60k = salary_column <= 60000
from_50k = salary_column >= 50000
df_pyspark.filter(upto_60k | from_50k).show()

+-------+---+------+-------------+-----------+
|   Name|Age|Salary|         City| Department|
+-------+---+------+-------------+-----------+
|   John| 30| 72901|     New York|         HR|
|  Alice| 31| 72817|San Francisco|Engineering|
|    Bob| 32| 64682|  Los Angeles|      Sales|
|Charlie| 20| 62500|      Chicago|  Marketing|
|  David| 27| 56232|       Boston|    Finance|
|   John| 31| 43894|     New York|         HR|
|  Alice| 37| 44318|San Francisco|Engineering|
|    Bob| 32| 41379|  Los Angeles|      Sales|
|Charlie| 31| 63345|      Chicago|  Marketing|
|  David| 26| 58870|       Boston|    Finance|
|   John| 38| 79745|     New York|         HR|
|  Alice| 29| 63187|San Francisco|Engineering|
|    Bob| 36| 67289|  Los Angeles|      Sales|
|Charlie| 33| 42526|      Chicago|  Marketing|
|  David| 22| 79208|       Boston|    Finance|
|   John| 26| 69270|     New York|         HR|
|  Alice| 25| 43699|San Francisco|Engineering|
|    Bob| 36| 72481|  Los Angeles|      Sales|
|Charlie| 29|

In [22]:
# NOT (~): invert the condition, keeping rows where Salary <= 50000 does not hold.
not_low_salary = ~(df_pyspark['Salary'] <= 50000)
df_pyspark.filter(not_low_salary).show()

+-------+---+------+-------------+-----------+
|   Name|Age|Salary|         City| Department|
+-------+---+------+-------------+-----------+
|   John| 30| 72901|     New York|         HR|
|  Alice| 31| 72817|San Francisco|Engineering|
|    Bob| 32| 64682|  Los Angeles|      Sales|
|Charlie| 20| 62500|      Chicago|  Marketing|
|  David| 27| 56232|       Boston|    Finance|
|Charlie| 31| 63345|      Chicago|  Marketing|
|  David| 26| 58870|       Boston|    Finance|
|   John| 38| 79745|     New York|         HR|
|  Alice| 29| 63187|San Francisco|Engineering|
|    Bob| 36| 67289|  Los Angeles|      Sales|
|  David| 22| 79208|       Boston|    Finance|
|   John| 26| 69270|     New York|         HR|
|    Bob| 36| 72481|  Los Angeles|      Sales|
|Charlie| 29| 63446|      Chicago|  Marketing|
|   John| 38| 55512|     New York|         HR|
|  Alice| 24| 58956|San Francisco|Engineering|
|    Bob| 20| 78489|  Los Angeles|      Sales|
|   John| 33| 59229|     New York|         HR|
|  Alice| 29|