In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the notebook's Spark session (application name 'ops'); getOrCreate()
# hands back an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('ops')
    .getOrCreate()
)

In [5]:
# Load the stock CSV. The first line supplies the column names (header=True)
# and the reader is asked to infer column types (inferSchema=True); the JSON
# reader does not take an inferSchema option.
# NOTE(review): absolute local path — consider a configurable data directory.
file_path = "file:///var/lib/spark/jupyter/data/appl_stock.csv"
df = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
)

In [6]:
# Print the inferred schema as a tree: one line per column with its type and
# nullability.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [9]:
# Project four of the columns into a new DataFrame. No action is triggered, so
# the cell only displays the resulting DataFrame's schema.
df.select('Date', 'Open', 'Close', 'Volume')

DataFrame[Date: timestamp, Open: double, Close: double, Volume: int]

In [9]:
# Fetch the first three rows and display only the first one (a Row object).
# NOTE(review): the saved output shows Date as a plain string while the schema
# reports timestamp — the output looks stale; re-run the notebook top to bottom.
df.head(3)[0]

Row(Date='2010-01-04', Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039)

### Filter operation using dataframes

In [10]:
# Filter with a SQL-style predicate given as a string. Without an action such
# as .show(), the cell displays only the resulting DataFrame's schema.
df.filter('Close < 500')

DataFrame[Date: timestamp, Open: double, High: double, Low: double, Close: double, Volume: int, Adj Close: double]

### Combine multiple functions

In [11]:
# Chain transformations: keep the rows matching the SQL-string predicate, then
# project just the opening and closing prices.
(
    df
    .filter('Close < 500')
    .select('open', 'close')
)

DataFrame[open: double, close: double]

### Preferable syntax for filter

In [15]:
# Same query as the SQL-string version, but the predicate is built as a Column
# expression from the DataFrame's own column reference.
(
    df
    .filter(df['close'] < 500)
    .select('open', 'close')
)

DataFrame[open: double, close: double]

## Filtering on multiple conditions
### Common Errors:
#### 1. Use of "and", "or" rather than "&", "|"
    df.filter(df['close'] < 200 and df['open'] > 100).show()
#### 2. Missing parentheses around each condition; even after correcting the operators, this throws an error because `&` binds more tightly than `<` and `>`
    df.filter(df['close'] < 200 & df['open'] > 100).show()
# Correct syntax:

In [18]:
# Combine two Column predicates with the bitwise `&` operator. Each comparison
# must be wrapped in parentheses because `&` has higher precedence than `<`
# and `>` in Python.
df.filter(
    (df['close'] < 200)
    & (df['open'] > 100)
).show()

+--------------------+------------------+------------------+------------------+------------------+---------+------------------+
|                Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+--------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-22 00:00:...|206.78000600000001|        207.499996|            197.16|            197.75|220441900|         25.620401|
|2010-01-28 00:00:...|        204.930004|        205.500004|        198.699995|        199.289995|293375600|25.819922000000002|
|2010-01-29 00:00:...|        201.079996|        202.199995|        190.250002|        192.060003|311488100|         24.883208|
|2010-02-01 00:00:...|192.36999699999998|             196.0|191.29999899999999|        194.729998|187469100|         25.229131|
|2010-02-02 00:00:...|        195.909998|        196.319994|193.37999299999998|        195.859997|174585