In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.3.2/spark-2.3.2-bin-hadoop2.7.tgz
!tar xf spark-2.3.2-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.2-bin-hadoop2.7"

In [0]:
import findspark
findspark.init()

## Filtering Data

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Wine").getOrCreate()

In [5]:
from pyspark import SparkFiles

url ="https://s3.amazonaws.com/dataviz-curriculum/day_1/wine.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("wine.csv"), sep=",", header=True)
df.show()

+-------+--------------------+--------------------+------+-----+------------------+--------------------+-----------------+------------------+--------------------+
|country|         description|         designation|points|price|          province|            region_1|         region_2|           variety|              winery|
+-------+--------------------+--------------------+------+-----+------------------+--------------------+-----------------+------------------+--------------------+
|     US|This tremendous 1...|   Martha's Vineyard|    96|  235|        California|         Napa Valley|             Napa|Cabernet Sauvignon|               Heitz|
|  Spain|Ripe aromas of fi...|Carodorum Selecci...|    96|  110|    Northern Spain|                Toro|             null|     Tinta de Toro|Bodega Carmen Rod...|
|     US|Mac Watson honors...|Special Selected ...|    96|   90|        California|      Knights Valley|           Sonoma|   Sauvignon Blanc|            Macauley|
|     US|This spent 20

In [6]:
df.printSchema()

root
 |-- country: string (nullable = true)
 |-- description: string (nullable = true)
 |-- designation: string (nullable = true)
 |-- points: string (nullable = true)
 |-- price: string (nullable = true)
 |-- province: string (nullable = true)
 |-- region_1: string (nullable = true)
 |-- region_2: string (nullable = true)
 |-- variety: string (nullable = true)
 |-- winery: string (nullable = true)



In [7]:
# Order a dataframe by ascending values
df.orderBy(df["points"].asc()).head(5)

[Row(country='US', description='Brisk and clean, this dry white is the ', designation=None, points=None, price=None, province=None, region_1=None, region_2=None, variety=None, winery=None),
 Row(country='US', description='Ripe ', designation=None, points=None, price=None, province=None, region_1=None, region_2=None, variety=None, winery=None),
 Row(country='Italy', description='This offers generous tones of cherry, dried raspberry, moist tobacco, cured meat and white truffle. It is ', designation=None, points=None, price=None, province=None, region_1=None, region_2=None, variety=None, winery=None),
 Row(country='Italy', description='"This nicely structured Pinot Grigio shows a ""ramato"" or copper color and slightly oxidized aromas of butterscotch', designation=' bitter almond and ripe melon. The structure is bigger and more defined than a standard Italian white."', points=None, price='88', province='24', region_1='Northeastern Italy', region_2='Venezia Giulia', variety=None, winery='P

In [8]:
# Import functions
from pyspark.sql.functions import avg
df.select(avg("points")).show()

+-----------------+
|      avg(points)|
+-----------------+
|87.88834105383143|
+-----------------+



In [9]:
# Using SQL
df.filter("price<20").show()

+---------+--------------------+--------------------+------+-----+----------------+--------------------+--------------------+--------------------+--------------------+
|  country|         description|         designation|points|price|        province|            region_1|            region_2|             variety|              winery|
+---------+--------------------+--------------------+------+-----+----------------+--------------------+--------------------+--------------------+--------------------+
| Bulgaria|This Bulgarian Ma...|             Bergul̩|    90|   15|        Bulgaria|                null|                null|              Mavrud|        Villa Melnik|
|    Spain|Earthy plum and c...|              Amandi|    90|   17|         Galicia|       Ribeira Sacra|                null|             Menc�_a|      Don Bernardino|
|       US|There's a lot to ...|                null|    90|   18|      California|Russian River Valley|              Sonoma|          Chardonnay|            De

In [10]:
# Filter by price on certain columns
df.filter("price<20").select(['points','country', 'winery','price']).show()

+------+---------+--------------------+-----+
|points|  country|              winery|price|
+------+---------+--------------------+-----+
|    90| Bulgaria|        Villa Melnik|   15|
|    90|    Spain|      Don Bernardino|   17|
|    90|       US|            De Loach|   18|
|    91|       US|   Trinity Vineyards|   19|
|    91| Portugal|Adega Cooperativa...|   15|
|    86|       US|      Belle Ambiance|   10|
|    86| Portugal| Adega de Cantanhede|   12|
|    86|       US|            Parducci|   13|
|    86| Portugal|    Quinta do Portal|   10|
|    86|   France|               Rigal|   14|
|    86|       US|     The Naked Grape|   18|
|    86|   France|   Georges Vigouroux|   15|
|    86|   France|   Georges Vigouroux|   10|
|    86|       US| Martinez & Martinez|   17|
|    86|       US|           Ironstone|   12|
|    86|       US|       Leaping Horse|   10|
|    86|       US|        Kitchen Sink|   13|
|    86| Portugal|  Wines & Winemakers|   12|
|    86|Argentina|              Zo

### Using Python Comparison Operators

In [11]:
# Same results only this time using python
df.filter(df["price"] < 200).show()

+-------+--------------------+--------------------+------+-----+------------------+------------------+-----------------+------------------+--------------------+
|country|         description|         designation|points|price|          province|          region_1|         region_2|           variety|              winery|
+-------+--------------------+--------------------+------+-----+------------------+------------------+-----------------+------------------+--------------------+
|  Spain|Ripe aromas of fi...|Carodorum Selecci...|    96|  110|    Northern Spain|              Toro|             null|     Tinta de Toro|Bodega Carmen Rod...|
|     US|Mac Watson honors...|Special Selected ...|    96|   90|        California|    Knights Valley|           Sonoma|   Sauvignon Blanc|            Macauley|
|     US|This spent 20 mon...|             Reserve|    96|   65|            Oregon| Willamette Valley|Willamette Valley|        Pinot Noir|               Ponzi|
| France|This is the top w...|    

In [12]:
df.filter( (df["price"] < 200) | (df['points'] > 80) ).show()

+-------+--------------------+--------------------+------+-----+------------------+--------------------+-----------------+------------------+--------------------+
|country|         description|         designation|points|price|          province|            region_1|         region_2|           variety|              winery|
+-------+--------------------+--------------------+------+-----+------------------+--------------------+-----------------+------------------+--------------------+
|     US|This tremendous 1...|   Martha's Vineyard|    96|  235|        California|         Napa Valley|             Napa|Cabernet Sauvignon|               Heitz|
|  Spain|Ripe aromas of fi...|Carodorum Selecci...|    96|  110|    Northern Spain|                Toro|             null|     Tinta de Toro|Bodega Carmen Rod...|
|     US|Mac Watson honors...|Special Selected ...|    96|   90|        California|      Knights Valley|           Sonoma|   Sauvignon Blanc|            Macauley|
|     US|This spent 20

In [13]:
df.filter(df["country"] == "US").show()

+-------+--------------------+--------------------+------+-----+----------+--------------------+-----------------+------------------+--------------------+
|country|         description|         designation|points|price|  province|            region_1|         region_2|           variety|              winery|
+-------+--------------------+--------------------+------+-----+----------+--------------------+-----------------+------------------+--------------------+
|     US|This tremendous 1...|   Martha's Vineyard|    96|  235|California|         Napa Valley|             Napa|Cabernet Sauvignon|               Heitz|
|     US|Mac Watson honors...|Special Selected ...|    96|   90|California|      Knights Valley|           Sonoma|   Sauvignon Blanc|            Macauley|
|     US|This spent 20 mon...|             Reserve|    96|   65|    Oregon|   Willamette Valley|Willamette Valley|        Pinot Noir|               Ponzi|
|     US|This re-named vin...|              Silice|    95|   65|    Or