In [None]:
# Quick cell to make the Jupyter notebook use the full screen width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Load the autoreload extension and select mode 2, which reloads imported modules
# before each cell is executed (useful while the `src` package imported below is
# being edited).
%load_ext autoreload
%autoreload 2

In [None]:
# Basic utility functions
import logging
import os
import pathlib
from src.log import logger
from src import paths
from src.utils import list_dir

# data functions
from src.data import DataSource
from src import workflow

In [None]:
# Set the project logger to DEBUG so the cells below emit their most detailed messages.
logger.setLevel(logging.DEBUG)

## Getting the Wine Reviews Dataset

https://www.kaggle.com/zynicide/wine-reviews

License: CC BY-NC-SA 4.0

About (from the Kaggle dataset description):

### Content

This dataset contains three files:

  * `winemag-data-130k-v2.csv` contains 13 columns and 130k rows of wine reviews.

  * `winemag-data_first150k.csv` contains 10 columns and 150k rows of wine reviews. (Does not have Taster info)

  * `winemag-data-130k-v2.json` contains 6919 nodes of wine reviews.

The data consists of 13 fields:

* Points: the number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for wines that score >=80)
* Title: the title of the wine review, which often contains the vintage if you're interested in extracting that feature
* Variety: the type of grapes used to make the wine (ie Pinot Noir)
* Description: a few sentences from a sommelier describing the wine's taste, smell, look, feel, etc.
* Country: the country that the wine is from
* Province: the province or state that the wine is from
* Region 1: the wine growing area in a province or state (ie Napa)
* Region 2: sometimes there are more specific regions specified within a wine growing area (ie Rutherford inside the Napa Valley), but this value can sometimes be blank
* Winery: the winery that made the wine
* Designation: the vineyard within the winery where the grapes that made the wine are from
* Price: the cost for a bottle of the wine
* Taster Name: name of the person who tasted and reviewed the wine
* Taster Twitter Handle: Twitter handle for the person who tasted and reviewed the wine

UPDATED 11/24/2017 Title, Taster Name, and Taster Twitter Handle were collected and the issue with duplicate entries was resolved

### Acknowledgements

The data was scraped from WineEnthusiast during the week of June 15th, 2017. The code for the scraper can be found here if you have any more specific questions about data collection that I didn't address. (https://github.com/zackthoutt/wine-deep-learning)

UPDATE 11/24/2017
After feedback from users of the dataset, I scraped the reviews again on November 22nd, 2017. This time around I collected the title of each review, which you can parse the year out of, the taster's name, and the taster's Twitter handle. This should also fix the duplicate entry issue.

## Add this as a DataSource

In [None]:
# Name of the DataSource; the same name is used later to reload it from the catalog.
ds_name = 'wine_reviews'
dsrc = DataSource(ds_name)

In [None]:
# Expected archive file name and its SHA-1 digest (prefixed with the algorithm name).
# Both are handed to `add_manual_download` further down.
filename = 'wine_reviews.zip'
shasum = 'sha1:844f943eda69e7a472a8d6752d71d436f6caefc8'

In [None]:
# Instructions for obtaining the archive by hand; the expected file name and hash
# are interpolated so the text stays in sync with the values defined above.
message = f"""
Please download {filename} from the Kaggle webpage at:
   https://www.kaggle.com/zynicide/wine-reviews
   
When you select “Download” from Kaggle, it will download all three files together in a .zip file. Name this file wine_reviews.zip and put it in your data/raw directory.

Its SHA-1 Hash should be: {shasum} 

This will require you to create a Kaggle account, and consent to their terms of service.
"""

In [None]:
# Register the archive as a manual download, passing the instructions, the expected
# file name and the expected hash.
# NOTE(review): `force=True` presumably replaces an existing entry with the same
# file name — confirm against DataSource.add_manual_download.
dsrc.add_manual_download(message, file_name=filename, hash_value=shasum, force=True)

In [None]:
# Look up the entry stored under this file name.
dsrc.file_dict[filename]

In [None]:
# NOTE(review): the archive has not been copied into the raw data directory yet at
# this point, so this call presumably just surfaces the manual-download
# instructions — confirm.
dsrc.fetch()

We will now download this file manually (outside the notebook) into our `~/Downloads` directory. Modify the download location below as necessary. Kaggle names the downloaded file `1442_8172_bundle_archive.zip`, so rename it to `wine_reviews.zip` before continuing.

In [None]:
# Locate the manually downloaded archive in the current user's Downloads folder.
# `Path.home()` resolves the home directory without requiring the HOME environment
# variable to be set, and returns the right Path flavour for the running platform
# (instantiating `PosixPath` directly fails on non-POSIX systems).
home_location = pathlib.Path.home()
downloaded_location = home_location / "Downloads" / filename

In [None]:
raw_data_path = paths['raw_data_path']  # hack so we can do the shell command below

In [None]:
!cp $downloaded_location $raw_data_path

In [None]:
!ls -la $raw_data_path

In [None]:
unpack_path = dsrc.unpack()

In [None]:
!ls -la $unpack_path

## Save progress!

In [None]:
# Add this DataSource to the workflow's catalog so it can be reloaded by name later.
workflow.add_datasource(dsrc)

In [None]:
# Display the catalog to check the entry.
workflow.datasource_catalog()

## How to get a DataSource you've already started working with

In [None]:
# Show only the keys of the DataSource catalog.
workflow.datasource_catalog(keys_only=True)

In [None]:
# Reload the DataSource by the name it was created with.
dsrc = DataSource.from_catalog('wine_reviews')

In [None]:
# Inspect the files currently recorded on the reloaded DataSource.
dsrc.file_dict

## Time to add metadata and license info
We'll reuse the dataset description shown at the beginning of this notebook, which comes from the Kaggle/GitHub pages.

In [None]:
license = "CC BY-NC-SA 4.0"

In [None]:
metadata = """
### Content

This dataset contains three files:

  * `winemag-data-130k-v2.csv` contains 13 columns and 130k rows of wine reviews.

  * `winemag-data_first150k.csv` contains 10 columns and 150k rows of wine reviews. (Does not have Taster info)

  * `winemag-data-130k-v2.json` contains 6919 nodes of wine reviews.

The data consists of 13 fields:

* Points: the number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for wines that score >=80)
* Title: the title of the wine review, which often contains the vintage if you're interested in extracting that feature
* Variety: the type of grapes used to make the wine (ie Pinot Noir)
* Description: a few sentences from a sommelier describing the wine's taste, smell, look, feel, etc.
* Country: the country that the wine is from
* Province: the province or state that the wine is from
* Region 1: the wine growing area in a province or state (ie Napa)
* Region 2: sometimes there are more specific regions specified within a wine growing area (ie Rutherford inside the Napa Valley), but this value can sometimes be blank
* Winery: the winery that made the wine
* Designation: the vineyard within the winery where the grapes that made the wine are from
* Price: the cost for a bottle of the wine
* Taster Name: name of the person who tasted and reviewed the wine
* Taster Twitter Handle: Twitter handle for the person who tasted ane reviewed the wine

UPDATED 11/24/2017 Title, Taster Name, and Taster Twitter Handle were collected and the issue with duplicate entires was resolved

### Acknowledgements

The data was scraped from WineEnthusiast during the week of June 15th, 2017. The code for the scraper can be found here if you have any more specific questions about data collection that I didn't address. (https://github.com/zackthoutt/wine-deep-learning)

UPDATE 11/24/2017
After feedback from users of the dataset I scraped the reviews again on November 22nd, 2017. This time around I collected the title of each review, which you can parse the year out of, the tasters name, and the taster's Twitter handle. This should also fix the duplicate entry issue."""

In [None]:
dsrc.add_metadata(contents=metadata)

In [None]:
dsrc.add_metadata(contents=license, kind='LICENSE')

In [None]:
# Inspect the recorded files again now that metadata and license have been added.
dsrc.file_dict

In [None]:
# Just the recorded file names.
dsrc.file_dict.keys()

In [None]:
dsrc.fetch()

In [None]:
# NOTE(review): this repeats the previous cell. Presumably it shows that a second
# fetch is a no-op once the files are in place — confirm, or drop the duplicate.
dsrc.fetch()

In [None]:
dsrc.unpack()

In [None]:
# Save the updated DataSource (now carrying metadata and license) to the catalog.
workflow.add_datasource(dsrc)

In [None]:
# Display the catalog to check the saved entry.
workflow.datasource_catalog()

## Next: explore the data and create a Dataset

We'll pick this up in the next notebook: [92-Create-Wine-Reviews-Dataset.ipynb](92-Create-Wine-Reviews-Dataset.ipynb)