In [1]:
# Import via notebook: load the cleaned wine JSON file into MongoDB
# (database `wine`, collection `ratings`). `--drop` removes any existing
# `ratings` collection first, and `--jsonArray` reads the file as a single JSON array.
!mongoimport --type json -d wine -c ratings --drop --jsonArray ../Resources/clean_wine_data_final.json

2024-05-06T18:18:47.196-0700	connected to: mongodb://localhost/
2024-05-06T18:18:47.197-0700	dropping: wine.ratings
2024-05-06T18:18:49.221-0700	76736 document(s) imported successfully. 0 document(s) failed to import.


In [2]:
# Importing Necessary Libraries
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
import tensorflow as tf

In [3]:
# Create an instance of MongoClient.
# Only the port is given, so the host is whatever pymongo uses by default.
mongo = MongoClient(port=27017)

In [4]:
# Confirm that the new `wine` database appears in the server's database list
mongo.list_database_names()

['admin',
 'air_b_and_b',
 'autosaurus',
 'classDB',
 'config',
 'gardenDB',
 'local',
 'met',
 'uk_food',
 'wine']

In [5]:
# Select the `wine` database and list its collections to check that `ratings` exists
db = mongo['wine']
db.list_collection_names()

['ratings']

In [6]:
# Assign the collection to a variable.
# Named for what it is (a MongoDB collection) so it is not confused with the
# `wine_df` pandas DataFrame that is built from the fetched documents further down.
ratings_collection = db['ratings']

In [7]:
# Query the ratings collection with an empty filter (no conditions) and
# materialise every document the cursor yields into a list.
cursor = db["ratings"].find({})
json_data = [document for document in cursor]

In [8]:
# Build a DataFrame from the list of documents fetched from MongoDB
wine_df = pd.DataFrame(json_data)
# Preview the first rows to check the columns
wine_df.head()

Unnamed: 0,_id,country,description,points,price,province,region,title,variety,winery,rating_category,type
0,663981777c97be4d34cbc47a,US,"Tart and snappy, the flavors of lime flesh and...",87,14,Oregon,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,Good,White
1,663981777c97be4d34cbc47b,US,"Much like the regular bottling from 2012, this...",87,65,Oregon,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,Good,Red
2,663981777c97be4d34cbc47c,Argentina,Raw black-cherry aromas are direct and simple ...,87,13,Mendoza Province,Mendoza,Gaucho Andino 2011 Winemaker Selection Malbec ...,Malbec,Gaucho Andino,Good,Red
3,663981777c97be4d34cbc47d,US,"Soft, supple plum envelopes an oaky structure ...",87,19,California,Napa Valley,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature,Good,Red
4,663981777c97be4d34cbc47e,US,Ripe aromas of dark berries mingle with ample ...,87,23,Virginia,Virginia,Quiévremont 2012 Vin de Maison Red (Virginia),Red Blend,Quiévremont,Good,Red Blend


In [9]:
# Drop the MongoDB `_id` identifier and the text columns
# (description, title, winery) before encoding.
# The result is rebound rather than mutated with inplace=True, so the cell
# reads as "new frame = old frame minus these columns".
wine_df = wine_df.drop(columns=['_id', 'description', 'title', 'winery'])

In [10]:
# Display the frame after the drop (the rendered output is abbreviated to the first and last rows)
wine_df

Unnamed: 0,country,points,price,province,region,variety,rating_category,type
0,US,87,14,Oregon,Willamette Valley,Pinot Gris,Good,White
1,US,87,65,Oregon,Willamette Valley,Pinot Noir,Good,Red
2,Argentina,87,13,Mendoza Province,Mendoza,Malbec,Good,Red
3,US,87,19,California,Napa Valley,Cabernet Sauvignon,Good,Red
4,US,87,23,Virginia,Virginia,Red Blend,Good,Red Blend
...,...,...,...,...,...,...,...,...
76731,France,90,32,Alsace,Alsace,Pinot Gris,Very Good,White
76732,Argentina,91,55,Mendoza Province,Uco Valley,Red Blend,Excellent,Red Blend
76733,US,90,35,California,Napa Valley,Zinfandel,Very Good,Red
76734,Spain,91,52,Catalonia,Priorat,Red Blend,Excellent,Red Blend


Examine columns for encoding

In [11]:
# Count the distinct values in each column to see how many categories each would need encoded
wine_df.nunique()

country              7
points              21
price              335
province            62
region             998
variety             20
rating_category      6
type                 5
dtype: int64

In [12]:
# Number of rows per province, most frequent first
wine_df['province'].value_counts()

province
California            31397
Washington             7387
Oregon                 4816
Tuscany                4225
Bordeaux               3389
                      ...  
Nevada                    2
Northwestern Italy        2
Iowa                      2
Kentucky                  1
Hawaii                    1
Name: count, Length: 62, dtype: int64

In [13]:
# Provinces that occur more than once. The counts are computed a single time
# and reused for both the values and the boolean filter.
province_counts = wine_df['province'].value_counts()
province_counts[province_counts > 1]

province
California              31397
Washington               7387
Oregon                   4816
Tuscany                  4225
Bordeaux                 3389
Burgundy                 3046
Mendoza Province         2724
Piedmont                 2291
New York                 2223
Northern Spain           1770
Alsace                   1444
Loire Valley             1084
Provence                 1011
Catalonia                 968
Southwest France          872
Rhône Valley              744
Veneto                    664
Sicily & Sardinia         648
South Australia           623
Virginia                  544
Languedoc-Roussillon      481
Northeastern Italy        466
France Other              444
Central Italy             411
Other                     356
Central Spain             338
Champagne                 321
Lombardy                  300
Southern Italy            208
Levante                   201
Western Australia         200
Victoria                  175
Australia Other           148
B

In [14]:
# columns to drop: region, province
# (the nunique() check above shows 998 distinct regions and 62 distinct provinces)
# Rebind the result instead of mutating with inplace=True.
wine_df = wine_df.drop(columns=['region', 'province'])

# columns for encoding: country, variety, rating category and type

In [15]:
# country: give every distinct country an integer code.
# `countries` holds the distinct values in the order unique() returns them;
# `country_encoding_count` holds the matching codes 1..len(countries).
countries = list(wine_df['country'].unique())
country_encoding_count = list(range(1, len(countries) + 1))

print(countries)
print(country_encoding_count)

['US', 'Argentina', 'Italy', 'France', 'Australia', 'Spain', 'Canada']
[1, 2, 3, 4, 5, 6, 7]


In [16]:
# Lookup table pairing each country name with its integer code
country_dict = {name: code for name, code in zip(countries, country_encoding_count)}
country_dict

{'US': 1,
 'Argentina': 2,
 'Italy': 3,
 'France': 4,
 'Australia': 5,
 'Spain': 6,
 'Canada': 7}

In [17]:
# Replace each country name with its integer code from country_dict.
# NOTE: Series.map yields NaN for values that are not keys of the dict, so
# re-running this cell once the column already holds codes would turn the
# whole column into NaN. Re-run from the DataFrame-creation cell instead.
wine_df['country'] = wine_df['country'].map(country_dict)
wine_df.head()

Unnamed: 0,country,points,price,variety,rating_category,type
0,1,87,14,Pinot Gris,Good,White
1,1,87,65,Pinot Noir,Good,Red
2,2,87,13,Malbec,Good,Red
3,1,87,19,Cabernet Sauvignon,Good,Red
4,1,87,23,Red Blend,Good,Red Blend


In [18]:
# variety: give every distinct variety an integer code and apply it to the column.
# `variety` holds the distinct values in the order unique() returns them;
# `variety_encoding_count` holds the matching codes 1..len(variety).
variety = list(wine_df['variety'].unique())
variety_encoding_count = list(range(1, len(variety) + 1))
variety_dict = dict(zip(variety, variety_encoding_count))
wine_df['variety'] = wine_df['variety'].map(variety_dict)
wine_df.head()

Unnamed: 0,country,points,price,variety,rating_category,type
0,1,87,14,1,Good,White
1,1,87,65,2,Good,Red
2,2,87,13,3,Good,Red
3,1,87,19,4,Good,Red
4,1,87,23,5,Good,Red Blend


In [19]:
# Show the variety -> integer code lookup built in the previous cell
variety_dict


{'Pinot Gris': 1,
 'Pinot Noir': 2,
 'Malbec': 3,
 'Cabernet Sauvignon': 4,
 'Red Blend': 5,
 'Merlot': 6,
 'White Blend': 7,
 'Sauvignon Blanc': 8,
 'Riesling': 9,
 'Chardonnay': 10,
 'Sangiovese': 11,
 'Cabernet Franc': 12,
 'Bordeaux-style Red Blend': 13,
 'Zinfandel': 14,
 'Rosé': 15,
 'Nebbiolo': 16,
 'Rhône-style Red Blend': 17,
 'Syrah': 18,
 'Sparkling Blend': 19,
 'Tempranillo': 20}

In [20]:
# rating_category_dict: give every distinct rating category an integer code
# and apply it to the column.
# The codes are sized from this column's own distinct values (1..len(rating_category)),
# not from the `variety` list, so the code list always matches the categories one-to-one.
# NOTE(review): codes follow the order unique() returns the categories, which
# may not be the quality order - confirm if an ordinal encoding is intended.
rating_category = list(wine_df['rating_category'].unique())
rating_category_encoding_count = list(range(1, len(rating_category) + 1))
rating_category_dict = dict(zip(rating_category, rating_category_encoding_count))
wine_df['rating_category'] = wine_df['rating_category'].map(rating_category_dict)
wine_df.head()

Unnamed: 0,country,points,price,variety,rating_category,type
0,1,87,14,1,1,White
1,1,87,65,2,1,Red
2,2,87,13,3,1,Red
3,1,87,19,4,1,Red
4,1,87,23,5,1,Red Blend


In [21]:
# type: give every distinct wine type an integer code and apply it to the column.
# The list of distinct values is called `wine_types` so that the builtin
# `type()` stays usable in the rest of the notebook session.
# The codes are sized from this column's own distinct values (1..len(wine_types)),
# not from the `variety` list.
wine_types = list(wine_df['type'].unique())
type_encoding_count = list(range(1, len(wine_types) + 1))
type_category_dict = dict(zip(wine_types, type_encoding_count))
wine_df['type'] = wine_df['type'].map(type_category_dict)
wine_df.head()

Unnamed: 0,country,points,price,variety,rating_category,type
0,1,87,14,1,1,1
1,1,87,65,2,1,2
2,2,87,13,3,1,2
3,1,87,19,4,1,2
4,1,87,23,5,1,3
