In [1]:
# Load the cleaned wine JSON file into MongoDB directly from the notebook:
#   --type json --jsonArray -> the file is read as a single JSON array of documents
#   -d wine -c ratings      -> target database `wine`, collection `ratings`
#   --drop                  -> drop the existing collection first, so a re-run replaces the data
# NOTE(review): presumably needs the `mongoimport` CLI on PATH and a running local MongoDB - confirm.
!mongoimport --type json -d wine -c ratings --drop --jsonArray ../Resources/clean_wine_data_final.json

2024-05-07T19:17:50.085-0700	connected to: mongodb://localhost/
2024-05-07T19:17:50.086-0700	dropping: wine.ratings
2024-05-07T19:17:52.205-0700	77931 document(s) imported successfully. 0 document(s) failed to import.


In [2]:
# Importing Necessary Libraries
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
import tensorflow as tf

In [3]:
# Create an instance of MongoClient; only the port is given, so the host is left at pymongo's default
mongo = MongoClient(port=27017)

In [4]:
# Confirm that the import created the database: `wine` should appear in this list
mongo.list_database_names()

['admin',
 'air_b_and_b',
 'autosaurus',
 'classDB',
 'config',
 'gardenDB',
 'local',
 'met',
 'uk_food',
 'wine']

In [5]:
# Select the `wine` database and list its collections (the import targets `ratings`)
db = mongo['wine']
db.list_collection_names()

['ratings']

In [6]:
# Assign the `ratings` collection to a variable.
# NOTE(review): despite the `_df` suffix this is a MongoDB collection handle, not a DataFrame;
# the same name is rebound to a real DataFrame a couple of cells later, and the query cell
# in between reads db["ratings"] directly rather than this variable.
wine_df = db['ratings']

In [7]:
# Query the ratings collection with an empty filter (no restriction on documents)
# and materialize the cursor into a list of documents.
cursor = db["ratings"].find({})
json_data = [document for document in cursor]

In [8]:
# Build a DataFrame from the list of documents and preview its first five rows
wine_df = pd.DataFrame(data=json_data)
wine_df.head(n=5)

Unnamed: 0,_id,country,description,points,price,province,region,title,variety,winery,rating_category,type,vintage
0,663ae0cef549a4273a08660a,US,"Tart and snappy, the flavors of lime flesh and...",87,14,Oregon,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,Good,White,2013
1,663ae0cef549a4273a08660b,US,Building on 150 years and six generations of w...,87,12,California,Central Coast,Mirassou 2012 Chardonnay (Central Coast),Chardonnay,Mirassou,Good,White,2012
2,663ae0cef549a4273a08660c,Argentina,Raw black-cherry aromas are direct and simple ...,87,13,Mendoza Province,Mendoza,Gaucho Andino 2011 Winemaker Selection Malbec ...,Malbec,Gaucho Andino,Good,Red,2011
3,663ae0cef549a4273a08660d,US,Ripe aromas of dark berries mingle with ample ...,87,23,Virginia,Virginia,Quiévremont 2012 Vin de Maison Red (Virginia),Red Blend,Quiévremont,Good,Red,2012
4,663ae0cef549a4273a08660e,US,"A sleek mix of tart berry, stem and herb, alon...",87,20,Oregon,Oregon,Acrobat 2013 Pinot Noir (Oregon),Pinot Noir,Acrobat,Good,Red,2013


In [9]:
# Drop the Mongo `_id` together with the description, title and winery columns,
# none of which are used as features below.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell safe to
# re-run: a second execution no longer raises KeyError for already-removed columns.
wine_df = wine_df.drop(columns=['_id', 'description', 'title', 'winery'], errors='ignore')

In [10]:
# Display the frame after the drop (the rendered output elides the middle rows)
wine_df

Unnamed: 0,country,points,price,province,region,variety,rating_category,type,vintage
0,US,87,14,Oregon,Willamette Valley,Pinot Gris,Good,White,2013
1,US,87,12,California,Central Coast,Chardonnay,Good,White,2012
2,Argentina,87,13,Mendoza Province,Mendoza,Malbec,Good,Red,2011
3,US,87,23,Virginia,Virginia,Red Blend,Good,Red,2012
4,US,87,20,Oregon,Oregon,Pinot Noir,Good,Red,2013
...,...,...,...,...,...,...,...,...,...
77926,US,91,36,California,Mendocino,Chardonnay,Excellent,White,2006
77927,US,90,35,California,Napa Valley,Chardonnay,Very Good,White,2013
77928,Italy,90,20,Sicily & Sardinia,Terre Siciliane,Red Blend,Very Good,Red,2012
77929,US,90,35,California,Napa Valley,Zinfandel,Very Good,Red,2012


Examine columns for encoding

In [11]:
# Count the distinct values in every column to gauge each column's cardinality
wine_df.nunique()

country               7
points               21
price               341
province             63
region             1000
variety              21
rating_category       6
type                  4
vintage              40
dtype: int64

In [12]:
# Number of rows per province, most frequent first
wine_df['province'].value_counts()

province
California            31456
Washington             7387
Oregon                 4821
Tuscany                4225
Bordeaux               3390
                      ...  
Nevada                    2
Northwestern Italy        2
Kentucky                  1
Rhode Island              1
Hawaii                    1
Name: count, Length: 63, dtype: int64

In [13]:
# Provinces that occur more than once. The callable handed to .loc receives the
# value_counts() Series itself, so the counts are computed once instead of twice.
wine_df['province'].value_counts().loc[lambda counts: counts > 1]

province
California              31456
Washington               7387
Oregon                   4821
Tuscany                  4225
Bordeaux                 3390
Burgundy                 3058
Mendoza Province         2731
Piedmont                 2292
New York                 2252
Northern Spain           1770
Alsace                   1446
Champagne                1361
Loire Valley             1085
Provence                 1011
Catalonia                 981
Southwest France          872
Rhône Valley              744
Veneto                    666
Sicily & Sardinia         648
South Australia           625
Virginia                  545
Languedoc-Roussillon      481
Northeastern Italy        470
France Other              448
Central Italy             412
Other                     356
Central Spain             338
Lombardy                  302
Southern Italy            208
Levante                   201
Western Australia         200
Victoria                  175
Australia Other           151
B

In [14]:
# Columns to drop: region, province (1000 and 63 distinct values respectively).
# Reassign with errors='ignore' rather than inplace=True so that re-running the cell
# does not raise KeyError once the columns are gone.
wine_df = wine_df.drop(columns=['region', 'province'], errors='ignore')

# Columns for encoding: country, variety, rating category, type and vintage

In [15]:
# country: give every distinct country an integer code 1..N, in order of first appearance.
# unique() already returns the values in that order, so the hand-written append/counter
# loops are replaced by list() and range().
# NOTE(review): integer codes imply an ordering between countries - confirm that is
# acceptable for the downstream model, otherwise consider one-hot encoding.
countries = list(wine_df['country'].unique())
country_encoding_count = list(range(1, len(countries) + 1))

print(countries)
print(country_encoding_count)

['US', 'Argentina', 'Italy', 'France', 'Australia', 'Spain', 'Canada']
[1, 2, 3, 4, 5, 6, 7]


In [16]:
# Pair each country name with its integer code to form the lookup table, then show it
country_dict = {name: code for name, code in zip(countries, country_encoding_count)}
country_dict

{'US': 1,
 'Argentina': 2,
 'Italy': 3,
 'France': 4,
 'Australia': 5,
 'Spain': 6,
 'Canada': 7}

In [17]:
# Replace each country name with its integer code and preview the result.
# Run once per freshly loaded frame: values that are not keys of country_dict
# (e.g. already-encoded integers) map to NaN.
country_codes = wine_df['country'].map(country_dict)
wine_df['country'] = country_codes
wine_df.head()

Unnamed: 0,country,points,price,variety,rating_category,type,vintage
0,1,87,14,Pinot Gris,Good,White,2013
1,1,87,12,Chardonnay,Good,White,2012
2,2,87,13,Malbec,Good,Red,2011
3,1,87,23,Red Blend,Good,Red,2012
4,1,87,20,Pinot Noir,Good,Red,2013


In [18]:
# variety: give every distinct variety an integer code 1..N in order of first appearance,
# then replace the names in the frame with those codes.
# list()/range() replace the manual append/counter loops (same values, no scratch variables).
variety = list(wine_df['variety'].unique())
variety_encoding_count = list(range(1, len(variety) + 1))
variety_dict = dict(zip(variety, variety_encoding_count))
wine_df['variety'] = wine_df['variety'].map(variety_dict)
wine_df.head()

Unnamed: 0,country,points,price,variety,rating_category,type,vintage
0,1,87,14,1,Good,White,2013
1,1,87,12,2,Good,White,2012
2,2,87,13,3,Good,Red,2011
3,1,87,23,4,Good,Red,2012
4,1,87,20,5,Good,Red,2013


In [19]:
# Show the variety -> integer code lookup for reference
variety_dict


{'Pinot Gris': 1,
 'Chardonnay': 2,
 'Malbec': 3,
 'Red Blend': 4,
 'Pinot Noir': 5,
 'White Blend': 6,
 'Merlot': 7,
 'Cabernet Sauvignon': 8,
 'Sauvignon Blanc': 9,
 'Riesling': 10,
 'Champagne Blend': 11,
 'Sangiovese': 12,
 'Cabernet Franc': 13,
 'Rosé': 14,
 'Bordeaux-style Red Blend': 15,
 'Zinfandel': 16,
 'Syrah': 17,
 'Nebbiolo': 18,
 'Rhône-style Red Blend': 19,
 'Sparkling Blend': 20,
 'Tempranillo': 21}

In [20]:
# rating_category: give every distinct category an integer code 1..N in order of first
# appearance, then replace the labels in the frame with those codes.
# list()/range() replace the manual append/counter loops (same values, no scratch variables).
# NOTE(review): the codes follow appearance order, not rating order (e.g. 'Acceptable'
# is coded after 'Excellent') - confirm the model does not rely on the codes being ordinal.
rating_category = list(wine_df['rating_category'].unique())
rating_category_encoding_count = list(range(1, len(rating_category) + 1))
rating_category_dict = dict(zip(rating_category, rating_category_encoding_count))
wine_df['rating_category'] = wine_df['rating_category'].map(rating_category_dict)
wine_df.head()

Unnamed: 0,country,points,price,variety,rating_category,type,vintage
0,1,87,14,1,1,White,2013
1,1,87,12,2,1,White,2012
2,2,87,13,3,1,Red,2011
3,1,87,23,4,1,Red,2012
4,1,87,20,5,1,Red,2013


In [21]:
# Show the rating category -> integer code lookup for reference
rating_category_dict

{'Good': 1,
 'Very Good': 2,
 'Excellent': 3,
 'Acceptable': 4,
 'Superb': 5,
 'Classic': 6}

In [22]:
# type: give every distinct wine type an integer code 1..N in order of first appearance,
# then replace the labels in the frame with those codes.
# The list is named `wine_types` rather than `type`: binding a list to `type` shadows the
# builtin for the rest of the kernel session, so any later `type(obj)` call would fail.
wine_types = list(wine_df['type'].unique())
type_encoding_count = list(range(1, len(wine_types) + 1))
type_category_dict = dict(zip(wine_types, type_encoding_count))
wine_df['type'] = wine_df['type'].map(type_category_dict)
wine_df.head()

Unnamed: 0,country,points,price,variety,rating_category,type,vintage
0,1,87,14,1,1,1,2013
1,1,87,12,2,1,1,2012
2,2,87,13,3,1,2,2011
3,1,87,23,4,1,2,2012
4,1,87,20,5,1,2,2013


In [23]:
# Show the wine type -> integer code lookup for reference
type_category_dict

{'White': 1, 'Red': 2, 'Sparkling': 3, 'Rosé': 4}

In [24]:
# vintage: give every distinct vintage an integer code 1..N in order of first appearance,
# then replace the values in the frame with those codes.
# (The previous header comment said `rating_category_dict`, a copy-paste leftover.)
# list()/range() replace the manual append/counter loops (same values, no scratch variables).
# NOTE(review): appearance-order codes discard the chronological order of the years
# (2013 -> 1, 2012 -> 2, 2011 -> 3 in the preview) - confirm this is intended rather than
# keeping vintage as a numeric feature.
vintage = list(wine_df['vintage'].unique())
vintage_encoding_count = list(range(1, len(vintage) + 1))
vintage_category_dict = dict(zip(vintage, vintage_encoding_count))
wine_df['vintage'] = wine_df['vintage'].map(vintage_category_dict)
wine_df.head()

Unnamed: 0,country,points,price,variety,rating_category,type,vintage
0,1,87,14,1,1,1,1
1,1,87,12,2,1,1,2
2,2,87,13,3,1,2,3
3,1,87,23,4,1,2,2
4,1,87,20,5,1,2,1


In [None]:
# TODO: placeholder cell, kept commented out - the target column name is still blank.
# NOTE(review): `wine_dummies_df` and `train_test_split` are not defined or imported in
# the cells above; they must be provided before this cell is enabled.
# # Split our preprocessed data into our features and target arrays
# y = wine_dummies_df['']
# X = wine_dummies_df.drop (columns = '')

# # Split the preprocessed data into a training and testing dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# TODO: placeholder cell, kept commented out - it depends on X_train / X_test, which only
# exist in another commented-out cell.
# NOTE(review): `StandardScaler` is not imported in the cells above; import it before
# enabling this cell.
# # Create a StandardScaler instances
# scaler = StandardScaler()

# # Fit the StandardScaler
# X_scaler = scaler.fit(X_train)

# # Scale the data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)