# Loading

In [1]:
# Import the cleaned wine data (a JSON array) into the `ratings` collection of the
# `wine` database. `--drop` removes any existing `ratings` collection first, so
# re-running this cell replaces the data instead of appending to it.
!mongoimport --type json -d wine -c ratings --drop --jsonArray ../Resources/clean_wine_data_final.json

2024-05-07T19:50:44.609-0700	connected to: mongodb://localhost/
2024-05-07T19:50:44.609-0700	dropping: wine.ratings
2024-05-07T19:50:46.705-0700	77931 document(s) imported successfully. 0 document(s) failed to import.


In [2]:
# Importing Necessary Libraries
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

# Loading from MongoDB into a DataFrame

In [3]:
# Create a MongoClient for the server listening on port 27017
# (no host is given, so pymongo's default host is used)
mongo = MongoClient(port=27017)

In [4]:
# NOTE(review): this cell repeats the previous cell verbatim and only rebinds
# `mongo` to a second, identical client; it is redundant and can be removed.
# Create an instance of MongoClient
mongo = MongoClient(port=27017)

In [5]:
# Get a handle to the `wine` database and list its collections
# to confirm the imported `ratings` collection is present
db = mongo['wine']
db.list_collection_names()

['ratings']

In [6]:
# Keep a handle to the `ratings` collection under its own name.
# `wine_df` is reserved for the pandas DataFrame built from the collection's
# documents, so the pymongo Collection must not share that name.
ratings_collection = db['ratings']

In [7]:
# Query the ratings collection with an empty filter, i.e. fetch every document
cursor = db["ratings"].find({})
# Exhaust the cursor so all documents are held in memory as a list
json_data = [document for document in cursor]

In [8]:
# Build a DataFrame from the list of documents and preview the first rows
wine_df = pd.DataFrame(data=json_data)
wine_df.head(5)

Unnamed: 0,_id,country,description,points,price,province,region,title,variety,winery,rating_category,type,vintage
0,663ae8840a70a05d2e5496fb,France,This has great depth of flavor with its fresh ...,87,27,Alsace,Alsace,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam,Good,White,2012
1,663ae8840a70a05d2e5496fc,US,"Slightly reduced, this wine offers a chalky, t...",87,34,California,Alexander Valley,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini,Good,Red,2012
2,663ae8840a70a05d2e5496fd,US,Building on 150 years and six generations of w...,87,12,California,Central Coast,Mirassou 2012 Chardonnay (Central Coast),Chardonnay,Mirassou,Good,White,2012
3,663ae8840a70a05d2e5496fe,US,"Soft, supple plum envelopes an oaky structure ...",87,19,California,Napa Valley,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature,Good,Red,2011
4,663ae8840a70a05d2e5496ff,Argentina,Raw black-cherry aromas are direct and simple ...,87,13,Mendoza Province,Mendoza,Gaucho Andino 2011 Winemaker Selection Malbec ...,Malbec,Gaucho Andino,Good,Red,2011


# DataFrame manipulation

In [9]:
# Remove the MongoDB document id together with the description, title,
# winery and region columns, then preview the result
wine_df = wine_df.drop(['_id', 'description', 'title', 'winery', 'region'], axis=1)
wine_df.head(5)

Unnamed: 0,country,points,price,province,variety,rating_category,type,vintage
0,France,87,27,Alsace,Pinot Gris,Good,White,2012
1,US,87,34,California,Cabernet Sauvignon,Good,Red,2012
2,US,87,12,California,Chardonnay,Good,White,2012
3,US,87,19,California,Cabernet Sauvignon,Good,Red,2011
4,Argentina,87,13,Mendoza Province,Malbec,Good,Red,2011


In [10]:
# Boolean target: True when the wine scored 90 points or more
wine_df['target'] = wine_df['points'].ge(90)
wine_df.tail(5)

Unnamed: 0,country,points,price,province,variety,rating_category,type,vintage,target
77926,US,90,22,California,Zinfandel,Very Good,Red,2011,True
77927,France,90,32,Alsace,Pinot Gris,Very Good,White,2012,True
77928,France,90,28,Alsace,Pinot Gris,Very Good,White,2013,True
77929,US,90,75,Oregon,Pinot Noir,Very Good,Red,2004,True
77930,France,90,57,Alsace,Pinot Gris,Very Good,White,2010,True


In [11]:
# Drop the unneeded columns (`points` is the column the target was derived from)
wine_df = wine_df.drop(['points', 'rating_category', 'province', 'type'], axis=1)

In [12]:
# Review the columns that remain after the drops
wine_df.head()

Unnamed: 0,country,price,variety,vintage,target
0,France,27,Pinot Gris,2012,False
1,US,34,Cabernet Sauvignon,2012,False
2,US,12,Chardonnay,2012,False
3,US,19,Cabernet Sauvignon,2011,False
4,Argentina,13,Malbec,2011,False


In [13]:
# Check for nulls: info() lists the non-null count and dtype of every column
wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77931 entries, 0 to 77930
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   country  77931 non-null  object
 1   price    77931 non-null  int64 
 2   variety  77931 non-null  object
 3   vintage  77931 non-null  object
 4   target   77931 non-null  bool  
dtypes: bool(1), int64(1), object(3)
memory usage: 2.5+ MB


In [14]:
# Check for invalid entries by listing the distinct values in `vintage`
wine_df['vintage'].unique()

array(['2012', '2011', '2013', '2007', '2010', '2014', '2015', 'No Year',
       '2009', '2016', '2004', '2003', '2006', '2001', '2008', '2005',
       '2002', '1991', '1999', '1887', '1997', '2000', '2017', '1996',
       '1998', '1995', '1994', '1992', '1990', '1988', '1872', '1989',
       '1993', '1882', '1985', '1904', '1986', '1987', '1919', '1945'],
      dtype=object)

In [15]:
# Mark the 'No Year' placeholder as missing (pd.NA); rows are not dropped in this cell
wine_df['vintage'] = wine_df['vintage'].replace(to_replace='No Year', value=pd.NA)
wine_df.head(5)

Unnamed: 0,country,price,variety,vintage,target
0,France,27,Pinot Gris,2012,False
1,US,34,Cabernet Sauvignon,2012,False
2,US,12,Chardonnay,2012,False
3,US,19,Cabernet Sauvignon,2011,False
4,Argentina,13,Malbec,2011,False


In [16]:
# Confirm 'No Year' no longer appears among the distinct vintage values
wine_df['vintage'].unique()

array(['2012', '2011', '2013', '2007', '2010', '2014', '2015', <NA>,
       '2009', '2016', '2004', '2003', '2006', '2001', '2008', '2005',
       '2002', '1991', '1999', '1887', '1997', '2000', '2017', '1996',
       '1998', '1995', '1994', '1992', '1990', '1988', '1872', '1989',
       '1993', '1882', '1985', '1904', '1986', '1987', '1919', '1945'],
      dtype=object)

In [17]:
# Re-check the non-null counts after the replacement
wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77931 entries, 0 to 77930
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   country  77931 non-null  object
 1   price    77931 non-null  int64 
 2   variety  77931 non-null  object
 3   vintage  75256 non-null  object
 4   target   77931 non-null  bool  
dtypes: bool(1), int64(1), object(3)
memory usage: 2.5+ MB


In [18]:
# Drop the rows whose vintage was set to null.
# dropna() returns a new DataFrame rather than modifying `wine_df` in place, so the
# result has to be assigned back; otherwise the null rows remain and are carried
# into the encoding and train/test steps.
wine_df = wine_df.dropna()
wine_df

Unnamed: 0,country,price,variety,vintage,target
0,France,27,Pinot Gris,2012,False
1,US,34,Cabernet Sauvignon,2012,False
2,US,12,Chardonnay,2012,False
3,US,19,Cabernet Sauvignon,2011,False
4,Argentina,13,Malbec,2011,False
...,...,...,...,...,...
77926,US,22,Zinfandel,2011,True
77927,France,32,Pinot Gris,2012,True
77928,France,28,Pinot Gris,2013,True
77929,US,75,Pinot Noir,2004,True


In [19]:
# One-hot encode the categorical columns with `pd.get_dummies`,
# emitting integer 0/1 indicator columns
wine_dummies_df = pd.get_dummies(data=wine_df, dtype=int)
wine_dummies_df.head(5)

Unnamed: 0,price,target,country_Argentina,country_Australia,country_Canada,country_France,country_Italy,country_Spain,country_US,variety_Bordeaux-style Red Blend,...,vintage_2008,vintage_2009,vintage_2010,vintage_2011,vintage_2012,vintage_2013,vintage_2014,vintage_2015,vintage_2016,vintage_2017
0,27,False,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,34,False,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,12,False,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,19,False,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,13,False,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


# Future use: train/test split and feature scaling

In [20]:
# Separate the feature matrix from the label column
X = wine_dummies_df.drop('target', axis=1)
y = wine_dummies_df['target']

# Hold out a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [21]:
# Standardize the features: the scaler is fitted on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training split first, then to the test split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)