In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
import statsmodels.api as sm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from scipy.sparse import hstack
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN
%matplotlib inline

  from pandas.core import datetools


In [2]:
data = pd.read_csv('data/winemag-data_first150k.csv')
data.head(5)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [3]:
data[data.duplicated('description',keep=False)].sort_values('description').head(5)
data = data.drop_duplicates('description')
data = data[pd.notnull(data.price)]
data = data[pd.notnull(data.country)]
data.shape

(89105, 11)

In [98]:
print data.description[400]
print data.description[200]

The aromas that come together are those of scorched earth, grilled porcini, roasted coffee bean and a bit of burnt rubber. The mature palate offers dried cherry, blackberry confiture, mocha, vanilla and a hint of game alongside tongue-drying tannins that clench the finish. Give this a few years to unfold then drink sooner rather than later.
Intense notes of crushed stone and slate permeate this stately off-dry Riesling. Pert lemon and tangerine flavors are zippy and fresh, but the wine is more a showcase for its brisk, stony style.


In [11]:
# leave everything except for description out
X = data.drop(['Unnamed: 0','country','designation','points','province','region_1','region_2','variety','winery'], axis = 1)
y = data.variety

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

wine =data.variety.unique().tolist()
wine.sort()
print wine

((59991, 2), (19997, 2), (59991,), (19997,))
['Aglianico', 'Albari\xc3\xb1o', 'Barbera', 'Bordeaux-style Red Blend', 'Bordeaux-style White Blend', 'Cabernet Franc', 'Cabernet Sauvignon', 'Carmen\xc3\xa8re', 'Champagne Blend', 'Chardonnay', 'Chenin Blanc', 'Corvina, Rondinella, Molinara', 'Gamay', 'Garnacha', 'Gew\xc3\xbcrztraminer', 'Glera', 'Grenache', 'Gr\xc3\xbcner Veltliner', 'Malbec', 'Meritage', 'Merlot', 'Moscato', 'Nebbiolo', "Nero d'Avola", 'Petite Sirah', 'Pinot Blanc', 'Pinot Grigio', 'Pinot Gris', 'Pinot Noir', 'Port', 'Portuguese Red', 'Portuguese White', 'Prosecco', 'Red Blend', 'Rh\xc3\xb4ne-style Red Blend', 'Rh\xc3\xb4ne-style White Blend', 'Riesling', 'Ros\xc3\xa9', 'Sangiovese', 'Sangiovese Grosso', 'Sauvignon Blanc', 'Shiraz', 'Sparkling Blend', 'Syrah', 'Tempranillo', 'Tempranillo Blend', 'Torront\xc3\xa9s', 'Verdejo', 'Viognier', 'White Blend', 'Zinfandel']


In [12]:
# Store the train test split 
with open('X_train', 'wb') as fp1:
    pickle.dump(X_train, fp1)

with open('X_test', 'wb') as fp2:
    pickle.dump(X_test, fp2)
    
with open('y_train', 'wb') as fp3:
    pickle.dump(y_train, fp3)

with open('y_test', 'wb') as fp4:
    pickle.dump(y_test, fp4)

In [13]:
# reimport 
with open ('X_train', 'rb') as fp1:
    X_train_pk = pickle.load(fp1)

with open ('X_test', 'rb') as fp2:
    X_test_pk = pickle.load(fp2)
    
with open ('y_train', 'rb') as fp3:
    y_train_pk = pickle.load(fp3)

with open ('y_test', 'rb') as fp4:
    y_test_pk = pickle.load(fp4)

In [14]:
output = set()
for x in data.variety:
    x = x.lower()
    x = x.split()
    for y in x:
        output.add(y)

variety_list =sorted(output)
print variety_list[:10]
print len(variety_list) # too many classes?

['aglianico', 'albari\xc3\xb1o', 'barbera', 'blanc', 'blend', 'bordeaux-style', 'cabernet', 'carmen\xc3\xa8re', 'champagne', 'chardonnay']
54


In [15]:
extras = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', 'cab',"%"]
stop = set(stopwords.words('english'))
stop.update(variety_list)
stop.update(extras)

# features
vect = CountVectorizer(stop_words = stop)
X_train_dtm = vect.fit_transform(X_train.description)
price = X_train.price.values[:,None]
X_train_dtm = hstack((X_train_dtm, price))
print X_train_dtm.shape

X_test_dtm = vect.transform(X_test.description)
price_test = X_test.price.values[:,None]
X_test_dtm = hstack((X_test_dtm, price_test))
print X_test_dtm.shape

(59991, 24953)
(19997, 24953)


In [16]:
# store the train and test data for all different methods
# save words into importable object
with open('data_train', 'wb') as fp5:
    pickle.dump(X_train_dtm, fp5)

with open('data_test', 'wb') as fp6:
    pickle.dump(X_test_dtm, fp6)

In [17]:
# read back
with open ('data_train', 'rb') as fp5:
    X_train_dtm = pickle.load(fp5)

with open ('data_test', 'rb') as fp6:
    X_test_dtm = pickle.load(fp6)

In [19]:
models = {}
for z in wine:
    model = LogisticRegression()
    y = y_train == z
    model.fit(X_train_dtm, y)
    models[z] = model
testing_probs = pd.DataFrame(columns = wine)

# print score
for variety in wine:
    testing_probs[variety] = models[variety].predict_proba(X_test_dtm)[:,1]

predicted_wine = testing_probs.idxmax(axis=1)

comparison = pd.DataFrame({'actual':y_test.values, 'predicted':predicted_wine.values})

print('Accuracy Score:',accuracy_score(comparison.actual, comparison.predicted)*100,"%")
print comparison.head(20)

('Accuracy Score:', 52.945721460305194, '%')
                   actual           predicted
0              Chardonnay          Chardonnay
1               Aglianico           Aglianico
2              Pinot Noir          Pinot Noir
3               Carmenère              Malbec
4      Cabernet Sauvignon  Cabernet Sauvignon
5   Rhône-style Red Blend           Red Blend
6               Carmenère            Garnacha
7         Sauvignon Blanc     Sauvignon Blanc
8                  Shiraz  Cabernet Sauvignon
9              Pinot Noir          Pinot Noir
10               Riesling            Riesling
11             Pinot Noir          Pinot Noir
12        Sauvignon Blanc     Sauvignon Blanc
13               Nebbiolo            Nebbiolo
14             Sangiovese           Red Blend
15           Nero d'Avola           Red Blend
16                  Syrah  Cabernet Sauvignon
17               Albariño            Albariño
18               Viognier          Pinot Gris
19     Cabernet Sauvignon  Cabernet

# Logistic Regression

In [21]:
len(set(y_train))

51

In [44]:
models = {}
for z in wine:
    model = LogisticRegression()
    y = y_train == z
    model.fit(X_train_dtm, y)
    models[z] = model
testing_probs = pd.DataFrame(columns = wine)
training_probs = pd.DataFrame(columns = wine)

# predict for training set 
for variety in wine:
    training_probs[variety] = models[variety].predict_proba(X_train_dtm)[:,1]
train_predicted_wine = training_probs.idxmax(axis=1)

# predict for test set
for variety in wine:
    testing_probs[variety] = models[variety].predict_proba(X_test_dtm)[:,1]
test_predicted_wine = testing_probs.idxmax(axis=1)

train_comparison = pd.DataFrame({'actual':y_train.values, 'predicted':train_predicted_wine.values})
test_comparison = pd.DataFrame({'actual':y_test.values, 'predicted':test_predicted_wine.values})

print('Training Set Accuracy Score:',accuracy_score(train_comparison.actual, train_comparison.predicted)*100,"%")
print('Test Set Accuracy Score:',accuracy_score(test_comparison.actual, test_comparison.predicted)*100,"%")
print test_comparison.head(20)

('Training Set Accuracy Score:', 84.552682902435365, '%')
('Test Set Accuracy Score:', 54.438165724858735, '%')
                actual                 predicted
0           Pinot Noir                Pinot Noir
1   Cabernet Sauvignon        Cabernet Sauvignon
2      Champagne Blend                Chardonnay
3          Tempranillo                    Malbec
4      Champagne Blend                Chardonnay
5     Grüner Veltliner                 Red Blend
6               Shiraz        Cabernet Sauvignon
7            Red Blend                 Red Blend
8                Syrah                     Syrah
9            Red Blend        Cabernet Sauvignon
10          Chardonnay                Chardonnay
11          Pinot Noir                Pinot Noir
12            Grenache                Sangiovese
13            Riesling                  Riesling
14          Chardonnay                Chardonnay
15           Red Blend                 Red Blend
16          Pinot Noir                Pinot Noir
17    

In [92]:
print len(wine)

51


In [48]:
# target names
target_names = ['Aglianico', 'Albarino', 'Barbera', 'Bordeaux-style Red Blend', 'Bordeaux-style White Blend',
 'Cabernet Franc', 'Cabernet Sauvignon', 'Carmenere', 'Champagne Blend', 'Chardonnay', 'Chenin Blanc', 
 'Corvina, Rondinella, Molinara', 'Gamay', 'Garnacha', 'Gewurztraminer', 'Glera', 'Grenache', 
 'Gruner Veltliner ', 'Malbec', 'Meritage', 'Merlot', 'Moscato', 'Nebbiolo', "Nero d'Avola", 
 'Petite Sirah', 'Pinot Blanc', 'Pinot Grigio', 'Pinot Gris', 'Pinot Noir', 'Port', 'Portuguese Red', 
 'Portuguese White', 'Prosecco', 'Red Blend', 'Rhone-style Red Blend', 'Rhone-style White Blend', 'Riesling', 
 'Rose', 'Sangiovese', 'Sangiovese Grosso', 'Sauvignon Blanc', 'Shiraz', 
 'Sparkling Blend', 'Syrah', 'Tempranillo', 'Tempranillo Blend', 'Torrontes', 'Verdejo', 
 'Viognier', 'White Blend', 'Zinfandel']

In [49]:
# from sklearn.metrics import classification_report
y_true = y_test.values
y_pred = test_predicted_wine.values 
print(classification_report(y_true, y_pred, target_names=target_names))

                               precision    recall  f1-score   support

                    Aglianico       0.60      0.21      0.31        57
                     Albarino       0.62      0.49      0.55        77
                      Barbera       0.58      0.26      0.36       117
     Bordeaux-style Red Blend       0.52      0.51      0.52       803
   Bordeaux-style White Blend       0.51      0.28      0.36        94
               Cabernet Franc       0.17      0.05      0.08       200
           Cabernet Sauvignon       0.46      0.62      0.53      2009
                    Carmenere       0.47      0.35      0.40       121
              Champagne Blend       0.58      0.40      0.47       168
                   Chardonnay       0.64      0.84      0.73      2172
                 Chenin Blanc       0.37      0.18      0.24        89
Corvina, Rondinella, Molinara       0.79      0.71      0.75       227
                        Gamay       0.71      0.35      0.47        68
     

# Tuning Logistic Regression 
- different levels of regularization
- L1 vs. L2 penalty

In [50]:
# L2 Penalty
params = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

for p in params:
    
    models = {}
    for z in wine:
        model = LogisticRegression(penalty='l2',C=p)
        y = y_train == z
        model.fit(X_train_dtm, y)
        models[z] = model
    testing_probs = pd.DataFrame(columns = wine)
    training_probs = pd.DataFrame(columns = wine)

    # predict for training set 
    for variety in wine:
        training_probs[variety] = models[variety].predict_proba(X_train_dtm)[:,1]
    train_predicted_wine = training_probs.idxmax(axis=1)

    # predict for test set
    for variety in wine:
        testing_probs[variety] = models[variety].predict_proba(X_test_dtm)[:,1]
    test_predicted_wine = testing_probs.idxmax(axis=1)

    train_comparison = pd.DataFrame({'actual':y_train.values, 'predicted':train_predicted_wine.values})
    test_comparison = pd.DataFrame({'actual':y_test.values, 'predicted':test_predicted_wine.values})
    
    print('The regularization constant is', p)
    print('Training Set Accuracy Score:',accuracy_score(train_comparison.actual, train_comparison.predicted)*100,"%")
    print('Test Set Accuracy Score:',accuracy_score(test_comparison.actual, test_comparison.predicted)*100,"%")

  np.exp(prob, prob)


('The regularization constant is', 0.001)
('Training Set Accuracy Score:', 35.218616125752192, '%')
('Test Set Accuracy Score:', 34.045106766014904, '%')
('The regularization constant is', 0.01)
('Training Set Accuracy Score:', 49.020686436298774, '%')
('Test Set Accuracy Score:', 45.996899534930243, '%')
('The regularization constant is', 0.1)
('Training Set Accuracy Score:', 65.588171559067206, '%')
('Test Set Accuracy Score:', 53.798069710456566, '%')
('The regularization constant is', 1)
('Training Set Accuracy Score:', 84.552682902435365, '%')
('Test Set Accuracy Score:', 54.438165724858735, '%')
('The regularization constant is', 10)
('Training Set Accuracy Score:', 94.709206380957141, '%')
('Test Set Accuracy Score:', 50.457568635295289, '%')
('The regularization constant is', 100)
('Training Set Accuracy Score:', 97.627977529962834, '%')
('Test Set Accuracy Score:', 46.056908536280446, '%')
('The regularization constant is', 1000)
('Training Set Accuracy Score:', 98.19139537597

In [56]:
# L1 PENAULTY
params = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

for p in params:
    
    models = {}
    for z in wine:
        model = LogisticRegression(penalty='l1',C = p)
        y = y_train == z
        model.fit(X_train_dtm, y)
        models[z] = model
    testing_probs = pd.DataFrame(columns = wine)
    training_probs = pd.DataFrame(columns = wine)

    # predict for training set 
    for variety in wine:
        training_probs[variety] = models[variety].predict_proba(X_train_dtm)[:,1]
    train_predicted_wine = training_probs.idxmax(axis=1)

    # predict for test set
    for variety in wine:
        testing_probs[variety] = models[variety].predict_proba(X_test_dtm)[:,1]
    test_predicted_wine = testing_probs.idxmax(axis=1)

    train_comparison = pd.DataFrame({'actual':y_train.values, 'predicted':train_predicted_wine.values})
    test_comparison = pd.DataFrame({'actual':y_test.values, 'predicted':test_predicted_wine.values})
    
    print('The regularization constant is', p)
    print('Training Set Accuracy Score:',accuracy_score(train_comparison.actual, train_comparison.predicted)*100,"%")
    print('Test Set Accuracy Score:',accuracy_score(test_comparison.actual, test_comparison.predicted)*100,"%")

('The regularization constant is', 0.001)
('Training Set Accuracy Score:', 17.707656148422263, '%')
('Test Set Accuracy Score:', 17.357603640546081, '%')
('The regularization constant is', 0.01)
('Training Set Accuracy Score:', 37.82567385107766, '%')
('Test Set Accuracy Score:', 36.885532829924486, '%')
('The regularization constant is', 0.1)
('Training Set Accuracy Score:', 53.468020203030456, '%')
('Test Set Accuracy Score:', 50.632594889233381, '%')
('The regularization constant is', 1)
('Training Set Accuracy Score:', 74.346151922788422, '%')
('Test Set Accuracy Score:', 55.068260239035858, '%')
('The regularization constant is', 10)
('Training Set Accuracy Score:', 96.267773499358228, '%')
('Test Set Accuracy Score:', 48.497274591188678, '%')
('The regularization constant is', 100)
('Training Set Accuracy Score:', 98.731476388124889, '%')
('Test Set Accuracy Score:', 42.396359453918087, '%')
('The regularization constant is', 1000)
('Training Set Accuracy Score:', 99.318231067993

# Possible Improvements
- Regularization?
- Fewer features for text (is it fixable by regularization though)?
- More predicting variables

In [53]:
print set(X_train_dtm.data)

set([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0, 128.0, 129.0, 130.0, 131.0, 132.0, 133.0, 134.0, 135.0, 136.0, 137.0, 138.0, 535.0, 140.0, 141.0, 142.0, 143.0, 144.0, 145.0, 146.0, 147.0, 148.0, 149.0, 150.0, 151.0, 152.0, 153.0, 154.0, 155.0, 156.0, 157.0, 158.

In [70]:
# the numbers
words = vect.get_feature_names()
print len(words) 
print words[:500]
print words[494:500]

25415
[u'00', u'000', u'002', u'008', u'01', u'01s', u'02', u'02s', u'03', u'03s', u'04', u'04s', u'05', u'056', u'05s', u'06', u'061', u'064', u'06s', u'07', u'07s', u'08', u'080', u'082', u'08s', u'09', u'093', u'10', u'100', u'1000', u'100g', u'101', u'103', u'104', u'105', u'106', u'107', u'108', u'10th', u'11', u'110', u'1100s', u'111', u'112', u'114', u'115', u'117', u'1170', u'11th', u'12', u'120', u'1200', u'122', u'1232', u'125', u'126', u'128', u'12g', u'12th', u'13', u'130', u'135', u'138', u'1396', u'13th', u'14', u'140', u'143', u'1475', u'1492', u'1498', u'14g', u'14th', u'15', u'150', u'1500', u'1500s', u'150th', u'151', u'1522', u'153', u'154', u'15g', u'15th', u'16', u'160', u'1600', u'1618', u'165', u'166', u'169', u'1698', u'16th', u'17', u'170', u'171', u'1716', u'174', u'1744', u'175', u'1789', u'17g', u'17th', u'18', u'180', u'1806', u'1811', u'1819', u'183', u'1843', u'1844', u'185', u'1850s', u'1860s', u'1865', u'1873', u'1875', u'1877', u'1880s', u'1882', u'188

In [59]:
# loop over # of max_features
max_features = [999, 2999, 4999, 6999, 8999]
for m in max_features:
    vect_select = CountVectorizer(stop_words = stop, max_features = m)
    X_train_select = vect_select.fit_transform(X_train.description)
    price = X_train.price.values[:,None]

    X_train_select = hstack((X_train_select, price))
    print X_train_select.shape

    X_test_select = vect_select.transform(X_test.description)
    price_test = X_test.price.values[:,None]
    X_test_select = hstack((X_test_select, price_test))
    print X_test_select.shape

    models = {}
    for z in wine:
        model = LogisticRegression(penalty='l2', C = 1.0)
        y = y_train == z
        model.fit(X_train_select, y)
        models[z] = model
        
    testing_probs = pd.DataFrame(columns = wine)
    training_probs = pd.DataFrame(columns = wine)

    # predict for training set 
    for variety in wine:
        training_probs[variety] = models[variety].predict_proba(X_train_select)[:,1]
    train_predicted_wine = training_probs.idxmax(axis=1)

    # predict for test set
    for variety in wine:
        testing_probs[variety] = models[variety].predict_proba(X_test_select)[:,1]
    test_predicted_wine = testing_probs.idxmax(axis=1)

    train_comparison = pd.DataFrame({'actual':y_train.values, 'predicted':train_predicted_wine.values})
    test_comparison = pd.DataFrame({'actual':y_test.values, 'predicted':test_predicted_wine.values})
    
    print('The number of word features is', m)
    print('Training Set Accuracy Score:',accuracy_score(train_comparison.actual, train_comparison.predicted)*100,"%")
    print('Test Set Accuracy Score:',accuracy_score(test_comparison.actual, test_comparison.predicted)*100,"%")

(59991, 1000)
(19997, 1000)
('The number of word features is', 999)
('Training Set Accuracy Score:', 59.133870080512075, '%')
('Test Set Accuracy Score:', 47.782167325098762, '%')
(59991, 3000)
(19997, 3000)
('The number of word features is', 2999)
('Training Set Accuracy Score:', 73.079295227617479, '%')
('Test Set Accuracy Score:', 52.327849177376606, '%')
(59991, 5000)
(19997, 5000)
('The number of word features is', 4999)
('Training Set Accuracy Score:', 78.180060342384692, '%')
('Test Set Accuracy Score:', 53.438015702355358, '%')
(59991, 7000)
(19997, 7000)
('The number of word features is', 6999)
('Training Set Accuracy Score:', 80.603757230251205, '%')
('Test Set Accuracy Score:', 53.813071960794126, '%')
(59991, 9000)
(19997, 9000)
('The number of word features is', 8999)
('Training Set Accuracy Score:', 81.983964261305857, '%')
('Test Set Accuracy Score:', 54.088113216982549, '%')


In [60]:
# loop over # of max_features
max_features = [9999, 10999, 11999]
for m in max_features:
    vect_select = CountVectorizer(stop_words = stop, max_features = m)
    X_train_select = vect_select.fit_transform(X_train.description)
    price = X_train.price.values[:,None]

    X_train_select = hstack((X_train_select, price))
    print X_train_select.shape

    X_test_select = vect_select.transform(X_test.description)
    price_test = X_test.price.values[:,None]
    X_test_select = hstack((X_test_select, price_test))
    print X_test_select.shape

    models = {}
    for z in wine:
        model = LogisticRegression(penalty='l2', C = 1.0)
        y = y_train == z
        model.fit(X_train_select, y)
        models[z] = model
        
    testing_probs = pd.DataFrame(columns = wine)
    training_probs = pd.DataFrame(columns = wine)

    # predict for training set 
    for variety in wine:
        training_probs[variety] = models[variety].predict_proba(X_train_select)[:,1]
    train_predicted_wine = training_probs.idxmax(axis=1)

    # predict for test set
    for variety in wine:
        testing_probs[variety] = models[variety].predict_proba(X_test_select)[:,1]
    test_predicted_wine = testing_probs.idxmax(axis=1)

    train_comparison = pd.DataFrame({'actual':y_train.values, 'predicted':train_predicted_wine.values})
    test_comparison = pd.DataFrame({'actual':y_test.values, 'predicted':test_predicted_wine.values})
    
    print('The number of word features is', m)
    print('Training Set Accuracy Score:',accuracy_score(train_comparison.actual, train_comparison.predicted)*100,"%")
    print('Test Set Accuracy Score:',accuracy_score(test_comparison.actual, test_comparison.predicted)*100,"%")

(59991, 10000)
(19997, 10000)
('The number of word features is', 9999)
('Training Set Accuracy Score:', 82.394025770532238, '%')
('Test Set Accuracy Score:', 54.173125968895327, '%')
(59991, 11000)
(19997, 11000)
('The number of word features is', 10999)
('Training Set Accuracy Score:', 82.720741444550015, '%')
('Test Set Accuracy Score:', 54.128119217882684, '%')
(59991, 12000)
(19997, 12000)
('The number of word features is', 11999)
('Training Set Accuracy Score:', 83.064126285609504, '%')
('Test Set Accuracy Score:', 54.173125968895327, '%')


## We can see that the optimal params are:
### max_features = 8999
### C = 1.0
### penalty = 'l1'

# Apply SVM

In [74]:
# features
vect_svm = CountVectorizer(stop_words = stop, max_features = 8999)

In [75]:
# Training Support Vector Machines - SVM and calculating its performance
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer

text_clf_svm = Pipeline([('vect', vect_svm), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42))])

text_clf_svm = text_clf_svm.fit(X_train.description, y_train)
predicted_svm = text_clf_svm.predict(X_test.description)
np.mean(predicted_svm == y_test)



0.49662449367405109

In [99]:
# Training Support Vector Machines - SVM and calculating its performance
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer

text_clf_svm = Pipeline([('vect', vect_svm),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42))])

text_clf_svm = text_clf_svm.fit(X_train.description, y_train)
predicted_svm = text_clf_svm.predict(X_test.description)
np.mean(predicted_svm == y_test)



0.51652747912186825

In [100]:
# Tune SVM
# All the parameters name start with the classifier name (remember the arbitrary name we gave). 
# E.g. vect__ngram_range; here we are telling to use unigram and bigrams and choose the one which is optimal.

# doing grid search for SVM# Simila 
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],'clf-svm__alpha': (0.001, 0.01, 0.1, 1, 10, 100, 1000)}

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train.description, y_train)

gs_clf_svm.best_score_
gs_clf_svm.best_params_

{'clf-svm__alpha': 0.001, 'vect__ngram_range': (1, 1)}

In [101]:
gs_clf_svm.cv_results_ 

{'mean_fit_time': array([  9.79290795,  20.96530461,  10.79501573,  22.88069646,
         10.3427763 ,  20.78903762,  11.26008534,  20.85040498,
         10.60415967,  19.20159396,   9.75585739,  18.26846464,
          9.62529008,  16.31044737]),
 'mean_score_time': array([ 3.119229  ,  5.48436499,  3.15463575,  4.37587126,  3.10733581,
         5.62439426,  2.77667753,  4.98052597,  2.8078146 ,  4.54225334,
         2.55060561,  4.31158137,  2.45465899,  3.31421796]),
 'mean_test_score': array([ 0.51349369,  0.51055992,  0.44663366,  0.45176777,  0.35041923,
         0.35668684,  0.28974346,  0.29327732,  0.27409111,  0.27619143,
         0.04777383,  0.05162441,  0.02855428,  0.02855428]),
 'mean_train_score': array([ 0.65336393,  0.70851381,  0.48301429,  0.50220092,  0.35872874,
         0.36680507,  0.29263604,  0.29693668,  0.27563354,  0.27804233,
         0.04734676,  0.05137934,  0.02854396,  0.02854396]),
 'param_clf-svm__alpha': masked_array(data = [0.001 0.001 0.01 0.01 0.1

# Naive Bayes

In [102]:
# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
text_clf = Pipeline([('vect', CountVectorizer(stop_words = stop, max_features = 8999)), ('clf', MultinomialNB())])
text_clf = text_clf.fit(X_train.description, y_train)
predicted = text_clf.predict(X_test.description)
np.mean(predicted == y_test)

0.48592288843326498

In [103]:
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
for a in alpha:
    text_clf = Pipeline([('vect', CountVectorizer(stop_words = stop, max_features = 8999)), 
                          ('clf', MultinomialNB(alpha = a))])
    text_clf = text_clf.fit(X_train.description, y_train)
    predicted = text_clf.predict(X_test.description)
    print np.mean(predicted == y_test), 'with alpha value of', a

0.45511826774 with alpha value of 0.001
0.464169625444 with alpha value of 0.01
0.473821073161 with alpha value of 0.1
0.485922888433 with alpha value of 1
0.376506475971 with alpha value of 10
0.301395209281 with alpha value of 100
0.280542081312 with alpha value of 1000


# Include More Variables - Slightly worse results

In [106]:
print 'The number of countries is', len(set(data.country)), 'points include', set(data.points)
print len(set(data.points))
print 'The number of wineries included is', len(set(data.winery)), 'The number of wines is', len(data)

The number of countries is 41 points include set([96, 97, 98, 99, 100, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95])
21
The number of wineries included is 13023 The number of wines is 79988


In [144]:
# Try including country and points to model
print set(data.country), 'number of countries', len(set(data.country))
data['country_label'] = pd.Categorical.from_array(data.country).labels
print set(data.country_label)

set(['Canada', 'Turkey', 'Italy', 'Czech Republic', 'Lebanon', 'Lithuania', 'Luxembourg', 'France', 'Slovakia', 'Argentina', 'Israel', 'Australia', 'Mexico', 'Montenegro', 'Slovenia', 'Germany', 'Bosnia and Herzegovina', 'Chile', 'China', 'Serbia', 'Spain', 'Ukraine', 'US-France', 'Georgia', 'Macedonia', 'Moldova', 'Morocco', 'Croatia', 'Japan', 'Switzerland', 'New Zealand', 'Brazil', 'Bulgaria', 'Romania', 'Albania', 'England', 'Portugal', 'South Africa', 'Uruguay', 'India', 'US', 'Austria', 'Greece', 'Hungary', 'South Korea', 'Cyprus']) number of countries 46


  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app


set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45])


In [146]:
# leave everything except for description out
X_new = data.drop(['Unnamed: 0','designation','country','province','region_1','region_2','variety','winery'], axis = 1)
y_new = data.variety
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((66828, 4), (22277, 4), (66828,), (22277,))


In [156]:
# SOME WEIRD DIMENSION ISSUE HERE - GET IT FIXED
vect_select = CountVectorizer(stop_words = stop) #MAX FEATURES = SOMETHING

X_train_select_desc = vect_select.fit_transform(X_train.description)
X_test_select_desc = vect_select.transform(X_test.description)

train_price = X_train.price.values[:,None]
test_price = X_test.price.values[:,None]

# include new features 
train_points =X_train.points.values[:, None]
test_points =X_test.points.values[:, None]

# check dimensions
print X_train_select_desc.shape,len(train_price), len(train_country), len(train_points)
print X_test_select_desc.shape,len(test_price), len(test_country), len(test_points)

(66828, 26359) 66828 66828 66828
(22277, 26359) 22277 22277 22277


In [157]:
# hstack - something still wrong with country! points is included 
X_train_select = hstack((X_train_select_desc, train_price, train_points))
X_test_select = hstack((X_test_select_desc, test_price, test_points))

In [159]:
# fit logistic regression again with default params again
models = {}
for z in wine:
    model = LogisticRegression()
    y = y_train == z
    model.fit(X_train_select, y)
    models[z] = model
testing_probs = pd.DataFrame(columns = wine)

# print score
for variety in wine:
    testing_probs[variety] = models[variety].predict_proba(X_test_select)[:,1]

predicted_wine = testing_probs.idxmax(axis=1)
comparison = pd.DataFrame({'actual':y_test.values, 'predicted':predicted_wine.values})

print('Accuracy Score with points as additional predictor:',accuracy_score(comparison.actual, comparison.predicted)*100,"%")
print comparison.head(10)

('Accuracy Score with points as additional predictor:', 49.104457512232344, '%')
               actual           predicted
0  Cabernet Sauvignon              Merlot
1    Grüner Veltliner    Grüner Veltliner
2  Cabernet Sauvignon  Cabernet Sauvignon
3   Sangiovese Grosso          Pinot Noir
4           Red Blend           Red Blend
5          Sangiovese          Pinot Noir
6            Riesling            Riesling
7          Sangiovese          Sangiovese
8    Portuguese White    Portuguese White
9      Cabernet Franc  Cabernet Sauvignon


In [161]:
# adding country as predictor 
train_country =X_train.country_label[:, None]
test_country =X_test.country_label[:, None]

# hstack - something still wrong with country! points is included 
X_train_select = hstack((X_train_select_desc, train_country, train_price, train_points))
X_test_select = hstack((X_test_select_desc, test_country, test_price, test_points))
models = {}

for z in wine:
    model = LogisticRegression()
    y = y_train == z
    model.fit(X_train_select, y)
    models[z] = model
testing_probs = pd.DataFrame(columns = wine)

# print score
for variety in wine:
    testing_probs[variety] = models[variety].predict_proba(X_test_select)[:,1]

predicted_wine = testing_probs.idxmax(axis=1)
comparison = pd.DataFrame({'actual':y_test.values, 'predicted':predicted_wine.values})

print('Accuracy Score with points and country as additional predictor:',accuracy_score(comparison.actual, comparison.predicted)*100,"%")
print comparison.head(10)

('Accuracy Score with points and country as additional predictor:', 50.608250662117882, '%')
               actual         predicted
0  Cabernet Sauvignon            Merlot
1    Grüner Veltliner  Grüner Veltliner
2  Cabernet Sauvignon         Zinfandel
3   Sangiovese Grosso        Pinot Noir
4           Red Blend         Red Blend
5          Sangiovese        Pinot Noir
6            Riesling          Riesling
7          Sangiovese        Sangiovese
8    Portuguese White  Portuguese White
9      Cabernet Franc         Red Blend
