In [30]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
%matplotlib inline

In [31]:
# Load the pre-processed review table, move the index read from the first CSV
# column back into a regular column, and keep only rows that have a
# description (the text features below need one).
# Alternative input: "Our_dataset/winemag-data-130k-v2.csv"
wine_base = (
    pd.read_csv("Our_dataset/StemmedWord2vecTop3_parsed_weather_labeled.csv", index_col=0)
    .reset_index()
    .dropna(subset=['description'])
)
wine_base.head(2)

Unnamed: 0,vintage,country,description,points,price,province,region_1,taster_name,variety,winery,...,pr_5,pr_6,pr_7,pr_8,pr_9,tas_5,tas_6,tas_7,tas_8,tas_9
0,1952,8,year ii oak deep oldgold color rich sweet wood...,95,499.0,164,1201,13,376,10276,...,120.181,54.8968,13.3656,12.3677,71.6485,16.0018,20.1771,21.6355,21.3967,18.5723
1,1952,8,astonish age rich toffe warm intens concentr d...,96,415.0,164,1201,13,376,1703,...,120.181,54.8968,13.3656,12.3677,71.6485,16.0018,20.1771,21.6355,21.3967,18.5723


In [32]:
# Target: the review score. Predictors: every remaining column.
Y = wine_base['points'].copy()
X = wine_base.drop(columns=['points'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

In [33]:
# Attributes that can be selected (via `features`) to train the model.
X_train.columns.tolist()

['vintage',
 'country',
 'description',
 'price',
 'province',
 'region_1',
 'taster_name',
 'variety',
 'winery',
 'similarityTop3WinesByVariety',
 'word_count',
 'tf_grouped_1',
 'tf_grouped_2',
 'tf_grouped_3',
 'tfIdf_grouped_1',
 'tfIdf_grouped_2',
 'tfIdf_grouped_3',
 'tf_fullData_1',
 'tf_fullData_2',
 'tf_fullData_3',
 'tfIdf_fullData_1',
 'tfIdf_fullData_2',
 'tfIdf_fullData_3',
 'pr_5',
 'pr_6',
 'pr_7',
 'pr_8',
 'pr_9',
 'tas_5',
 'tas_6',
 'tas_7',
 'tas_8',
 'tas_9']

In [42]:
# Text representation of `description`: enable CountVectorizer_data OR
# TfidfVectorizer_data OR neither, but NOT both together.
CountVectorizer_data = True
TfidfVectorizer_data = False

# The two flags are mutually exclusive; fail early instead of silently
# preferring one of them further down.
if CountVectorizer_data and TfidfVectorizer_data:
    raise ValueError("Enable at most one of CountVectorizer_data and TfidfVectorizer_data")

# Attributes that can be appended to the text features during training.
candidate_features = [
    'price', 'vintage', 'country', 'region_1', 'taster_name', 'winery',
    'variety', 'province', 'similarityTop3WinesByVariety',
    'pr_5', 'pr_6', 'pr_7', 'pr_8', 'pr_9',
    'tas_5', 'tas_6', 'tas_7', 'tas_8', 'tas_9',
    'word_count',
]

# Attributes actually used in this run. Left empty so that only the text
# features are used; set to `list(candidate_features)` (or any subset) to
# include them.
features = []

In [68]:
# Build the training design matrix from the description text and/or the
# columns listed in `features`.
use_text_features = CountVectorizer_data or TfidfVectorizer_data

if use_text_features:
    if CountVectorizer_data:
        # Raw 1- to 3-gram counts, ignoring n-grams seen in fewer than 5 descriptions.
        vect = CountVectorizer(min_df=5, ngram_range=(1, 3))
        # fit_transform learns the vocabulary and vectorises in a single pass.
        X_train_vectored_cv = vect.fit_transform(X_train['description'])
        X_train_final = X_train_vectored_cv
    else:
        # TF-IDF weighted 1- to 3-grams, ignoring n-grams seen in fewer than 50 descriptions.
        vect = TfidfVectorizer(smooth_idf=True, sublinear_tf=False, analyzer='word', min_df=50, ngram_range=(1, 3))
        X_train_vectored_tfidf = vect.fit_transform(X_train['description'])
        X_train_final = X_train_vectored_tfidf
    print("vocabulary size: {}".format(len(vect.vocabulary_)))

    if features:
        # Append all selected columns with one hstack call (stacking column by
        # column re-copies the whole sparse matrix on every iteration) and keep
        # the result in CSR format. Column order stays: vocabulary, then `features`.
        X_train_final = hstack([X_train_final, X_train[features].values], format='csr')
else:
    # No text features: train on the selected columns only.
    X_train_final = X_train.loc[:, features]
       

vocabulary size: 4772


# Training error

In [69]:
# Fit a linear regression with default settings on the training design matrix.
lr = LinearRegression()
lr.fit(X=X_train_final, y=y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [70]:
# Root mean squared error of the fitted model on the data it was trained on.
y_train_pred = lr.predict(X_train_final)
mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
rmse = np.sqrt(mse)
rmse

1.9026960047751693

# Test error

In [71]:
# Build the test design matrix with the vectorizer fitted on the training set
# (the vocabulary is never re-fitted on test data), laid out the same way as
# the training matrix: vocabulary columns first, then `features`.
if CountVectorizer_data or TfidfVectorizer_data:
    X_test_final = vect.transform(X_test['description'])
    if features:
        # One hstack call instead of one per column; result kept in CSR format.
        X_test_final = hstack([X_test_final, X_test[features].values], format='csr')
else:
    # No text features: evaluate on the selected columns only.
    X_test_final = X_test.loc[:, features]

# Root mean squared error on the held-out rows.
y_test_pred = lr.predict(X_test_final)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
rmse

1.999243281454173

In [72]:
# Show the learned weights and how many of them there are
# (one per column of the design matrix).
coefficients = lr.coef_
print('Coefficients: \n', coefficients)
print(coefficients.shape[0])

Coefficients: 
 [ 0.56856127 -0.8906486   0.27548894 ... -1.08512404 -0.06398397
 -0.63492357]
4772


In [73]:
# Names of the design-matrix columns, in the same order as `lr.coef_`:
# the vectorizer vocabulary first, then the columns appended from `features`.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out()` and fall back only on older releases.
if hasattr(vect, "get_feature_names_out"):
    vocabulary_terms = vect.get_feature_names_out()
else:
    vocabulary_terms = vect.get_feature_names()

# Including `features` keeps every coefficient index valid when extra columns
# were stacked after the vocabulary.
feature_names = np.array(list(vocabulary_terms) + list(features))
feature_names

array(['05', '06', '5050', ..., 'zip', 'zippi', 'zweigelt'], dtype='<U28')

In [74]:
# Sort the coefficients from the model
sorted_coef_index = lr.coef_.argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['vinho verd' 'dark cook' 'gala appl' 'woodland berri' 'sauvignon petit'
 'touriga nacion' 'accept' 'simpl' 'wateri' 'strang']

Largest Coefs: 
['dark cook spice' 'barrel sampl' 'verd' 'woodland' 'gala' 'stun'
 'walla walla' 'superb' 'now2025' 'nacion']


In [51]:
#basic feature                                              2.859065103992044
#basic feature + manual wordcount                           2.5488092414557646
#basic feature + word2vect                                  2.7374282971613084
#basic feature + weather                                    2.817176644052167
#basic feature + tf_grouped                                 2.8167889079501047
#basic feature + tf_fullData                                2.820394158650923
#basic feature + tfIdf_grouped                              2.800941096964716
#basic feature + tfIdf_fullData_1                           2.823836138011456        
#basic feature + tfidf                                      1.8287619171940812
#basic feature + sklearn word count                         1.8273230684121524  
#(this shows that tfidf and sklearn word count are almost equivalent, so I just use one of them)
#since sklearn word count is the best, I check the possible improvements on top of it

#basic feature + sklearn word count + manual word count     1.8269534098049776    +0.001  +-
#basic feature + sklearn word count + word2vect             1.8175582513592656    +0.01   +-
#basic feature + sklearn word count + weather               1.8143524722884636    +0.01   +-
#basic feature + sklearn word count + weather + word2vect   1.8066107753300489    +0.02   +-
#basic feature + sklearn word count + weather + word2vect + manual word count   1.8065145709633865    +0.02   ++-




My conclusion:


Both the custom TF-IDF and TF features computed per group give a better result than the same features computed on the whole dataset.
The sklearn word count and TF-IDF features are far better than the other features, and the two perform very similarly.

Both word2vect and the weather information add new information to the data.
Word2vect adds much more information than weather on top of the basic features, but on top of the basic features + sklearn word count the improvement is almost the same.

As I expected, the manual word count adds much more information (compared to word2vect and weather) on top of the basic features, but it adds very little on top of the basic features + sklearn word count, meaning that it is close to being a subset of the information in the sklearn word count.

The best thing to do would be to check the significance of the improvements with a statistical test (p-value),
i.e. to check whether improvements of 0.02, 0.01 and 0.001 are significant.

It would also be best to repeat the same experiments with another, much more complex model
to validate these results.
