**WINE QUALITY REGRESSION COMPETITION**

Armando La Rocca 
279401
s279401@studenti.polito.it

In [1]:
import pandas as pd
import numpy as np 
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords as sw
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
import collections
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix, hstack
from sklearn.metrics import r2_score
from sklearn.model_selection import ParameterGrid
from sklearn.neural_network import MLPRegressor

In [2]:
# This class is used to clean and tokenize the descriptions 

class LemmaTokenizer(object): 
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        
    def __call__(self, document): 

        lemmas = []
        for t in word_tokenize(document): 
            t = t.strip()
            t = re.sub('\W', '', t) #Remove symbols
            t = re.sub(r'\d+', '', t) #Remove numbers
            lemma = self.lemmatizer.lemmatize(t)
            lemmas.append(lemma) 
        
        return lemmas

**DATASET PREPARATION AND PREPROCESSING**

In [3]:
df = pd.read_csv("./competition_dataset/dev.tsv","\t")

# Remove duplicates and transform to lower case
df =  df.drop_duplicates()
df= df.applymap(lambda s:s.lower() if type(s) == str else s)

# Fill features (region_1,region_2 and descriptions) with missing values
df["region_2"] = df["region_2"].fillna("other")
df["region_1"] = df["region_1"].fillna("other")
df["designation"] = df["designation"].fillna("other")

# Drop samples with Nan in other features
df = df.dropna()

# Mapping same designations in the same way
d_tr = df.designation.values

new = []
for i in range(len(d_tr)):
    
    if (str(d_tr[i]) == "riserva") or (str(d_tr[i]) == "reserva") or (str(d_tr[i]) == "réserve"):
        new.append("reserve")
        
    elif (str(d_tr[i]) == "gran reserva")  :
        new.append("grand reserve")
        
    elif (str(d_tr[i]) == "reserva especial")  :
        new.append("special reserve")
        
    elif (str(d_tr[i]) == "red") or (str(d_tr[i]) == "rosso") or (str(d_tr[i]) == "tinto") :
        new.append("red wine")
        
    elif (str(d_tr[i]) == "rosé of") or (str(d_tr[i]) == "rosado") or (str(d_tr[i]) == "rosato") :
        new.append("rosé")
        
    else: new.append(d_tr[i])
        
df["designation"] = new


Approach A1

In [4]:
#DIVIDE DESCRIPTIONS IN 3 CATEGORIES

labels = []
good_review = []
bad_review = []
medium_review = []

descr = df["description"].values
    
for i in range(len(descr)):    
    x = df.quality.values[i]
    if x<=35:
        labels.append(0)
        bad_review.append(descr[i])
    if x>35 and x<65:
        labels.append(1)
        medium_review.append(descr[i])
    if x>=65:
        labels.append(2)
        good_review.append(descr[i])
        
labels = np.array(labels)
good_review = np.array(good_review)
medium_review = np.array(medium_review)
bad_review = np.array(bad_review)

lemmaTokenizer = LemmaTokenizer()
vectorizer = CountVectorizer(tokenizer=lemmaTokenizer,max_df=0.7, min_df=0.05, stop_words=sw.words('english')) 
vectorizer.fit(good_review)
print(len(vectorizer.get_feature_names()))
print(len(vectorizer.stop_words_ ),"\n")
good_ = dict(vectorizer.vocabulary_).keys()

vectorizer = CountVectorizer(tokenizer=lemmaTokenizer,max_df=0.7, min_df=0.05, stop_words=sw.words('english')) 
vectorizer.fit(medium_review)
print(len(vectorizer.get_feature_names()))
print(len(vectorizer.stop_words_ ),"\n")
medium_ = dict(vectorizer.vocabulary_).keys()

vectorizer = CountVectorizer(tokenizer=lemmaTokenizer,max_df=0.7, min_df=0.05, stop_words=sw.words('english')) 
vectorizer.fit(bad_review)
print(len(vectorizer.get_feature_names()))
print(len(vectorizer.stop_words_ ),"\n")
bad_ = dict(vectorizer.vocabulary_).keys()

intersection_set = (set(good_) ^ set(bad_) ^ set(medium_))
print(len(intersection_set))

list_set = list(intersection_set)
dictionary = {}

for i in range(len(list_set)) : 
    key = list_set[i]
    dictionary[ key ]  = i 

100
9565 

76
31708 

70
13540 

100


**VALIDATION PART**


Train/validation split

In [9]:
y = df.quality
df = df.drop(columns="quality")

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=1)

descr_train = X_train.description 
descr_test = X_test.description 

X_train = X_train.drop(columns="description")
X_test = X_test.drop(columns="description")

One hot encoding

In [10]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train.values)

hot_train =  enc.transform(X_train.values)
hot_test = enc.transform(X_test.values)

Encoding of description feature

In [11]:
lemmaTokenizer = LemmaTokenizer()

# IF APPROACH A1
#vectorizer = CountVectorizer() 
#vectorizer.fit(list_set)

# IF APPROACH A2 
vectorizer = TfidfVectorizer(tokenizer=lemmaTokenizer, max_df=0.5, min_df=10,stop_words=sw.words('english'), use_idf=False) 
vectorizer.fit(descr_train)

count_train = vectorizer.transform(descr_train)
count_test = vectorizer.transform(descr_test)

# IF APPROACH A1 OR A2
# COMMENT IF APPROACH ND 
sub_train = hstack([hot_train, count_train])
sub_test = hstack([hot_test, count_test])

Random Forest gridsearch

In [12]:
params_rf = {
    "n_estimators" : [200,500],
    "max_depth" : [2000,None],
    "max_features": [5,10,"sqrt"]
}


for config in ParameterGrid(params_rf):
    regr_ = RandomForestRegressor(**config)
    
    #IF appraoches A1 or A2
    regr_.fit(sub_train, y_train)
    
    #IF appraoche ND
    #regr_.fit(hot_train, y_train)
    
    y_pred = regr_.predict(sub_test)
    
    print(config)
    print(r2_score(y_test, y_pred))
    print("\n")
    

KeyboardInterrupt: 

MLP gridsearch

In [14]:
params_rf = {
    "hidden_layer_sizes":[100,500],
    "solver" : ["sgd"],
    "learning_rate" : ["adaptive"],
    "early_stopping" : [True],
    "learning_rate_init" : [0.001,0.01],
    "max_iter" : [200],
    "verbose" : [True]
}

for config in ParameterGrid(params_rf):
    regr = MLPRegressor( **config)
    
    #IF appraoches A1 or A2
    regr.fit(sub_train, y_train)
    
    #IF appraoche ND
    #regr_.fit(hot_train, y_train)
    
    y_pred = regr.predict(hot_test)

    print(r2_score(y_test, y_pred))
    print("\n")

Iteration 1, loss = 79.71004242
Validation score: 0.501668


ValueError: dimension mismatch

**EVALUATION PART**

Eval preparation with the same preprocessing steps

In [4]:
eval_ = pd.read_csv("./competition_dataset/eval.tsv","\t")

eval_ = eval_.applymap(lambda s:s.lower() if type(s) == str else s)
eval_["region_2"] = eval_["region_2"].fillna("other")
eval_["region_1"] = eval_["region_1"].fillna("other")
eval_["designation"] = eval_["designation"].fillna("other")

d_tr = eval_.designation.values
new = []

for i in range(len(d_tr)):
    
    if (str(d_tr[i]) == "riserva") or (str(d_tr[i]) == "reserva") or (str(d_tr[i]) == "réserve"):
        new.append("reserve")
        
    elif (str(d_tr[i]) == "gran reserva")  :
        new.append("grand reserve")
        
    elif (str(d_tr[i]) == "reserva especial")  :
        new.append("special reserve")
        
    elif (str(d_tr[i]) == "red") or (str(d_tr[i]) == "rosso") or (str(d_tr[i]) == "tinto") :
        new.append("red wine")
        
    elif (str(d_tr[i]) == "rosé of") or (str(d_tr[i]) == "rosado") or (str(d_tr[i]) == "rosato") :
        new.append("rosé")
        
    else: new.append(d_tr[i])
        
eval_["designation"] = new

Train and test set preparation

In [5]:
descr_train = df.description 
descr_test = eval_.description 
y = df.quality

df = df.drop(columns="quality")
X_train = df.drop(columns="description")
X_test = eval_.drop(columns="description")

One hot encoding

In [6]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train.values)

hot_train =  enc.transform(X_train.values)
hot_test = enc.transform(X_test.values)

Description encoding  ( in the following sections are reported the best configurations obtained in the validation)

In [7]:
lemmaTokenizer = LemmaTokenizer()
vectorizer = TfidfVectorizer(tokenizer=lemmaTokenizer,max_df=0.5, min_df=10, stop_words=sw.words('english'),use_idf=False) 
vectorizer.fit(descr_train)

count_train = vectorizer.transform(descr_train)
count_test = vectorizer.transform(descr_test)

sub_train = hstack([hot_train, count_train])
sub_test = hstack([hot_test, count_test])

Best model trained on all the development set with the best parameters obtained in the validation

In [15]:
regr = MLPRegressor(learning_rate='adaptive', hidden_layer_sizes=500,\
                     learning_rate_init= 0.01, max_iter= 120, solver= 'sgd', verbose=True,\
                     validation_fraction=0)
regr.fit(sub_train, y.values)
y_pred = regr.predict(sub_test)


Iteration 1, loss = 41.11691496


In [None]:
# Save results to a csv file
consegna = pd.DataFrame({"Id": range(0,len(y_pred)) , "Predicted": y_pred})
consegna.to_csv("279401.csv",index=False)