In [161]:
import csv
import ast

    
# READ THE CSV FILE         

'''-------------- file_to_lst ---------------------------
IN: the csv filename where the data are stored
    features_INDEX: list of the indexes of the features 
    we want to keep 
RESULT: the data in a list
-------------------------------------------------------'''

def file_to_lst(filename, features):
    file = open(filename, "r")
    reader = csv.reader(file, delimiter=",")
    
    # INDEX DEFINITION 
    name_columns = next(reader)

    INDEX_POINTS = name_columns.index("points") 
    
    features_INDEX = []
    for i, feature in enumerate(features):
        features_INDEX.append(name_columns.index(feature))
        
        
    collected_data = []

    # NOT USED HERE: ID (0), designation (2), title (8)
    
    for r in reader:
        collected_data.append([])
        collected_data[-1].append(ast.literal_eval(r[INDEX_POINTS]))
        for i, index in enumerate(features_INDEX):
            feature_value = ast.literal_eval(r[index])
            try: 
                for j, value in enumerate(feature_value):
                    collected_data[-1].append(value)
            except TypeError:
                collected_data[-1].append(feature_value)
                
            
    file.close()
    return collected_data

######################################
#              CALL
######################################

#filename = "DF_version2Francesco.csv"
filename = "DF_version2_withDesc.csv"
#filename = "DF_version2.csv"

features1 = ["price", "country", "vintage"] #"country", "province", "region_1", "variety", top1_desc, top2_desc, top3_desc
collected_data1 = file_to_lst(filename, features1)

features2 = ["taster_name"]
collected_data2 = file_to_lst(filename, features2)

In [162]:
print(collected_data1[1])
print(len(collected_data1[1]))

[87, 15.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2011]
15


In [157]:
import random 
import numpy as np

# FORMAT THE DATA (from list to np-array)
                               
'''-------------------- format_data -------------------------
IN: collected_data: list of a list reflecting the data (where
the last element is what it has to be predicted)
RESULT: numpy array to be used for training/testing a model
 ----------------------------------------------------------'''


def format_data(collected_data):
    size = len(collected_data[0]) -1
    x = np.zeros((len(collected_data), size), dtype=np.float)
    y = np.zeros((x.shape[0],), dtype=np.float)

    for k, data in enumerate(collected_data):

        x[k] = data[1:]
        y[k] = data[0]

    return x, y                               
                               

    
# SPLIT THE DATASET INTO A TRAINING AND A TESTING SET 
def DS_builder(collected_data, p=0.75):

    # Shuffle the data to reduce the bias
    random.shuffle(collected_data)
    
    data_len = len(collected_data)
    testing_set = []
    learning_set = []
    
    # ------ We go trough all the data ------
    for count, data in enumerate(collected_data):
            if count / data_len < p:
                learning_set.append(data)
            else:
                testing_set.append(data)

    xls, yls = format_data(learning_set)
    xts, yts = format_data(testing_set)
    ds = [xls, yls, xts, yts]
    
    return ds

INDEX_XLS = 0
INDEX_YLS = 1 
INDEX_XTS = 2
INDEX_YTS = 3

######################################
#              CALL
######################################

ds1 = DS_builder(collected_data1)
ds2 = DS_builder(collected_data2)


In [158]:
########################################
# STILL TODO: parameter tuning 
########################################


# DEFINE YOUR MODEL AND TRAIN IT
import sklearn
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR 

algorithm = "LinearRegression"


# ------------ Management of the learning algorithm ----------------------
if algorithm == "LinearRegression":
    estimator1 = LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=None).fit(ds1[INDEX_XLS], ds1[INDEX_YLS])
    estimator2 = LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=None).fit(ds2[INDEX_XLS], ds2[INDEX_YLS])

elif algorithm == "LogisticRegression":
    estimator1 = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None).fit(ds1[INDEX_XLS], ds1[INDEX_YLS])
    estimator2 = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', verbose=0, warm_start=False, n_jobs=None).fit(ds2[INDEX_XLS], ds2[INDEX_YLS])

elif algorithm == "DecisionTreeRegressor":
    estimator1 = DecisionTreeRegressor(max_depth=5).fit(ds1[INDEX_XLS], ds1[INDEX_YLS])
    estimator2 = DecisionTreeRegressor(max_depth=5).fit(ds2[INDEX_XLS], ds2[INDEX_YLS])

elif algorithm == "SVR":
    svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1).fit(ds1[INDEX_XLS], ds1[INDEX_YLS])
    svr_lin = SVR(kernel='linear', C=1e3).fit(ds1[INDEX_XLS], ds1[INDEX_YLS])
    svr_poly = SVR(kernel='poly', C=1e3, degree=2).fit(ds1[INDEX_XLS], ds1[INDEX_YLS])

else:
    pass



In [159]:
###########################################################################################
# STILL TODO: Interpretation of the computed accuracy (and maybe test with other metrics)
###########################################################################################



from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error

with_prob = False

# MAKE PREDICTION 
if with_prob:
    predicted_points1 = estimator1.predict_proba(ds1[INDEX_XTS])
else: 
    predicted_points1 = estimator1.predict(ds1[INDEX_XTS])
    predicted_points2 = estimator2.predict(ds2[INDEX_XTS])

    
# COMPUTE ACCURACY 
print("Test 1 with the features: " + str(features1) + " and the algorithm " + algorithm + "\n")
print("Test 2 with the features: " + str(features2) + " and the algorithm " + algorithm + "\n")

evs1 = explained_variance_score(ds1[INDEX_YTS], predicted_points1)
print("The explained variance score is: " + str(evs1))

evs2 = explained_variance_score(ds2[INDEX_YTS], predicted_points2)
print("The explained variance score is: " + str(evs2) + "\n")

mae1 = mean_absolute_error(ds1[INDEX_YTS], predicted_points1) # The best value is 0 
print("The mean absolute error is: " + str(mae1))

mae2 = mean_absolute_error(ds2[INDEX_YTS], predicted_points2) # The best value is 0 
print("The mean absolute error is: " + str(mae2) + "\n")

mse1 = mean_squared_error(ds1[INDEX_YTS], predicted_points1) # The best value is 0 
print("The mean squared error is: " + str(mae1))

mse2 = mean_squared_error(ds2[INDEX_YTS], predicted_points2) # The best value is 0 
print("The mean squared error is: " + str(mae2))

Test 1 with the features: ['price'] and the algorithm LinearRegression

Test 2 with the features: ['taster_name'] and the algorithm LinearRegression

The explained variance score is: 0.15151736442080888
The explained variance score is: 0.0

The mean absolute error is: 2.27183560095645
The mean absolute error is: 2.536924378287522

The mean squared error is: 2.27183560095645
The mean squared error is: 2.536924378287522


In [160]:
print(predicted_points1)

[89.12211533 87.94471637 88.88182983 ... 89.12211533 88.13694477
 88.28111607]
