In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the wine reviews CSV from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="winemag-data-130k-v2.csv")

In [3]:
# Price is the regression target, so rows without a price cannot be used.
df = df.dropna(subset=['price'])

In [31]:
# List the available columns before choosing which ones to use as features.
df.columns

Index(['Unnamed: 0', 'country', 'description', 'designation', 'points',
       'price', 'province', 'region_1', 'region_2', 'taster_name',
       'taster_twitter_handle', 'title', 'variety', 'winery'],
      dtype='object')

### Task 1.1

##### For Task 1.1 we use only the non-text features, so we drop `description` and `title`, which are both free text. We also drop the first column (`Unnamed: 0`) and `taster_twitter_handle` because they are unrelated to the price.

In [71]:
 # Non-text feature matrix (text columns, the unnamed first column, the Twitter
 # handle and the target itself removed) and the price target.
 X_nt = df.drop(columns=["Unnamed: 0", "description", "taster_twitter_handle", "title", "price"])
 Y = df["price"]

In [72]:
# Reproducible train/test split of the non-text features (fixed seed,
# split size left at the train_test_split default).
X_train, X_test, y_train, y_test = train_test_split(
    X_nt,
    Y,
    random_state=1,
)

In [51]:
# Pipeline for the remaining (non-object) columns: fill missing values with
# the column median, then standardize.
cont_preprocessing = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

# Pipeline for object-dtype columns: fill missing values with the most
# frequent value, then one-hot encode (unknown categories are ignored rather
# than raising, per handle_unknown='ignore').
cat_preprocessing = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Object columns go through the categorical pipeline; every other column
# falls through to the continuous pipeline via `remainder`.
preprocess = make_column_transformer(
    (cat_preprocessing, make_column_selector(dtype_include='object')),
    remainder=cont_preprocessing,
)

In [73]:
# Ridge regression on top of the preprocessing step, evaluated with 5-fold
# cross-validation on the training split; the cell displays the mean fold score.
pipe_lr = make_pipeline(preprocess, Ridge())
scores = cross_val_score(estimator=pipe_lr, X=X_train, y=y_train, cv=5)
np.mean(scores)

0.5847156743918182

### Task 1.2

In [4]:
# Task 1.2 uses only the review text as the feature; the target is still price.
X_t = df["description"]
Y = df["price"]

In [5]:
# Preview the description text that serves as the only feature for this task.
X_t

1         This is ripe and fruity, a wine that is smooth...
2         Tart and snappy, the flavors of lime flesh and...
3         Pineapple rind, lemon pith and orange blossom ...
4         Much like the regular bottling from 2012, this...
5         Blackberry and raspberry aromas show a typical...
                                ...                        
129966    Notes of honeysuckle and cantaloupe sweeten th...
129967    Citation is given as much as a decade of bottl...
129968    Well-drained gravel soil gives this wine its c...
129969    A dry style of Pinot Gris, this is crisp with ...
129970    Big, rich and off-dry, this is powered by inte...
Name: description, Length: 120975, dtype: object

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_t, Y, random_state=1)

In [8]:
vect = CountVectorizer()
X_train = vect.fit_transform(X_train)
X_val = vect.transform(X_test)
X_train

<90731x27411 sparse matrix of type '<class 'numpy.int64'>'
	with 3143803 stored elements in Compressed Sparse Row format>

In [9]:
# Fit a Ridge model with default settings on the vectorized training descriptions.
ridge = Ridge().fit(X=X_train, y=y_train)

In [14]:
# NOTE: this scores the model on the same data it was fit on, so the value is
# a training R^2, not an estimate of held-out performance (X_val / y_test are
# not used here).
ridge.score(X_train, y_train)

0.5484309542416049