## Applied Machine Learning HW 4

#### Hojin Lee (hl3328) & Hyuk Joon Kwon (hk3084)

In [1]:
import time
import pandas as pd
import numpy as np
from tabulate import tabulate
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, GridSearchCV
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

import category_encoders as ce

## Task2 

Use a pretrained word-embedding (word2vec, glove or fasttext) for featurization instead of the bag-of-words model. Does this improve classification? How about combining the embedded words with the BoW model?

In [2]:
import spacy
import nltk
import re
from nltk.corpus import stopwords

In [5]:
#reading csv file
wine_df_raw = pd.read_csv(r'wine-reviews/winemag-data-130k-v2.csv')
df_us = wine_df_raw[wine_df_raw['country'] == 'US']

### Load from spacy en_core_web_lg

In [6]:
nlp = spacy.load('en_core_web_lg')

### Extract the description column from the data frame.

In [7]:
text_trainval, y_trainval = df_us['description'], df_us['points']

### Change all the strings in the document to lower case

In [8]:
text_trainval = text_trainval.str.lower()

### Remove every character that is not either a lowercase alphabet or a space. 

In [9]:
text_trainval = text_trainval.apply(lambda x : re.sub("[^a-z\s]","",x) )

### Create a text vector of dimention 300 for every datapoint in the dataframe. 

In [10]:
stopwords = set(stopwords.words("english"))
text_trainval = text_trainval.apply(lambda x : " ".join(word for word in x.split() if word not in stopwords ))

### (numberOfDataPoints, 300)

In [11]:
document = nlp.pipe(text_trainval)
text_vector = np.array([text.vector for text in document])

### Cross validation with using just the description column

In [12]:
text_train, text_val, y_train, y_val = train_test_split(text_vector, y_trainval)
np.mean(cross_val_score(Ridge(), text_train, y_train, cv=5))

0.52976692700485

### Concatinate the text vector with the original dataframe on Task1

In [14]:
drop_cols = ['Unnamed: 0','country','taster_twitter_handle', 'province', 'region_2', 'title']

df_drop = df_us.drop(drop_cols, axis=1)

X = df_drop.loc[:, df_drop.columns != 'points']
y = df_drop['points']

In [15]:
X_ = X.reset_index(drop=True)
X_ = pd.concat([X_, pd.DataFrame(text_vector)], axis=1)


### Target encode the designtaion and winery columns. 

In [16]:
te = ce.TargetEncoder(cols=['designation','winery',]).fit(X_, y_trainval)
X_ = te.transform(X_)

In [17]:
text_train, text_val, y_train, y_val = train_test_split(X_, y_trainval)

In [25]:
category = text_train.dtypes == object

cat_preprocessing = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='NaN'),
    OneHotEncoder(handle_unknown='ignore'))

cont_preprocessing = make_pipeline(
    SimpleImputer())

cont_preprocessing_scale = make_pipeline(
    SimpleImputer(),
    StandardScaler())

target_encoder = make_pipeline(
    ce.TargetEncoder()
    , StandardScaler())

te_feature = ['designation','winery']
cont_feature = list(text_train.select_dtypes(exclude=['object']).columns)
cat_feature = list(text_train.select_dtypes(include=['object']).columns)

preprocess = make_column_transformer(
    (cont_preprocessing, cont_feature)
    , (cat_preprocessing, cat_feature)
    , remainder ='passthrough')

preprocess_scale = make_column_transformer(
    (cont_preprocessing_scale, cont_feature)
    , (cat_preprocessing, cat_feature)
    , remainder ='passthrough')

cat_feature = list(set(list(text_train.select_dtypes(include=['object']).columns)) - set(te_feature))
preprocess_scale_te = make_column_transformer(
    (target_encoder, te_feature)
    , (cont_preprocessing_scale, cont_feature)
    , (cat_preprocessing, cat_feature)
    , remainder ='passthrough')

text_preprocessing = make_pipeline(
    CountVectorizer(max_features = 5000, min_df=1, ngram_range=(1,1), stop_words='english')
    )

def pipeline_prediction(X, y, preprocess, regression):
    OLR_pipe = make_pipeline(preprocess, regression)
    scores_olr = cross_val_score(OLR_pipe, X, y, cv=5)
    return np.mean(scores_olr)

In [26]:
text_feature = ['description']
cont_feature = ['price', 'designation','winery']
cat_feature = ['region_1','taster_name', 'variety']

preprocess_text = make_column_transformer(
    (cont_preprocessing_scale, cont_feature)
   , (cat_preprocessing, cat_feature)
   , (text_preprocessing, 'description')
   , remainder ='passthrough')

In [27]:
results_dict = {}
methods = [LinearRegression(), Ridge(alpha=10)]
processors = [preprocess_text]

method_name = ['Linear_regression', 'Ridge']
processors_name = ['preprocess_text']

processor_counter = 0

for processor in processors:
    method_counter = 0
    results_dict[processors_name[processor_counter]] = {}
    
    for method in methods:
        
        results_dict[processors_name[processor_counter]][method_name[method_counter]] = pipeline_prediction(text_train, y_train, processor, method)
        method_counter += 1
        
    processor_counter += 1
        
results_df = pd.DataFrame.from_dict(results_dict)
print(tabulate(results_df, headers='keys', tablefmt='psql'))



+-------------------+-------------------+
|                   |   preprocess_text |
|-------------------+-------------------|
| Linear_regression |          0.741601 |
| Ridge             |          0.762939 |
+-------------------+-------------------+
