# Hyperparameter Tuning

In [None]:
import numpy as np
import pandas as pd
import re
import string
import warnings
from timeit import timeit
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics 
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# NLTK Imports and Downloads
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
df = pd.read_csv('yelp.csv')

## Data Prep

In [None]:
df.head()

In [None]:
df.shape

### Null Values

Null values are generally not desirable in a dataset.  When only a small number of observations (rows) contain nulls, those rows can simply be dropped; in other cases, the missing values can be filled in with other values.

In [None]:
# Drop NULL values
df.dropna(inplace=True)

In [None]:
# Drop unwanted columns
df.drop(columns=['Unnamed: 0', 'longitude', 'latitude'], inplace=True)

In [None]:
df['date'] =  pd.to_datetime(df['date'])

In [None]:
# Add a derived 'text_len' column holding the character count of each review.
# The vectorised .str.len() replaces a row-by-row DataFrame.apply(axis=1),
# which builds a Series per row and is far slower on large frames.
df['text_len'] = df['text'].str.len()

In [None]:
# Binary label: 0 (negative) when the review's star rating is below 4,
# otherwise 1 (positive).
df['target'] = np.where(df['stars_y'] < 4, 0, 1)

### Duplicates

A common practice is to review any duplicates.  If there are large quantities, they can skew the results.

In [None]:
# Count rows, drop exact duplicate rows in place, then count again so the
# number of removed rows can be reported.
len_before = len(df)
df.drop_duplicates(inplace=True)  # drop duplicates
len_after = len(df)

print("Before =", len_before)
print("After =", len_after)
print('')
print("Total Removed =", len_before - len_after)

In [None]:
df.info()

In [None]:
# Remove every row whose state is 'KS', printing the row count before and
# after.  The copy() detaches the result from the original frame.
print(len(df))
keep_rows = df['state'] != 'KS'
df = df.loc[keep_rows].copy()
print(len(df))

## EDA

In [None]:
df.describe()

In [None]:
# Pearson correlation is only defined for numeric columns.  Select them
# explicitly: on pandas >= 2.0 DataFrame.corr() no longer drops non-numeric
# columns silently and raises on the text columns in this frame.
corr = df.select_dtypes(include='number').corr()

f, ax = plt.subplots(figsize=(12, 8))

sns.heatmap(corr, cmap="Blues", annot=True, square=False, ax=ax, linewidth=1)
plt.title('Pearson Correlation of Features')
plt.yticks(rotation=45);

**Observations:**  
There are a few variables that are correlated to each other.  
 - `target` was created from `stars_y` and therefore shows a high positive correlation
 - `cool`, `useful` and `funny` are slightly correlated to each other.  This probably means that users who vote on one of these items tend to vote on the others as well.
 - `stars_y` and `stars_x` also show some correlation.  This makes sense because `stars_x` is the mean of all `stars_y` ratings.

## Text Cleaning

For **Parts** of our analysis, the text needs to have some basic transformation for our models to work properly.  These are as follows:

1. **Lower**: Convert all characters to lowercase
1. **Remove Punctuation**: In most cases, punctuation doesn't help NLP and ML models and can be removed.
1. **Stop Word Removal**: Stop words generally don't add context to analysis (unless the text is very short, e.g. `100` - `200` characters) and can be removed.
1. **Stemming / Lemmatization**: Words are reduced to their stem or *lemma* (root form).  This shrinks the vocabulary and usually improves the analysis, since inflected forms such as `swimming` and `swims` are reduced to `swim`.  The cleaning step below applies Porter stemming.

**Note**: The original text will be preserved for other analysis.

In [None]:
# Show the first remaining review.  Positional .iloc[0] is used because index
# label 0 may have been removed by the dropna / drop_duplicates / state-filter
# steps, in which case the label lookup df['text'][0] raises KeyError.
df['text'].iloc[0]

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _stopword_set():
    """Return the NLTK English stop words as a frozenset, loaded once.

    Every stop word is included both verbatim and with ASCII punctuation
    stripped.  ``clean_string`` removes punctuation *before* filtering, so a
    contraction such as "don't" arrives as "dont" and would otherwise never
    match the stop list.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    words = nltk.corpus.stopwords.words("english")
    return frozenset(words) | frozenset(w.translate(strip_punct) for w in words)


def clean_string(text, stem="None"):
    """Normalise a piece of text for vectorisation.

    Steps, in order: lowercase, strip ASCII punctuation, split on whitespace,
    drop English stop words, remove tokens containing digits, then optionally
    stem or lemmatize.

    Args:
        text (str): Raw text to clean.
        stem (str): 'Stem' applies ``PorterStemmer``; 'Lem' applies
            ``WordNetLemmatizer``; any other value (default ``"None"``)
            leaves the tokens unchanged.

    Returns:
        str: The surviving tokens joined by single spaces, with no leading
        or trailing whitespace; an empty string if nothing survives.
    """
    # Make lower and strip punctuation in one C-level pass
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove stop words (set membership is O(1) per token)
    stop_words = _stopword_set()
    tokens = [word for word in text.split() if word not in stop_words]

    # Remove numbers: blank out the digit-bearing part of each token, then
    # discard tokens that became empty so they don't leave stray spaces.
    tokens = [re.sub(r'\w*\d\w*', '', word) for word in tokens]
    tokens = [word for word in tokens if word]

    # Stem or Lemmatize
    if stem == 'Stem':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
    elif stem == 'Lem':
        lem = WordNetLemmatizer()
        tokens = [lem.lemmatize(word) for word in tokens]

    return " ".join(tokens)

In [None]:
# Clean every review with Porter stemming; Series.apply forwards the extra
# keyword argument straight to clean_string.
df['text_clean'] = df['text'].apply(clean_string, stem='Stem')

In [None]:
# Show the first cleaned review.  Positional .iloc[0] avoids a KeyError when
# index label 0 was removed by the earlier row-dropping steps.
df['text_clean'].iloc[0]

# FEATURE SELECTION/ MODELING 

>*In machine learning and statistics, feature selection, also known as variable selection, attribute selection or variable subset selection, is the process of selecting a subset of relevant features (variables, predictors) for use in model construction. Feature selection techniques are used for several reasons[11]:*

>- *simplification of models to make them easier to interpret by researchers/users,*
>- *shorter training times,*
>- *to avoid the curse of dimensionality,*
>- *enhanced generalization by reducing overfitting*

In [None]:
# Candidate feature columns for the importance ranking below.  Despite the
# name, these are the model inputs, not labels; the label is df['target'].
targets = ['categories', 'city', 'state', 'postal_code', 'is_open', 'text_len', 'useful', 'cool', 'funny', 'review_count']

In [None]:
catFeat = df[targets].copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes.  A single encoder is
# re-fitted per column in a fixed order, so `le` ends up fitted on
# 'postal_code' once the loop finishes.
le = LabelEncoder()

for col in ('categories', 'city', 'state', 'postal_code'):
    catFeat[col] = le.fit_transform(catFeat[col].astype(str))

catFeat.head()

In [None]:
# Split the data into 30% test and 70% training
X_train, X_test, y_train, y_test = train_test_split(
    catFeat, df['target'], test_size=0.3, random_state=0)

# Fit a class-weighted random forest purely to rank the features
clf = RandomForestClassifier(random_state=42, n_jobs=6, class_weight='balanced')
clf.fit(X_train, y_train)

# Keep the (name, gini importance) pairs whose importance exceeds 0.1
selected = [pair for pair in zip(targets, clf.feature_importances_) if pair[1] > .1]

for feature in selected:
    print(feature)

# Combined importance of the features printed above
total_importance = sum(pair[1] for pair in selected)

print('\nCumulative Importance of Selected Features: ', total_importance)

## Model Selection

Many algorithms support binary classification.  We will use two and compare the results to select the best model.

 - **Random Forest Classifier**:  This classifier tends to be very robust.  It was used in the feature selection model above and will be tested against the features it selected.  Due to the nature of running many decision trees, it can take a while to compute larger datasets.
 > *A Random Forest classifier is an ensemble learning method for classification, regression, and other tasks that operates by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or mean/average prediction (regression) of the individual trees [4].*
 - **Logistic Regression**: The logistic model is a fast and robust model that tends to run fairly quickly on all types of datasets.
 > *The logistic model (or logit model) is used to model the probability of a certain class or event existing, such as pass/fail, win/lose, alive/dead, or healthy/sick. This can be extended to model several classes of events, such as determining whether an image contains a cat, dog, or lion. Each object being detected in the image would be assigned a probability between 0 and 1, with a sum of one [5].*

To create our model, we will be mixing both text and numeric values.  There are multiple ways to accomplish this, but we will be using a `ColumnTransformer` in a Pipeline.

**Imbalanced Data**  
The number of negative reviews is far less than the number of positive reviews. This is known as Imbalanced Data.  When you have imbalanced data, the model will tend to be biased toward the class with more observations (positive).  To correct this, we can run a process known as SMOTE.  This process uses a nearest-neighbor approach for generating new minority class samples.  The method is applied only to the training data, and the model is then tested on the original, untouched test partition.  The method chosen here is to first oversample the minority class with SMOTE to balance the classes, and then to apply random undersampling to reduce the size.  This helps bring balance without bloating the dataset.

In [None]:
# To speed this up, let's just take a random subset of the data
# df = df.sample(n=15000)

In [None]:
X = df[['categories', 'postal_code', 'text_len', 'review_count', 'text_clean']]
y = df['target']

In [None]:
print(X.shape)
print(y.shape)

In [None]:
def create_pipe(clf, ngrams=(1,1)):
    """Wrap a classifier in the shared preprocessing and resampling pipeline.

    Args:
        clf: Estimator placed in the final 'clf' step.
        ngrams (tuple): ``ngram_range`` for the TF-IDF vectoriser applied to
            the 'text_clean' column.

    Returns:
        Pipeline: steps 'prep' (ColumnTransformer), 'over' (SMOTE),
        'under' (RandomUnderSampler) and 'clf', in that order.
    """
    # One (name, transformer, columns) entry per input group; columns not
    # listed here are dropped.
    review_text = ('Text',
                   TfidfVectorizer(stop_words='english', ngram_range=ngrams),
                   'text_clean')
    category_text = ('Categories', TfidfVectorizer(), 'categories')
    postal_code = ('OHE',
                   OneHotEncoder(dtype='int', handle_unknown='ignore'),
                   ['postal_code'])
    numeric = ('Numbers', MinMaxScaler(), ['review_count', 'text_len'])

    column_trans = ColumnTransformer(
        transformers=[review_text, category_text, postal_code, numeric],
        remainder='drop')

    # NOTE(review): both samplers use their default sampling strategy; if
    # SMOTE already balances the classes fully, the under-sampling step may
    # have nothing left to remove -- confirm this is intended.
    steps = [('prep', column_trans),
             ('over', SMOTE(random_state=42)),
             ('under', RandomUnderSampler(random_state=42)),
             ('clf', clf)]

    return Pipeline(steps=steps)

In [None]:
# Candidate classifiers, keyed by the label printed with their scores
models = {
    'RandForest': RandomForestClassifier(random_state=42, n_estimators=50),
    'LogReg': LogisticRegression(random_state=42, max_iter=1000),
}

# Score each candidate with 3-fold cross-validated macro F1 over X and y
for name, clf in models.items():
    pipeline = create_pipe(clf)
    scores = cross_val_score(pipeline, X, y, scoring='f1_macro', cv=3,
                             n_jobs=1, error_score='raise')
    mean_f1, std_f1 = np.mean(scores), np.std(scores)
    print(name, ': Mean f1 Macro: %.3f and Standard Deviation: (%.3f)' % (mean_f1, std_f1))

## Hyper Parameter Tuning

> *In machine learning, a hyperparameter is a parameter whose value is used to control the learning process. By contrast, the values of other parameters (typically node weights) are derived via training.*

<br>

> *Hyperparameters can be classified as model hyperparameters, that cannot be inferred while fitting the machine to the training set because they refer to the model selection task, or algorithm hyperparameters, that in principle have no influence on the performance of the model but affect the speed and quality of the learning process. An example of a model hyperparameter is the topology and size of a neural network. Examples of algorithm hyperparameters are learning rate and mini-batch size. [9]*

In [None]:
# Make training and test sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=53)

In [None]:
print(y_train.shape)
print(X_train.shape)

In [None]:
# Search grid for the logistic-regression pipeline.  Keys follow the
# '<step>__<param>' convention: 'clf__*' addresses the 'clf' step, and
# 'prep__Text__ngram_range' addresses the 'Text' TfidfVectorizer inside the
# 'prep' ColumnTransformer (see create_pipe).
# 4 solvers x 4 C values x 3 n-gram ranges = 48 candidate combinations.
parameters = [{'clf__solver' : ['newton-cg', 'lbfgs', 'sag', 'liblinear'],
               'clf__C' : [.1, 1, 10, 100],
               'prep__Text__ngram_range': [(1, 1), (2, 2), (1, 2)]}]

In [None]:
clf = LogisticRegression(random_state=42, max_iter=500)
pipeline = create_pipe(clf)

In [None]:
pipeline.get_params()

In [None]:
# grid = GridSearchCV(pipeline, 
#                     parameters, 
#                     scoring='f1_macro', 
#                     cv=3).fit(X_train, y_train)

# print("Best cross-validation accuracy: {:.3f}".format(grid.best_score_))
# print("Test set score: {:.3f}".format(grid.score(X_test, y_test))) 
# print("Best parameters: {}".format(grid.best_params_))

# log_C = grid.best_params_['clf__C']
# log_solver = grid.best_params_['clf__solver']
# log_ngram = grid.best_params_['prep__Text__ngram_range']

In [None]:
log_C = 100
log_solver = 'newton-cg'
log_ngram = (1, 2)

45m 51s

```
Best cross-validation accuracy: 0.867
Test set score: 0.872
Best parameters: {'clf__C': 100, 'clf__solver': 'newton-cg', 'prep__Text__ngram_range': (1, 2)}
```

In [None]:
# grid = HalvingGridSearchCV(pipeline, 
#                            parameters, 
#                            scoring='f1_macro', 
#                            cv=3).fit(X_train, y_train)


# print("Best cross-validation accuracy: {:.3f}".format(grid.best_score_))
# print("Test set score: {:.3f}".format(grid.score(X_test, y_test))) 
# print("Best parameters: {}".format(grid.best_params_))

# log_C_b = grid.best_params_['clf__C']
# log_solver_b = grid.best_params_['clf__solver']
# log_ngram_b = grid.best_params_['prep__Text__ngram_range']

In [None]:
log_C_b = 100
log_solver_b = 'sag'
log_ngram_b = (1, 2)

7m 43s

```
Best cross-validation accuracy: 0.867
Test set score: 0.872
Best parameters: {'clf__C': 100, 'clf__solver': 'sag', 'prep__Text__ngram_range': (1, 2)}
```

**Notes:**
1. **C:** 
>*Regularization is applying a penalty to increasing the magnitude of parameter values in order to reduce overfitting. When you train a model such as a logistic regression model, you are choosing parameters that give you the best fit to the data. This means minimizing the error between what the model predicts for your dependent variable given your data compared to what your dependent variable actually is. [6]*
1. **Solver:** 
>*LIBLINEAR is a simple package for solving large-scale regularized linear
classification, regression and outlier detection.*
1. **N-Grams:** 
>*A bigram or digram is a sequence of two adjacent elements from a string of tokens, which are typically letters, syllables, or words. A bigram is an n-gram for n=2. The frequency distribution of every bigram in a string is commonly used for simple statistical analysis of text in many applications, including in computational linguistics, cryptography, and speech recognition*

# PERFORMANCE ASSESSMENT

Now that we have a model selected based on the cross-validation above, we can optimize the `Hyper Parameters` associated with the algorithm.  This allows for optimal results, potentially over and above the default settings.  

## Model Evaluation

In [None]:
def fit_and_print(pipeline, name):
    """Fit a pipeline on the training split and report test-set results.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.  Prints a classification report, draws the confusion matrix
    titled ``name``, saves it as ``<name>.png`` and displays it.

    Args:
        pipeline: Unfitted estimator/pipeline exposing ``fit`` and ``predict``.
        name (str): Plot title and base name of the saved PNG file.

    Returns:
        float: Macro-averaged F1 score on the test split.
    """
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    score = metrics.f1_score(y_test, y_pred, average='macro')

    print(metrics.classification_report(y_test, y_pred, digits=3))

    ConfusionMatrixDisplay.from_predictions(y_test,
                                            y_pred,
                                            cmap=plt.cm.Blues)

    plt.title(name)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    # Lay out once, after the title and labels exist, so nothing is clipped
    # in the saved image.
    plt.tight_layout()
    plt.savefig(name + '.png', dpi=300)
    # Actually call show(); the bare attribute access `plt.show` did nothing.
    plt.show()

    # The score used to be computed and then discarded; hand it back so runs
    # can be compared programmatically.
    return score

In [None]:
clf = LogisticRegression(random_state=42, max_iter=500)
pipeline = create_pipe(clf)
fit_and_print(pipeline, 'hyper_defaults')

In [None]:
clf = LogisticRegression(C=log_C, solver=log_solver, random_state=42, max_iter=500)
pipeline = create_pipe(clf, log_ngram)
fit_and_print(pipeline, 'hyper_grid')

In [None]:
clf = LogisticRegression(C=log_C_b, solver=log_solver_b, random_state=42, max_iter=500)
pipeline = create_pipe(clf, log_ngram_b)
fit_and_print(pipeline, 'hyper_halving')

https://towardsdatascience.com/hyperparameter-tuning-for-machine-learning-models-1b80d783b946