# Week 9 Extra Credit: Basic IR System

In [1]:
import io
import random
import string
import re
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

## Sonoma Wine Reccomender

Using a subset of the wines in the Wine Spectator Kaggle dataset, create a reccomendation system based on the closet match of a user inputted description.

 - The user can enter either flavors desired *or* enter in food that they would like to match the wine with
 - The process will continue until the user enters `QUIT` into the input.
 - The output will be the description from Wine Spectator, as well as the varietal, price, and number of points.

https://www.kaggle.com/zynicide/wine-reviews

In [2]:
wine_df = pd.read_csv('wine.csv')

In [3]:
wine_df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,"Malbec, Merlot and Petit Verdot are included i...",Kate's + B's,90,75.0,California,Alexander Valley,Sonoma,Cabernet Sauvignon,Medlock Ames
1,US,This straightforward wine shows an exuberance ...,,90,35.0,California,Alexander Valley,Sonoma,Cabernet Sauvignon,The Calling
2,US,"Smoke, brawny black pepper and black cherry fl...",Osborn Ranch,86,38.0,California,Alexander Valley,Sonoma,Primitivo,DeLorimier
3,US,"Complemented with 11% Cabernet Franc, this vin...",Oliver's Vineyard,93,85.0,California,Alexander Valley,Sonoma,Cabernet Sauvignon,Aperture
4,US,A collaboration between Sonoma County-based wi...,A Ken Forrester Wine Farrow Ranch,93,100.0,California,Alexander Valley,Sonoma,Cabernet Sauvignon,The Bridge


In [4]:
wine = list(wine_df['description'].values)
wine[0]

"Malbec, Merlot and Petit Verdot are included in this wine that's black-purple in color, with a thick density to its dusty black fruit. Cedar, tar and tobacco weave in between blackberry and currant, finishing in a powerful grip of tannin."

In [5]:
wn_lemmer = WordNetLemmatizer()

#  custom tokenizer
def custom_tokenizer(str_input):
   
    # remove special characters and stem
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    
    # lemmatizer
    words = [wn_lemmer.lemmatize(word) for word in words] 
    
    return words

In [6]:
TfidfVec = TfidfVectorizer(stop_words='english', tokenizer=custom_tokenizer)
tfidf = TfidfVec.fit_transform(wine)


# display as a dataframe
df = pd.DataFrame(tfidf.toarray(), columns=TfidfVec.get_feature_names())
df

Unnamed: 0,-,-ness,-type,0,000,000-feet,01,02,03,04,...,zinfandoodle,zingier,zinginess,zingy,zinny,zins,zio,zip,zippy,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Enter a Value and Return a Wine Reccomendation

In [7]:
def wine_reccomendation(new_wine):

    # step 1
    wine.append(new_wine)  # add our query to our corpus

    # step 2
    # need to re-vectorize given query addition
    TfidfVec = TfidfVectorizer(stop_words='english', tokenizer=custom_tokenizer)
    tfidf = TfidfVec.fit_transform(wine)

    # compare the similarity of the query to the existing corpus    
    vals = cosine_similarity(tfidf[-1], tfidf)

    idx = vals.argsort()[0][-2]  # get the index of the highest value (note: -2 is the last doc, since -1 is the query

    flat = vals.flatten()        # flatten the array
    flat.sort()

    req_tfidf = flat[-2]        # pull out the similarity value

    # show the doc that is most similar to our query
    if(req_tfidf==0):
        print("We didn't find any matches, please try again.")
    else:
        print(f"Your requirements: {new_wine}")
        print('')
        print(f"Similarity Score: {req_tfidf}")
        print('')
        print(f"Tasting Notes: {wine_df.loc[idx].description}")
        print('')
        print(f"Winery: {wine_df.loc[idx].winery}")
        print(f"Varietal: {wine_df.loc[idx].variety}")
        print(f"Points: {wine_df.loc[idx].points}")
        print(f"Retail Price: {wine_df.loc[idx].price}")
        print('--------------------------------------')
        wine.remove(new_wine) # need to remove the query

In [8]:
while True:
    # Get the Genre from the user or abort
    w = input('Enter flavor preferences or food types. Enter "QUIT" to exit: ',)
    print('')
    if w.lower() == "quit":
        # If the user enters quit, then exit the while loop completely.
        break
    else:

        wine_reccomendation(w)

Enter flavor preferences or food types. Enter "QUIT" to exit:  oak butter crisp



Your requirements: oak butter crisp

Similarity Score: 0.3657575439130573

Tasting Notes: This is a simple Chardonnay: It tastes sweet, soft and creamy, with butter, caramel, orange and mango flavors.

Winery: Olema
Varietal: Chardonnay
Points: 82
Retail Price: 15.0
--------------------------------------


Enter flavor preferences or food types. Enter "QUIT" to exit:  vanilla almond rich



Your requirements: vanilla almond rich

Similarity Score: 0.36723970630646563

Tasting Notes: This is an elegant white wine, dry, crisp and sophisticated. Shows flashy flavors of citrus fruits, roasted almond and vanilla. Try as an alternative to Sauvignon Blanc.

Winery: MacMurray Ranch
Varietal: Pinot Gris
Points: 87
Retail Price: 20.0
--------------------------------------


Enter flavor preferences or food types. Enter "QUIT" to exit:  cheery chocolate coffee dark oak



Your requirements: cheery chocolate coffee dark oak

Similarity Score: 0.22867209976284258

Tasting Notes: Here's a good bottle of Zinfandel. It's dry and full bodied, with rich, spicy flavors of berries, fruit liqueur, roasted coffee, dark chocolate and pepper. A little rustic around the edges, but a fine Zin for drinking now.

Winery: Kokomo
Varietal: Zinfandel
Points: 87
Retail Price: 22.0
--------------------------------------


Enter flavor preferences or food types. Enter "QUIT" to exit:  steak



Your requirements: steak

Similarity Score: 0.326602128366977

Tasting Notes: A little sweet and candy simple, this Zin shows blackberry, cherry and raspberry flavors that have a pie-filling taste. The firm tannins call for a grilled steak.

Winery: Hayman & Hill
Varietal: Zinfandel
Points: 84
Retail Price: 16.0
--------------------------------------


Enter flavor preferences or food types. Enter "QUIT" to exit:  fish tacos



Your requirements: fish tacos

Similarity Score: 0.4789383761486915

Tasting Notes: This is a good everyday Sauvignon Blanc for drinking with chicken, ham sandwiches or fish tacos. Slightly sweet in orange and vanilla, it's accompanied by crisp acidity.

Winery: Kenwood
Varietal: Sauvignon Blanc
Points: 84
Retail Price: 12.0
--------------------------------------


Enter flavor preferences or food types. Enter "QUIT" to exit:  quit





# Notes

## Basic Text preprocessing steps
removing noise: anything that isn’t a standard number or letter
removing stop words: very common words that add little value in analysis are removed from the vocabulary.
stemming: reducing inflected (or derived) words to their stem, base or root form 
lemmatization: similar to stemming, however stemming can often create non-words, whereas lemmas are actual words

## Bag of Words (BoW) Model
After preprocessing, text needs to be transformed into a meaningful number vectors for use in ML algorithms. The BoW model represents text as a matrix of word counts within a document. It's called a “bag of words" because information about the order or structure of words is discarded. The model only cares whether the known words occur in the document, but not where they occur. Intuitively, documents are similar if they have similar content.
It involves:
 - a vocabulary of known words
 - a measure of the presence of known words

For example, given a dictionary containing {Learning, is, the, not, great}, to vectorize the text “Learning is great”.
Its vector representation would be : $(1, 1, 0, 0, 1)$, where the numbers represent their word counts.

## TF-IDF
With BoW, highly frequent words start to dominate the document, but such words may not contain much informational content. It also gives more weight to longer documents than shorter documents.  

One approach is to rescale the frequency of words by how often they appear in all documents. The scores for frequent words that are also frequent across all documents are penalized. This scoring is called Term Frequency-Inverse Document Frequency, where
 - Term Frequency: a scoring of the frequency of the word in the current document
    - TF = (Number of times term t appears in a document)/(Number of terms in the document)

 - Inverse Document Frequency: a scoring of how rare a word is across documents.
    - IDF = $1+log(N/n)$, where, $N$ is the number of documents and n is the number of documents a term $t$ has appeared in.

## Cosine Similarity
A measure of similarity between two non-zero vectors of an inner product space</ul>
 - Tf-idf weight is a weight often used in information retrieval (IR) and text mining.
 - It is a statistical measure used to evaluate how important a word is to a document in a collection or corpus
     - Cosine Similarity $(d1, d2)= Dot product (d1, d2) / ||d1|| * ||d2||$ where $d1,d2$ are two non zero vectors.
     
**Reference:**  
 1. https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
 1. https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
 1. https://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
 1. http://jonathansoma.com/lede/foundations/classes/text%20processing/tf-idf/