Goal: predict wine rating from description, region, etc.
=======================================================

I. Questions to answer:
-----------------------
1. What worlds are the most common in description?
2. What worlds are the most common in descriptions of high rating wines?
3. What words are the most common in descriptions of low rating wines?
4. What words can be found only in one group (high rating / low rating)?
5. What words are the most common for each of the grapes?
6. Is there correlation between price and rating?

In [1]:
# Import libs
import pandas as pd
import plotly as py

In [2]:
# Load wines dataset
wines_df = pd.read_csv('winemag-data-130k-v2.csv')

## Transform descriptions to bags of words

In [3]:
WINE_STOP_WORDS = [
    'wine',
    'wines',
    'drink',
    'drinks',
    'flavor',
    'flavors',
    'aroma',
    'aromas',
    'note',
    'notes',
    'good',
    'well',
    'year',
    'years',
    'make',
    'made',
    'nose',
    'give',
    'gives',
    'gived',
    'one',
    'feel',
    'feels',
    'also',
    'taste',
    'testes',
    'seem',
    'seems',
    'last',
    'lasts',
    'yet', 
    'finish',
    'texture',
    'like',
    'time',
    'almost',
    'mouth',
    
    # words that can be found in all ratings
    # interesting that majority of them are some characteristics
    # of the wine. apparently it's such a broad description, so it doesn't
    # affect overall quality
    # All of them are seen in more than 500 wines description
    # around 0.4%
    'still', 
    'fine', 
    'toast', 
    'noir', 
    'cherry', 
    'texture', 
    'sugar', 
    'wood', 
    'black', 
    'rich', 
    'brown', 
    'coffee', 
    'blackberries', 
    'citrus', 
    'big', 
    'showing', 
    'overall', 
    'crushed', 
    'dark', 
    'crisp', 
    'impressive', 
    'tobacco', 
    'leather', 
    'freshness', 
    'blackberry',
    'core', 
    'pinot', 
    'intensity', 
    'currant', 
    'dried', 
    'best', 
    'even', 
    'minerality', 
    'new', 
    'red', 
    'shows', 
    'franc', 
    'full', 
    'grapes', 
    '100', 
    'come', 
    'dry', 
    'many', 
    'keep', 
    'cinnamon', 
    'spices', 
    'first', 
    'already', 
    'color', 
    'sweet', 
    'powerful', 
    'least', 
    'balanced', 
    'acidity', 
    'licorice', 
    'long', 
    'white', 
    'syrup', 
    'opens', 
    'sauvignon', 
    'chocolate', 
    'perfumed', 
    'delivers', 
    'hold', 
    'spice', 
    'chardonnay', 
    'vanilla', 
    'fruits', 
    'balance', 
    'tannins', 
    'palate', 
    'ripe', 
    'almost', 
    'high', 
    'sense', 
    'concentration', 
    'huge', 
    'end', 
    'variety', 
    'vintage', 
    'oak', 
    'merlot',
    'pure', 
    'berry', 
    'finish', 
    'fruit', 
    'mouth', 
    'cabernet', 
    'structure', 
    'bodied', 
    'age', 
    'green', 
    'bottle', 
    'blend'   
]

def _tokenise(sentense):
    from nltk.tokenize import word_tokenize
    try:
        return word_tokenize(sentense)
    except UnicodeDecodeError: 
        return word_tokenize(sentense.decode('utf-8'))

def _clean_tokens(tokens):
    """
    Transform to lowcase
    Remove punctuations from words
    Filter out words with length less than 2 symbols
    """
    from itertools import chain
    
    def _clean_token(token):
        import re
        token = token.lower()
        return re.split(r'[^\w\s]', token)

    tokens = chain(*map(_clean_token, tokens))
    MIN_TOKEN_LENGTH = 2
    return filter(
        lambda t: len(t) >= MIN_TOKEN_LENGTH,
        tokens)

def _filter_out_stop_words(tokens):
    from nltk.corpus import stopwords
    import string
    stop_list = \
        stopwords.words('english') + \
        list(string.punctuation) + \
        WINE_STOP_WORDS
    return filter(
            lambda token: token not in stop_list,
            tokens)


def transfrom_to_bag_of_words(senstense):
    """
    Transform sentense to bag of words using nltk tokeniser
    Filter out stop words
    """
    tokens = _tokenise(senstense)
    tokens = _clean_tokens(tokens)
    return _filter_out_stop_words(tokens)

wines_df['description_as_bag_of_words'] = wines_df.apply(
    lambda x: transfrom_to_bag_of_words(x.description),
    axis=1)

# Train Support Vector Regressor with different C an epsilon

In [4]:
def _train_and_validate_svr(
        C, gamma, X, y):
    """
    Train svr with given params an
    calculate accuracy using cross validation
    """
    import numpy as np
    from sklearn.svm import SVC
    from sklearn.model_selection import cross_val_score
    clf = SVC(C=C, gamma=gamma)
    NUMBER_OF_CROSS_VAL_TRIES = 3
    scores = cross_val_score(
        clf, X, y, cv=NUMBER_OF_CROSS_VAL_TRIES,
        n_jobs=1, # switch of multiprocessing - too slow with 2nd python
        verbose=3)
    return \
        clf.fit(X, y), \
        np.mean(scores)

def _calculate_tf_idf(df_):
    from sklearn.feature_extraction.text import TfidfVectorizer
    print('Calculate tf idf')
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(
        df_.as_matrix())

def _scale_numerical_features(df_):
    from sklearn.preprocessing import MinMaxScaler
    print('Scale numerical')
    scaler =  MinMaxScaler()
    return scaler.fit_transform(
        df_.fillna(0).as_matrix())

def _binarize_categorical_features(df_):
    import numpy as np
    from sklearn.feature_extraction import DictVectorizer
    print('Binarize categorical features')
    vectorizer = DictVectorizer(sparse=False)
    # convert dataframe to matrix
    D = [
        val for index, val in 
        sorted(
            df_.T.to_dict().items(),
            key=lambda x: x [0])]
    X = vectorizer.fit_transform(D)
    X = np.nan_to_num(X)
    return X

def _prepare_text_feature_for_tf_idf(df_, text_feature):
    """
    Join array to string, so it can be processes by tf idf transformer
    Modifies dataframe column inplace
    """
    def _join_array_to_string(arr):
        return ' '.join((arr))

    new_text_feature = '%s_' % text_feature
    df_[new_text_feature] = df_.apply(
        lambda x: _join_array_to_string(x[text_feature]),
        axis=1)
    return new_text_feature

def _select_significant_text_features_using_classifier(X, y):
    """
    Use LinearSVC to preselect text features
    """
    # TODO experiment with C
    from sklearn.svm import LinearSVC
    from sklearn.feature_selection import SelectFromModel
    print('Select significant features')
    lc = LinearSVC(
        C=0.05,
        penalty="l1", 
        dual=False).fit(X, y)
    model = SelectFromModel(lc, prefit=True)
    return model.transform(X)

def _transform_points_to_class(points):
    """
    Return class number according to points amount
    """
    HIGH_RATING = 94
    LOW_RATING = 83
    if points < LOW_RATING:
        # low rating wines
        return 0
    if points < HIGH_RATING:
        # average wines
        return 1
    return 2 # exceptional wines

def _generate_x_and_y_from_df(
        df_, text_feature, numerical_features, categorical_features,
        result, normalize_results=True):
    """
    Generate normalised matrix from dataframe
    """
    import numpy as np
    from scipy import sparse 
    from sklearn.preprocessing import normalize
    # get y
    y = df_[result].\
        apply(
            _transform_points_to_class).\
        as_matrix()
    
    # get x
    new_text_feature = _prepare_text_feature_for_tf_idf(df_, text_feature)
    text_x = _calculate_tf_idf(df_[new_text_feature])
    # preselect significant x features
    text_x = _select_significant_text_features_using_classifier(
        text_x, y)
    numerical_features = _scale_numerical_features(df_[numerical_features])
    categorical_features = _binarize_categorical_features(df_[categorical_features])
    categorical_features = _select_significant_text_features_using_classifier(
        categorical_features, y)
    X = sparse.hstack(
        (
            text_x,
            sparse.csr_matrix(numerical_features),
            sparse.csr_matrix(categorical_features)
        ))
    if not normalize_results:
        return X, y
    print('Normalize')
    return normalize(X), y

def try_train_svr(
        df_, text_feature, numerical_features, categorical_features, result):
    """
    Experiment with different types of C and epsilon to train SVR on
    given dataset
    """
    import itertools
    X, y = _generate_x_and_y_from_df(
        df_, text_feature, 
        numerical_features, categorical_features, result)
    X = X.todense()

    C_s = [
        0.1, 1, 10, 100, 1000]
    gammas = [0.1, 0.01, 0.001, 0.0001, 0.00001]
    c_and_gammas = itertools.product(C_s, gammas)

    # run regression with different SVR params
    regression_results = []
    for C, gamma in c_and_gammas:
        print('Test C and gamma:%f / %f' % (C, gamma))
        accuracy = _train_and_validate_svr(
            C, gamma, X, y)
        print('Accuracy', accuracy)
        regression_results.append(
            {
                'accuracy': accuracy,
                'C': C,
                'gamma': gamma
            })

    regression_results = sorted(
           regression_results,
           key=lambda res: res['accuracy'],
           reversed=True)

    # return regression results
    return regression_results


def show_regresion_results(results):
    for result in results[:5]:
        print('C', result['C'])
        print('gamma', result['gamma'])
        print('accuracy', result['accuracy'])
       
    
TEXT_FEATURE = 'description_as_bag_of_words'
NUMERICAL_FEATURES = [
    'price',
    
]
CATEGORICAL_FEATURES = [
    'country',
    'variety',
    'province',
    'region_1'
]
RESULT_COLUMN = 'points'

regression_results = try_train_svr(
        wines_df, TEXT_FEATURE, 
        NUMERICAL_FEATURES, 
        CATEGORICAL_FEATURES,
        RESULT_COLUMN)

show_regresion_results(regression_results)

Calculate tf idf
(129971, 30810)
Select significant features
(129971, 521)
Scale numerical
Binarize categorical features
(129971, 2408)
Select significant features
(129971, 291)
Normalize
Test C and gamma:0.100000 / 0.100000
[CV]  ................................................................
[CV] ........................... , score=0.929992613794, total=72.3min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 72.3min remaining:    0.0s


[CV] ........................... , score=0.929992613794, total=71.2min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 143.5min remaining:    0.0s


[CV] ........................... , score=0.929990997853, total=70.8min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 214.3min finished


NameError: global name 'np' is not defined

In [None]:
# Initialise Plotly
py.offline.init_notebook_mode()

# Helpers for visualising distribution
def _get_value_counts_from_series(column, series):
    """
    Return series unique values as x and value counts as y
    """
    # display zeros on graph also
    df = series.value_counts().to_frame()
    df = df.sort_index()
    return df.index.tolist(), df[column].tolist()

def _plot_numeric_feature(column, series):
    """
    Helper to display scatter plot 
    using series index as x and series values as y
    """
    x, y = _get_value_counts_from_series(column, series.fillna(0))
    data = py.graph_objs.Scatter(
        x=x, y=y)
    layout = py.graph_objs.Layout(title=column)
    py.offline.iplot({
            'data': [data], 
            'layout': layout})
    
    
def _plot_categorical_featrue(column, series):
    """
    Helper to display bar plot 
    using series index as x and series values as y
    """
    x, y = _get_value_counts_from_series(column, series.fillna('No data'))
    data = py.graph_objs.Bar(
        x=x, y=y)
    layout = py.graph_objs.Layout(title=column)
    py.offline.iplot({
            'data': [data], 
            'layout': layout})

    
# Visualise distributions for categorical and numerical features
CATEGORICAL_FEATURES = [
    'country',
    'province',
    'region_1',
    'region_2',
    'variety',
    'winery'
]

NUMERICAL_FEATURES = [
    'points',
    'price'
]
for column in CATEGORICAL_FEATURES:
    _plot_categorical_featrue(column, wines_df[column])
    
for column in NUMERICAL_FEATURES:
    _plot_numeric_feature(column, wines_df[column])

## Data cleanliness

1. _63 wines_ have no **country**
2. _80k_ have no **region 2**

# What worlds are the most and least common in descriptions?

In [None]:
def calculate_words_popularity(
        descriptions, include_numbers=False):
    """
    Calculate each words popularity as number of descriptions
    where the word can be found
    """
    from collections import Counter
    from itertools import chain
    # remove dublicates from descriptions
    descriptions_as_sets = [
        set(description)
        for description in descriptions.tolist()]
    descriptions_flat = chain(*descriptions_as_sets)
    if not include_numbers:
        descriptions_flat = filter(
            lambda w: not w.isdigit(),
            descriptions_flat)
    word_counts = Counter(descriptions_flat)
    return word_counts

def plot_words_popularity(word_counts, title):
    """
    Plot words poularity. Expects counter format
    """
    x, y = \
        [c[0] for c in word_counts], \
        [c[1] for c in word_counts]
    
    data = py.graph_objs.Bar(
        x=x, y=y)
    layout = py.graph_objs.Layout(title=title)
    py.offline.iplot({
            'data': [data], 
            'layout': layout})

def show_most_popular_and_less_popular_words(
        descriptions, graph_title):
    """
    Show mots and less popular words in passed descriptions
    """
    N_POPULAR = 100
    word_counts = calculate_words_popularity(descriptions)
    # Show most popular words
    plot_words_popularity(
        word_counts.most_common(N_POPULAR),
        'Most popular words -- %s' % graph_title)
    # Show less popular words
    plot_words_popularity(
        word_counts.most_common()[:-N_POPULAR-1:-1],
        'Less popular words -- %s' % graph_title)    

show_most_popular_and_less_popular_words(
    wines_df.description_as_bag_of_words,
    'Total')

# What worlds are the most common in high rated wines?
1. What is the average rate?
2. What is std for rate?
3. What are the high rating and low rating ranges?
4. What words are common for high rated wines?

In [None]:
# Plot average rate and stanart deviations

def show_metric_with_average(metric, series):
    """
    Plot metric values destribution, avg and std
    """
    x, y = _get_value_counts_from_series(
        metric, series.fillna(0))
    data = py.graph_objs.Scatter(
        x=x, y=y, 
        name='%s distribution' % metric)
    # get data to plot avg
    avg = series.mean()
    avg_trace = py.graph_objs.Scatter(
        x=[avg, ] * len(y),
        y=y,
        mode='markers',
        marker={
            'color': 'rgb(128, 0, 128)',
            'symbol': 'diamond-open',
        })
    # get data to plot stds
    std = series.std()
    std_traces = []
    MAX_STD_MULTIPLIER = 3
    x_y = zip(x, y)
    for std_multiplier in\
            xrange(1, MAX_STD_MULTIPLIER + 1):
        # filter metric values and frequencies
        # where metric is in range 
        # [avg - std * std_multiplie, avg + std * std_multiplier]
        x_y_ = filter(
            lambda (x, y): \
                (x >= avg - std * std_multiplier) and\
                (x <= avg + std * std_multiplier),
            x_y)
        # append scatter plot with filled are in range
        std_traces.append(
            py.graph_objs.Scatter(
                x=[x for x, y in x_y_],
                y=[y for x, y in x_y_],
                fill='tozeroy')    
        )
        
    layout = py.graph_objs.Layout(title=column)
    py.offline.iplot({
            'data': [data, avg_trace,] + std_traces, 
            'layout': layout}) 


show_metric_with_average('points', wines_df.points)   

1. **high rating** >= 94
2. **low rating** <= 83

In [None]:
HIGH_RATING = 94
LOW_RATING = 83


# Show words for high rating
show_most_popular_and_less_popular_words(
    wines_df[wines_df.points >= HIGH_RATING]['description_as_bag_of_words'],
    'Wines with high rating')

# Show words for low rating
show_most_popular_and_less_popular_words(
    wines_df[wines_df.points <= LOW_RATING]['description_as_bag_of_words'],
    'Wines with low rating')


# What words can be found only in one group?

In [None]:
def filter_words_in_series(
        series, not_include_words_series):
    """
    FIlter words in series, that are not in not_include_words_series
    """
    def _filter_out_words(
            bag_of_words, not_include_words_dict):
        return filter(
            lambda w: w not in not_include_words_dict,
            bag_of_words)

    from itertools import chain
    series = series.copy()
    # get dict of not include words
    not_include_words_dict = {
        word: 1 for word in 
        chain.from_iterable(
            not_include_words_series.tolist())
        }
    series = series.apply(
        lambda b: _filter_out_words(b, not_include_words_dict))
    return series

# Show most and less popular in high rating wines only
show_most_popular_and_less_popular_words(
    filter_words_in_series(
        wines_df[wines_df.points >= HIGH_RATING]['description_as_bag_of_words'], 
        wines_df[wines_df.points <= LOW_RATING]['description_as_bag_of_words'],
    ),
    'Words in high rating wines descriptions only')

# Show most and less popular in high rating wines only
show_most_popular_and_less_popular_words(
    filter_words_in_series(
        wines_df[wines_df.points <= LOW_RATING]['description_as_bag_of_words'],
        wines_df[wines_df.points >= HIGH_RATING]['description_as_bag_of_words']
    ),
    'Words in low rating wines descriptions only')

# What words can be found in both groups (high and low rating)?


In [None]:
def filter_words_in_both_series(
        series_1, series_2):
    """
    FIlter words in series, that are in both series
    """
    def _filter_out_words_in_dict(
            bag_of_words, include_words_dict):
        return filter(
            lambda w: w in include_words_dict,
            bag_of_words)
    
    def _generate_word_dict_from_series(
            series):
        from itertools import chain
        return {
            word: 1 for word in 
            chain.from_iterable(
                series.tolist())}

    series_1 = series_1.copy()
    series_2 = series_2.copy()
    series_1 = series_1.apply(
        lambda b: _filter_out_words_in_dict(
                        b, 
                        _generate_word_dict_from_series(series_2)))
    series_2 = series_2.apply(
        lambda b: _filter_out_words_in_dict(
                        b, 
                        _generate_word_dict_from_series(series_1)))
    return pd.concat([series_2, series_1])
                
                
show_most_popular_and_less_popular_words(
    filter_words_in_both_series(
        wines_df[wines_df.points >= HIGH_RATING]['description_as_bag_of_words'], 
        wines_df[wines_df.points <= LOW_RATING]['description_as_bag_of_words'],
    ),
    'Words both in high rating wines and low rating wines descriptions')

# What words can be found in all rating groups?

In [None]:
def get_words_that_can_be_found_in_all_ratings(df_):
    """
    Get the list of the words that can be found in description
    of all ratings
    """
    from itertools import chain
    
    dictionary_by_ratings = {}
    for rating in df_.points.unique():
        dictionary_by_ratings[rating] = \
            set(
                chain.from_iterable(
                    df_[df_.points == rating]['description_as_bag_of_words'].tolist()))
    dictionaries = list(dictionary_by_ratings.values())
    result_set = dictionaries[0]
    for dictionary in dictionaries[1:]:
        result_set = result_set & dictionary
    return result_set

def filter_descriptions_that_are_common_for_all_ratings(df_):
    def _filter_common_words(bag_of_words, common_words_dictionary):
        return filter(
            lambda w: w in common_words_dictionary,
            bag_of_words)

    common_words_dictionary = \
        get_words_that_can_be_found_in_all_ratings(df_)
    series = df_.description_as_bag_of_words.copy()
    return series.apply(
        lambda x: _filter_common_words(x, common_words_dictionary))

show_most_popular_and_less_popular_words(
    filter_descriptions_that_are_common_for_all_ratings(wines_df),
    'Wine descriptions that are common to all ratings')

# What words are the most common for each of the grapes?

In [None]:
MIN_LEN_TO_DISPLAY = 0.05 * len(wines_df)
for grape in wines_df.variety.unique():
    grapes_df = wines_df[wines_df.variety == grape]
    if len(grapes_df) < MIN_LEN_TO_DISPLAY:
        continue
    show_most_popular_and_less_popular_words(
        grapes_df.description_as_bag_of_words,
        'Grape %s. Total amount of wines %d' % (
            grape, len(grapes_df)))
    

# Is there correlation between price and rating?

In [None]:
def plot_correlation_between_metrics(
        metric_to_group_by, metric_to_calculate_average, df_):
    avg_price_for_points = df_.groupby(
            [metric_to_group_by, ])[metric_to_calculate_average].mean().to_frame()

    data = py.graph_objs.Scatter(
        x=avg_price_for_points.index, 
        y=avg_price_for_points[metric_to_calculate_average])
    layout = py.graph_objs.Layout(
        title='Avg %s by %s' % (
            metric_to_calculate_average, metric_to_group_by))
    py.offline.iplot({
            'data': [data], 
            'layout': layout})  
    
def plot_correlation_between_price_and_and_rating(df_):
    # plot 2 dimensional scatter plot
    data = py.graph_objs.Scatter(
        x=df_.points, 
        y=df_.price,
        mode='markers')
    layout = py.graph_objs.Layout(
        title='Dependacnies between price and rating')
    py.offline.iplot({
            'data': [data], 
            'layout': layout})
                     
    # plot average price for rating
    plot_correlation_between_metrics(
        'points', 'price', df_)
    
    plot_correlation_between_metrics(
        'price', 'points', df_)

                     
plot_correlation_between_price_and_and_rating(wines_df)