#### Questions:
#### How to interpret beer style coefficients in the regression model (as one categorical model, do not remove beer styles)
#### If we use beer style, should we balance out the samples (remove top 5 and bottom 5 beers) (no)

#### Group beer styles by type of yeast used? (ale, lager, hybrid)
#### Ales are fermented quicker, are more aromatic, and fruity
#### Lagers are fermented slower and at lower temperatures to create a "hoppy" taste
#### Hybrids are a combination of ale and lager
#### https://www.beeradvocate.com/beer/style/

#### How to select columns to use in regression? Lasso technique?
#### Split sentiment for sentiment by sentence. Find sentences with synonyms for each rating dimension
#### create aroma sentiment, appearance sentiment, etc...
#### interaction between age and beer style

In [11]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import string
import webcolors

from datetime import datetime
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold

from string import digits
from textblob import TextBlob, Word

Baseline features
1. beer/ABV - the alcohol by volume of the beer
2. beer/style
3. user's gender
4. user's age in years

Extra features from review text
1. sentiment of the review
2. adjectives
3. adverbs
4. verbs
4. colors

In [12]:
def convertUnixTimeToYears(unixTimes):
    ageInYears = []
    today = datetime.now()

    for age in unixTimes:
        birthdate = datetime.fromtimestamp(int(age))
        delta = today - birthdate
        years = delta.days / 365
        ageInYears.append(years)

    return (ageInYears)

In [13]:
def fill_missing_values(df):
    df['beer/style'] = df['beer/style'].fillna('missing')
    df['review/text'] = df['review/text'].fillna('')
    df['user/birthdayUnix'] = df['user/birthdayUnix'].fillna(np.mean(df['user/birthdayUnix']))

In [14]:
def read_type(filename):
    all_ales = []
    with open(filename, 'r') as f:
        for line in f:
            name = line.lower().replace('/','').replace('(','').replace(')','').strip()
            name = re.sub(' +',' ',name)
            all_ales.append(name)
            
    return all_ales

In [15]:
def assign_beer_category(df):
    all_ales = read_type('ales.txt')
    all_lagers = read_type('lagers.txt')
    all_hybrids = read_type('hybrids.txt')

    category = []

    for style in df['beer/style']:
        style = style.lower().replace('/','').replace('(','').replace(')','')
        style = re.sub(' +',' ',style)

        if style in all_ales:
            category.append('ale')

        elif style in all_lagers or 'oktoberfest' in style or \
        'keller bier zwickel bier' in style:
            category.append('lager')

        elif style in all_hybrids:
            category.append('hybrid')

        else:
            category.append('other')

    df['beer/category'] = category

In [16]:
# returns tf-idf vectors (scipy matrix) and tf-idf model (TfidfVectorizer) 
# for a corpus of words
def get_tf_idf_vector(df):
    # replace multiple spaces, punctuation, and digits from review words
    all_documents = [' '.join([re.sub('\s+', '', \
        word.strip()).replace(' ','').translate(None, string.punctuation).translate(None, digits) 
        for word in review.split(' ') \
        if len(word) > 0]) for review in df['review/text']]

    # extract vectors
    tokenize = lambda doc: doc.lower().split(" ")
    sklearn_tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, 
        sublinear_tf=True, tokenizer=tokenize, stop_words='english')
    sklearn_representation = sklearn_tfidf.fit_transform(all_documents)  
    
    return sklearn_tfidf, sklearn_representation

In [17]:
# creates a vocab of top tf-idf words from a corpus
def get_tf_idf_words(df):
    sklearn_tfidf, sklearn_representation = get_tf_idf_vector(df)
    feature_names = sklearn_tfidf.get_feature_names()
    vocab = set()
    
    for i in range(len(all_documents)):
        tags = TextBlob(all_documents[i]).tags

        feature_index = sklearn_representation[i,:].nonzero()[1]
        tfidf_scores = zip(feature_index, np.sort([sklearn_representation[i, x] for x in feature_index])[::-1][:5])

        for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
            for tag in tags:
                if w == tag[0] and (tag[1] == 'JJ' or tag[1] == 'NN'):
                    vocab.add(w)
                    
    return vocab

In [18]:
# extract average sentiment by sentence
def extract_sentiment(corpus):
    sentiment = []
    
    for text in corpus:
        curr = []
        sentences = TextBlob(text).sentences
        
        for sentence in sentences:
            curr.append(sentence.sentiment.polarity)
            
        if len(curr) == 0:
            curr.append(TextBlob(text).sentiment.polarity)
            
        sentiment.append(np.mean(curr))
    
    return sentiment

In [19]:
# extract average sentiment by sentence
def extract_overall_sentiment(corpus):
    sentiment = []
    
    for text in corpus:
        sentiment.append(TextBlob(text).polarity)
    
    return sentiment

In [20]:
def format_predictions(X, df, ratings, Xtest, ytest):
    for rating in ratings:
        y = df[rating]
        reg = LinearRegression()
        reg.fit(X, y)
        ytest[rating] = reg.predict(Xtest)
        
    return result

In [21]:
# removes rows containing beer styles from train that are not in test
def remove_different_styles(train, test):
    testStyles = test['beer/style'].unique()
    trainStyles = train['beer/style'].unique()
    diffStyles = np.setdiff1d(trainStyles, testStyles)
    train = train[~train['beer/style'].isin(diffStyles)]
    
    return train

In [22]:
def bag_of_words(data):
    vectorizer = CountVectorizer(stop_words="english", max_features=1000)
    X_train_counts = vectorizer.fit_transform([str(text) for text in data['review/text']])

    tf_transformer = TfidfTransformer().fit(X_train_counts)
    X_tfidf = tf_transformer.transform(X_train_counts)

    X = pd.DataFrame(X_tfidf.todense())
    
    return X

In [35]:
# extracts user's age in years and review polarity from dataset
def extract_features(df, y, cols_keep, alpha=0.001):
    fill_missing_values(df)
    df = df[cols_keep]
    
    X = bag_of_words(df)
    X['sentence_polarity'] = extract_sentiment(df['review/text'])
    X['review_polarity'] = extract_overall_sentiment(df['review/text'])
    X['beer/style'] = df['beer/style']
    X['beer/ABV'] = df['beer/ABV']
    fill_missing_values(X)
    
    tmp = pd.get_dummies(X, columns=["beer/style"], prefix=["style"])
    select_features(X, tmp, y, alpha)
    
    X = pd.get_dummies(X, columns=["beer/style"], prefix=["style"])
    
    return X

In [24]:
def select_features_from_lasso(X, y, alpha):
    # fit lasso model and pass to select from model
    lasso = Lasso(alpha).fit(X, y)
    model = SelectFromModel(lasso, prefit=True)

    # new features
    X_new = model.transform(X)
    return X.columns[model.get_support()]

In [25]:
def select_features(df, X, y, alpha):
    feat = select_features_from_lasso(X, y, alpha)
    
    styles = []
    categories = []
    words = []
    
    for col in feat:
        if 'style_' in col:
            styles.append(col.replace('style_',''))
            
        if 'category_' in col:
            categories.append(col.replace('category_',''))
            
        if col.isdigit():
            words.append(col)
            
    for col in df.columns:
        if col.isdigit() and col not in words:
            df = df.drop([col], axis = 1)
    
    df['beer/category'][~(df['beer/category'].isin(categories))] = 'other'
    df['beer/style'][~(df['beer/style'].isin(styles))] = 'other'
    
    return df

In [None]:
df = pd.DataFrame.from_csv('train.csv')
X = extract_features(df, df[ratings], cols_keep, alpha = 1)
X.head()

In [26]:
df = pd.DataFrame.from_csv('train.csv')
ratings = ['review/appearance','review/aroma','review/overall','review/palate','review/taste']
cols_keep = ['beer/style', 'user/birthdayUnix', 'review/text', 'beer/ABV']
len(df)

37500

In [14]:
fill_missing_values(df)
len(df)

37500

### Convert beer style to numerical features via one-hot encoding
#### 96 features used

In [15]:
df['userAgeInYears'] = convertUnixTimeToYears(df['user/birthdayUnix'])
X = df[["beer/style", 'beer/ABV', 'userAgeInYears']]
X = pd.get_dummies(X, columns=["beer/style"], prefix=["style"])

In [16]:
X.head()

Unnamed: 0_level_0,beer/ABV,userAgeInYears,style_Altbier,style_American Adjunct Lager,style_American Amber / Red Ale,style_American Amber / Red Lager,style_American Barleywine,style_American Black Ale,style_American Blonde Ale,style_American Brown Ale,...,style_Scotch Ale / Wee Heavy,style_Scottish Ale,style_Scottish Gruit / Ancient Herbed Ale,style_Smoked Beer,style_Tripel,style_Vienna Lager,style_Weizenbock,style_Wheatwine,style_Winter Warmer,style_Witbier
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
40163,5.0,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8135,11.0,40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10529,4.7,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44610,4.4,41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37062,4.4,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### simple linear regression on each rating with 5-fold cross validation
#### score results via negative mean squared error

In [18]:
results = {}
for rating in ratings:
    reg = LinearRegression()
    scores = cross_val_score(reg, X, df[rating], cv=5, scoring='neg_mean_squared_error')
    results[rating] = np.mean(scores)

print (results)

{'review/appearance': -0.26254842658791933, 'review/taste': -0.36146235792824416, 'review/palate': -0.32955588074837056, 'review/overall': -0.40436346160060266, 'review/aroma': -0.31103058294692559}


#### Add review sentiments to features
#### polarity = how positive, neutral, or negative the review is
#### subjectivity = how biased the review is 

In [20]:
X['review_polarity'] = extract_sentiment(df['review/text'])

In [None]:
X.head()

#### Evaluate model with five-fold cross validation

In [None]:
results = {}
for rating in ratings:
    reg = LinearRegression()
    scores = cross_val_score(reg, X, base[rating], cv=5, scoring='neg_mean_squared_error')
    results[rating] = np.mean(scores)

print (results)

#### Add an interaction between age and beer style

In [None]:
X['style/interaction'] = X['userAgeInYears']

In [None]:
results = {}
for rating in ratings:
    reg = LinearRegression()
    scores = cross_val_score(reg, X, base[rating], cv=5, scoring='neg_mean_squared_error')
    results[rating] = np.mean(scores)

print (results)

#### Assign beer styles to categories: ale, lager, hybrid, or other and add to feature set
#### See whether or not categorizing the beers will improve rating prediction accuracy
#### Adding categories did not improve MSE

In [None]:
assign_beer_category(df)
X['beer/category'] = df['beer/category']
X = pd.get_dummies(X, columns=["beer/category"], prefix=["style"])
X.head()

In [None]:
results = {}
for rating in ratings:
    reg = LinearRegression()
    scores = cross_val_score(reg, X, base[rating], cv=5, scoring='neg_mean_squared_error')
    results[rating] = np.mean(scores)

print (results)

#### output predictions on testing set

In [106]:
results = {}

for rating in ratings:
    df = pd.DataFrame.from_csv('train.csv')
    ytrain = df[ratings][rating]
    
    Xtrain = extract_features(df, ytrain, cols_keep)
    
    reg = LinearRegression()
    scores = cross_val_score(reg, Xtrain, ytrain, cv=5, scoring='neg_mean_squared_error')
    results[rating] = np.mean(scores)
    
print results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

       beer/ABV  userAgeInYears  review_polarity  style/interaction  \
index                                                                 
40163       5.0              40        -0.049583                 40   
8135       11.0              40         0.151150                 40   
10529       4.7              40         0.217778                 40   
44610       4.4              41         0.227381                 41   
37062       4.4              40         0.347628                 40   

       style_American Adjunct Lager  style_American Amber / Red Ale  \
index                                                                 
40163                           0.0                             0.0   
8135                            0.0                             0.0   
10529                           0.0                             0.0   
44610                           0.0                             0.0   
37062                           0.0                             0.0   

    

In [None]:
format_predictions(Xtrain, ytrain, ratings, Xtest, ytest)
ytest.to_csv('results.csv')

In [None]:
805-714-7899