## Import Library

In [218]:
import pandas as pd
import numpy as np
import time # to calculate time to compile the program
import ftfy # for cleaning text

from sklearn.svm import SVC  #Support Vector Machines
from sklearn.ensemble import RandomForestClassifier #Random Forest Classifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score # Model Validation metrics


from sklearn.feature_extraction.text import CountVectorizer # to tokenize a collection of text 
from sklearn.feature_extraction.text import TfidfVectorizer # to tokenize a collection of text 

from sklearn.model_selection import train_test_split # to split the data
from tensorflow.keras.models import Sequential # recurrent neural network
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

import re
import nltk
from nltk.corpus import stopwords # to preprocess data
from nltk.stem import WordNetLemmatizer # to get meaning full word
import string
lem = WordNetLemmatizer()

## Read data file

In [219]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,index,beer/ABV,beer/beerId,beer/brewerId,beer/name,beer/style,review/appearance,review/aroma,review/overall,review/palate,review/taste,review/text,review/timeStruct,review/timeUnix,user/ageInSeconds,user/birthdayRaw,user/birthdayUnix,user/gender,user/profileName
0,40163,5.0,46634,14338,Chiostro,Herbed / Spiced Beer,4.0,4.0,4.0,4.0,4.0,Pours a clouded gold with a thin white head. N...,"{'min': 38, 'hour': 3, 'mday': 16, 'sec': 10, ...",1229398690,,,,,RblWthACoz
1,8135,11.0,3003,395,Bearded Pat's Barleywine,American Barleywine,4.0,3.5,3.5,3.5,3.0,12oz bottle into 8oz snifter.\t\tDeep ruby red...,"{'min': 38, 'hour': 23, 'mday': 8, 'sec': 58, ...",1218238738,,,,,BeerSox
2,10529,4.7,961,365,Naughty Nellie's Ale,American Pale Ale (APA),3.5,4.0,3.5,3.5,3.5,First enjoyed at the brewpub about 2 years ago...,"{'min': 7, 'hour': 18, 'mday': 26, 'sec': 2, '...",1101492422,,,,Male,mschofield
3,44610,4.4,429,1,Pilsner Urquell,Czech Pilsener,3.0,3.0,2.5,3.0,3.0,First thing I noticed after pouring from green...,"{'min': 7, 'hour': 1, 'mday': 20, 'sec': 5, 'y...",1308532025,1209827000.0,"Aug 10, 1976",208508400.0,Male,molegar76
4,37062,4.4,4904,1417,Black Sheep Ale (Special),English Pale Ale,4.0,3.0,3.0,3.5,2.5,A: pours an amber with a one finger head but o...,"{'min': 51, 'hour': 6, 'mday': 12, 'sec': 48, ...",1299912708,,,,,Brewbro000


In [220]:
# To find out number of columns and rows.
df.shape

(37500, 19)

In [221]:
df.columns

Index(['index', 'beer/ABV', 'beer/beerId', 'beer/brewerId', 'beer/name',
       'beer/style', 'review/appearance', 'review/aroma', 'review/overall',
       'review/palate', 'review/taste', 'review/text', 'review/timeStruct',
       'review/timeUnix', 'user/ageInSeconds', 'user/birthdayRaw',
       'user/birthdayUnix', 'user/gender', 'user/profileName'],
      dtype='object')

## Data cleaning and Data preprocessing

In [222]:
# checking missing values in percentange with ascending order.

percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True, ascending = False)
missing_value_df

Unnamed: 0,column_name,percent_missing
user/birthdayUnix,user/birthdayUnix,79.050667
user/birthdayRaw,user/birthdayRaw,79.050667
user/ageInSeconds,user/ageInSeconds,79.050667
user/gender,user/gender,59.162667
review/text,review/text,0.026667
user/profileName,user/profileName,0.013333
review/taste,review/taste,0.0
review/timeUnix,review/timeUnix,0.0
review/timeStruct,review/timeStruct,0.0
index,index,0.0


In [223]:
# removing columns with more than 30% missing rows
more_than_30_per = missing_value_df[missing_value_df['percent_missing']>30]
more_than_30_per

Unnamed: 0,column_name,percent_missing
user/birthdayUnix,user/birthdayUnix,79.050667
user/birthdayRaw,user/birthdayRaw,79.050667
user/ageInSeconds,user/ageInSeconds,79.050667
user/gender,user/gender,59.162667


In [224]:
df = df.drop(more_than_30_per.index, axis=1)

In [225]:
# there is only columns with less than 1% missing values.
for i in df.columns:
    if(df["{}".format(i)].isna().sum() != 0):
        print(i)
# so we remove directly that row because it will not affect on our model.
df = df.dropna()
# reindexing the data frame
df = df.reset_index(drop=True)

review/text
user/profileName


In [139]:
# now there is no missing value in our data set.
df.isnull().sum().sum()

0

In [140]:
# check duplicate rows in data set
df.duplicated().sum()

0

In [141]:
df.shape

(37485, 15)

In [143]:
df.head(1)

Unnamed: 0,index,beer/ABV,beer/beerId,beer/brewerId,beer/name,beer/style,review/appearance,review/aroma,review/overall,review/palate,review/taste,review/text,review/timeStruct,review/timeUnix,user/profileName
0,40163,5.0,46634,14338,Chiostro,Herbed / Spiced Beer,4.0,4.0,4.0,4.0,4.0,Pours a clouded gold with a thin white head. N...,"{'min': 38, 'hour': 3, 'mday': 16, 'sec': 10, ...",1229398690,RblWthACoz


## Feature Engineering

In [144]:
# we convert float columns into integer to remove fractional part.
float_columns = ['beer/ABV', 'review/appearance', 'review/aroma', 'review/overall', 'review/palate', 'review/taste']
for i in float_columns:
    df["{}".format(i)] = df["{}".format(i)].astype(int)

In [145]:
# now our float columns is properly scale.
for i in float_columns:
    print(sorted(df['{}'.format(i)].unique()))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 30, 39, 43, 57]
[0, 1, 2, 3, 4, 5]
[1, 2, 3, 4, 5]
[0, 1, 2, 3, 4, 5]
[1, 2, 3, 4, 5]
[1, 2, 3, 4, 5]


In [146]:
# Now we consider only important columns for father process.
df2 = pd.DataFrame(df, columns = ['beer/ABV', 'beer/name','beer/style', 'review/appearance', 'review/aroma', 'review/overall',
       'review/palate', 'review/taste', 'review/text'])

## Natural language Processing

### Now we apply NLP on textual columns

In [147]:
df2.head(3)

Unnamed: 0,beer/ABV,beer/name,beer/style,review/appearance,review/aroma,review/overall,review/palate,review/taste,review/text
0,5,Chiostro,Herbed / Spiced Beer,4,4,4,4,4,Pours a clouded gold with a thin white head. N...
1,11,Bearded Pat's Barleywine,American Barleywine,4,3,3,3,3,12oz bottle into 8oz snifter.\t\tDeep ruby red...
2,4,Naughty Nellie's Ale,American Pale Ale (APA),3,4,3,3,3,First enjoyed at the brewpub about 2 years ago...


In [148]:
# There is three columns with text data so we combine into one coloum
df2['combine_text'] = df[['beer/name', 'beer/style', 'review/text']].apply(lambda x: ' '.join(x), axis = 1)
df2 = df2.drop(['beer/name', 'beer/style', 'review/text'], axis = 1)

In [149]:
df2.head(3)

Unnamed: 0,beer/ABV,review/appearance,review/aroma,review/overall,review/palate,review/taste,combine_text
0,5,4,4,4,4,4,Chiostro Herbed / Spiced Beer Pours a clouded ...
1,11,4,3,3,3,3,Bearded Pat's Barleywine American Barleywine 1...
2,4,3,4,3,3,3,Naughty Nellie's Ale American Pale Ale (APA) F...


## Remove url, html tags, emoji , punctuation and digits from taxt columns 

In [152]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

In [153]:
text_columns = ['combine_text']
for i in text_columns:
    df2["{}".format(i)] = df2["{}".format(i)].apply(lambda x : remove_URL(x))
    df2["{}".format(i)] = df2["{}".format(i)].apply(lambda x : remove_html(x))
    df2["{}".format(i)] = df2["{}".format(i)].apply(lambda x: remove_emoji(x))
    df2["{}".format(i)] = df2["{}".format(i)].apply(lambda x : remove_punct(x))
    df2["{}".format(i)] = df2["{}".format(i)].apply(lambda x : ftfy.fix_text(x))
df2['combine_text'] = df2['combine_text'].str.replace('\d+', '')

In [154]:
df2.head()

Unnamed: 0,beer/ABV,review/appearance,review/aroma,review/overall,review/palate,review/taste,combine_text
0,5,4,4,4,4,4,Chiostro Herbed Spiced Beer Pours a clouded g...
1,11,4,3,3,3,3,Bearded Pats Barleywine American Barleywine oz...
2,4,3,4,3,3,3,Naughty Nellies Ale American Pale Ale APA Firs...
3,4,3,3,2,3,3,Pilsner Urquell Czech Pilsener First thing I n...
4,4,4,3,3,3,2,Black Sheep Ale Special English Pale Ale A pou...


In [157]:
start = time.time()
corpus = []
for i in range(len(df2)):
    review = df2['combine_text'][i]
    review = review.lower()
    review = review.split()
    review = [lem.lemmatize(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
lc=(time.time()-start)
print("Time to Complete this Cell :- ",lc)

Time to Complete this Cell :-  1230.8305563926697


In [199]:
corpus[1]

'bearded pat barleywine american barleywine oz bottle oz snifter deep ruby red hue one finger light tan head settle thin rim along glass sharp piercing hop light hard candy background hop provide bitterness arent sharp would expected mostly light candy flavor like aroma full bodied good carbonation finish lingering sweet flavor bit bitterness sweet barleywine weird hard candy flavor seemed dominate bad would liked little complexity'

## Approach with Random Forest Classifier and Model Validation metrics

In [160]:
X = df2[['beer/ABV', 'review/appearance', 'review/aroma', 'review/palate', 'review/taste']]

In [161]:
y = df2['review/overall']

In [162]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### Random Forest Classifier

In [163]:
classifier = RandomForestClassifier(n_estimators = 50)
classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=50)

In [164]:
y_pred1 = classifier.predict(X_test)

### Model Validation metrics

In [165]:
result = confusion_matrix(y_test, y_pred1)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred1)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_test,y_pred1)
print("Accuracy:",result2)

Confusion Matrix:
[[  40   38    7    0    0]
 [  24  225  129   21    1]
 [   3   98 1212  690    4]
 [   1   20  470 3896  122]
 [   0    0    6  351  139]]
Classification Report:
              precision    recall  f1-score   support

           1       0.59      0.47      0.52        85
           2       0.59      0.56      0.58       400
           3       0.66      0.60      0.63      2007
           4       0.79      0.86      0.82      4509
           5       0.52      0.28      0.36       496

    accuracy                           0.74      7497
   macro avg       0.63      0.56      0.58      7497
weighted avg       0.72      0.74      0.73      7497

Accuracy: 0.7352274243030545


### Support Vector Classifier

In [166]:
clf = SVC(kernel='linear') 
clf.fit(X_train, y_train) 

SVC(kernel='linear')

In [167]:
y_pred2 = clf.predict(X_test)

### Model Validation metrics

In [168]:
result = confusion_matrix(y_test, y_pred2)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_pred2)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_test,y_pred2)
print("Accuracy:",result2)

Confusion Matrix:
[[  50   30    5    0    0]
 [  33  239  111   17    0]
 [   2  120 1269  616    0]
 [   1   16  580 3912    0]
 [   0    0    7  489    0]]
Classification Report:
              precision    recall  f1-score   support

           1       0.58      0.59      0.58        85
           2       0.59      0.60      0.59       400
           3       0.64      0.63      0.64      2007
           4       0.78      0.87      0.82      4509
           5       0.00      0.00      0.00       496

    accuracy                           0.73      7497
   macro avg       0.52      0.54      0.53      7497
weighted avg       0.68      0.73      0.70      7497

Accuracy: 0.729625183406696


  _warn_prf(average, modifier, msg_start, len(result))


## Create the LSTM Model

In [169]:
cv1 = CountVectorizer(max_features=100)
X1 = cv1.fit_transform(corpus).toarray()

In [170]:
cv2 = TfidfVectorizer(max_features=15)
X2 = cv2.fit_transform(corpus).toarray()

In [171]:
X2[1]

array([0.        , 0.        , 0.        , 0.64545453, 0.22518387,
       0.13876474, 0.41528596, 0.23786559, 0.        , 0.        ,
       0.23578135, 0.        , 0.        , 0.47831967, 0.        ])

In [172]:
vocabulary = cv2.get_feature_names()
Z = pd.DataFrame(data=X2, columns=vocabulary).iloc[:,0::2]

In [173]:
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size = 0.2, random_state = 0)

In [174]:
model = Sequential()
model.add(LSTM(15, return_sequences=True, input_shape=(15, 1)))
model.add(LSTM(15, return_sequences=True))
model.add(LSTM(15))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

In [175]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_12 (LSTM)               (None, 15, 15)            1020      
_________________________________________________________________
lstm_13 (LSTM)               (None, 15, 15)            1860      
_________________________________________________________________
lstm_14 (LSTM)               (None, 15)                1860      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 16        
Total params: 4,756
Trainable params: 4,756
Non-trainable params: 0
_________________________________________________________________


In [176]:
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [210]:
start = time.time()
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, verbose=1)

Train on 29988 samples, validate on 7497 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x15b2c888f88>

In [211]:
lc=(time.time()-start)
print("Time to Complete above Cell :- ", lc)

Time to Complete above Cell :-  88.43959403038025


In [212]:
model.save("beer_review.h5")

## Conclusion
### 1. "beer/name", "beer/style" and "review/text" are not able to predict review.
### 2. "beer/ABV", "review/appearance", "review/aroma", "review/palate"  and "review/taste" are able to predict review.