## Import the Library

In [1]:
# Import modules
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import re
import pickle

## Import nltk Library

In [2]:
import nltk

In [3]:
dir(nltk)

['AbstractLazySequence',
 'AffixTagger',
 'AlignedSent',
 'Alignment',
 'AnnotationTask',
 'ApplicationExpression',
 'Assignment',
 'BigramAssocMeasures',
 'BigramCollocationFinder',
 'BigramTagger',
 'BinaryMaxentFeatureEncoding',
 'BlanklineTokenizer',
 'BllipParser',
 'BottomUpChartParser',
 'BottomUpLeftCornerChartParser',
 'BottomUpProbabilisticChartParser',
 'Boxer',
 'BrillTagger',
 'BrillTaggerTrainer',
 'CFG',
 'CRFTagger',
 'CfgReadingCommand',
 'ChartParser',
 'ChunkParserI',
 'ChunkScore',
 'Cistem',
 'ClassifierBasedPOSTagger',
 'ClassifierBasedTagger',
 'ClassifierI',
 'ConcordanceIndex',
 'ConditionalExponentialClassifier',
 'ConditionalFreqDist',
 'ConditionalProbDist',
 'ConditionalProbDistI',
 'ConfusionMatrix',
 'ContextIndex',
 'ContextTagger',
 'ContingencyMeasures',
 'CoreNLPDependencyParser',
 'CoreNLPParser',
 'Counter',
 'CrossValidationProbDist',
 'DRS',
 'DecisionTreeClassifier',
 'DefaultTagger',
 'DependencyEvaluator',
 'DependencyGrammar',
 'DependencyGrap

## Import and Combine the three data set

In [4]:
# Read in the data
df = pd.read_csv('/Users/yueyangqin/Desktop/Data_Science/AI/Final Project/DataSet/7282_1.csv')
df2= pd.read_csv('/Users/yueyangqin/Desktop/Data_Science/AI/Final Project/DataSet/Datafiniti_Hotel_Reviews.csv')
df3= pd.read_csv('/Users/yueyangqin/Desktop/Data_Science/AI/Final Project/DataSet/Datafiniti_Hotel_Reviews_Jun19.csv')

# Import address, name, reviews.text from the original dataset, and combine the three data set together
imp_col_list = ['address', 'name', 'reviews.text']
df = df.loc[:,imp_col_list]
df2 = df2.loc[:,imp_col_list]
df3 = df3.loc[:,imp_col_list]
df= df.append(df2)
df= df.append(df3)

In [5]:
df.tail()

Unnamed: 0,address,name,reviews.text
9995,3101 Coliseum Dr,Hampton Inn Hampton-newport News,My friends and I took a trip to Hampton for th...
9996,3101 Coliseum Dr,Hampton Inn Hampton-newport News,"from check in to departure, staff is friendly,..."
9997,3101 Coliseum Dr,Hampton Inn Hampton-newport News,This Hampton is located on a quiet street acro...
9998,7886 Main Street,Roseberry's Inn,Awesome wings (my favorite was garlic parmesan...
9999,3505 S Main St,Hampton Inn-lindale/tyler,Clean facility just off freeway ..... staff fr...


In this data set, we might only need to use the "name", "address", "reviews.text

## Explore the Dataset

First, what is the dataset shape

In [6]:
print("Input data has {} rows and {} cloumns".format(len(df), len(df.columns)))

Input data has 55912 rows and 3 cloumns


Second,how many missing data in the dataset

In [7]:
print("Number of null in address: {}".format(df['address'].isnull().sum()))
print("Number of null in name: {}".format(df['name'].isnull().sum()))
print("Number of null in review text: {}".format(df['reviews.text'].isnull().sum()))


Number of null in address: 0
Number of null in name: 0
Number of null in review text: 23


In this place, we have 23 missing data for the "reviews.text", we will drop the missing data row from our dataset

In [8]:
df.dropna()

Unnamed: 0,address,name,reviews.text
0,Riviera San Nicol 11/a,Hotel Russo Palace,Pleasant 10 min walk along the sea front to th...
1,Riviera San Nicol 11/a,Hotel Russo Palace,Really lovely hotel. Stayed on the very top fl...
2,Riviera San Nicol 11/a,Hotel Russo Palace,Ett mycket bra hotell. Det som drog ner betyge...
3,Riviera San Nicol 11/a,Hotel Russo Palace,We stayed here for four nights in October. The...
4,Riviera San Nicol 11/a,Hotel Russo Palace,We stayed here for four nights in October. The...
...,...,...,...
9995,3101 Coliseum Dr,Hampton Inn Hampton-newport News,My friends and I took a trip to Hampton for th...
9996,3101 Coliseum Dr,Hampton Inn Hampton-newport News,"from check in to departure, staff is friendly,..."
9997,3101 Coliseum Dr,Hampton Inn Hampton-newport News,This Hampton is located on a quiet street acro...
9998,7886 Main Street,Roseberry's Inn,Awesome wings (my favorite was garlic parmesan...


##  Clean our dataset

Use the regular expression to clean, like remove the punctuation, which is useful to clean the dataset
1. Remove the punctuation

In [9]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
df['reviews_text_nopunct'] = df['reviews.text'].str.replace('[^\w\s]','')

lower the case for the review text

In [11]:
df['reviews.text'] = df['reviews.text'].str.lower()
df['reviews_text_nopunct'] = df['reviews_text_nopunct'].str.lower()
df.head()

Unnamed: 0,address,name,reviews.text,reviews_text_nopunct
0,Riviera San Nicol 11/a,Hotel Russo Palace,pleasant 10 min walk along the sea front to th...,pleasant 10 min walk along the sea front to th...
1,Riviera San Nicol 11/a,Hotel Russo Palace,really lovely hotel. stayed on the very top fl...,really lovely hotel stayed on the very top flo...
2,Riviera San Nicol 11/a,Hotel Russo Palace,ett mycket bra hotell. det som drog ner betyge...,ett mycket bra hotell det som drog ner betyget...
3,Riviera San Nicol 11/a,Hotel Russo Palace,we stayed here for four nights in october. the...,we stayed here for four nights in october the ...
4,Riviera San Nicol 11/a,Hotel Russo Palace,we stayed here for four nights in october. the...,we stayed here for four nights in october the ...


2. Tokenization

In [12]:
import re
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens
# convert the column 'reviews_text_nopunct' into string
df['reviews_text_nopunct'] = df['reviews_text_nopunct'].astype(str)

df['reviews_text_tokenize'] = df['reviews_text_nopunct'].apply(lambda x: tokenize(x))
df.head()

Unnamed: 0,address,name,reviews.text,reviews_text_nopunct,reviews_text_tokenize
0,Riviera San Nicol 11/a,Hotel Russo Palace,pleasant 10 min walk along the sea front to th...,pleasant 10 min walk along the sea front to th...,"[pleasant, 10, min, walk, along, the, sea, fro..."
1,Riviera San Nicol 11/a,Hotel Russo Palace,really lovely hotel. stayed on the very top fl...,really lovely hotel stayed on the very top flo...,"[really, lovely, hotel, stayed, on, the, very,..."
2,Riviera San Nicol 11/a,Hotel Russo Palace,ett mycket bra hotell. det som drog ner betyge...,ett mycket bra hotell det som drog ner betyget...,"[ett, mycket, bra, hotell, det, som, drog, ner..."
3,Riviera San Nicol 11/a,Hotel Russo Palace,we stayed here for four nights in october. the...,we stayed here for four nights in october the ...,"[we, stayed, here, for, four, nights, in, octo..."
4,Riviera San Nicol 11/a,Hotel Russo Palace,we stayed here for four nights in october. the...,we stayed here for four nights in october the ...,"[we, stayed, here, for, four, nights, in, octo..."


3. Remove Stopwords

In [13]:
stopwords = nltk.corpus.stopwords.words('english')

In [14]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopwords]
    return text

In [15]:
df['reviews_text_nostop'] = df['reviews_text_tokenize'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,address,name,reviews.text,reviews_text_nopunct,reviews_text_tokenize,reviews_text_nostop
0,Riviera San Nicol 11/a,Hotel Russo Palace,pleasant 10 min walk along the sea front to th...,pleasant 10 min walk along the sea front to th...,"[pleasant, 10, min, walk, along, the, sea, fro...","[pleasant, 10, min, walk, along, sea, front, w..."
1,Riviera San Nicol 11/a,Hotel Russo Palace,really lovely hotel. stayed on the very top fl...,really lovely hotel stayed on the very top flo...,"[really, lovely, hotel, stayed, on, the, very,...","[really, lovely, hotel, stayed, top, floor, su..."
2,Riviera San Nicol 11/a,Hotel Russo Palace,ett mycket bra hotell. det som drog ner betyge...,ett mycket bra hotell det som drog ner betyget...,"[ett, mycket, bra, hotell, det, som, drog, ner...","[ett, mycket, bra, hotell, det, som, drog, ner..."
3,Riviera San Nicol 11/a,Hotel Russo Palace,we stayed here for four nights in october. the...,we stayed here for four nights in october the ...,"[we, stayed, here, for, four, nights, in, octo...","[stayed, four, nights, october, hotel, staff, ..."
4,Riviera San Nicol 11/a,Hotel Russo Palace,we stayed here for four nights in october. the...,we stayed here for four nights in october the ...,"[we, stayed, here, for, four, nights, in, octo...","[stayed, four, nights, october, hotel, staff, ..."


Now we have a clean column 'reviews_text_nostop' with remove punctuation, tokenized, and remove stopwords.
We just need to keep the column names are 'address', 'name', and 'reviews_text_nostop'.

4. Using Stemming

In [16]:
df_nostop = df[['address', 'name', 'reviews_text_nostop']]
df_nostop.head()

Unnamed: 0,address,name,reviews_text_nostop
0,Riviera San Nicol 11/a,Hotel Russo Palace,"[pleasant, 10, min, walk, along, sea, front, w..."
1,Riviera San Nicol 11/a,Hotel Russo Palace,"[really, lovely, hotel, stayed, top, floor, su..."
2,Riviera San Nicol 11/a,Hotel Russo Palace,"[ett, mycket, bra, hotell, det, som, drog, ner..."
3,Riviera San Nicol 11/a,Hotel Russo Palace,"[stayed, four, nights, october, hotel, staff, ..."
4,Riviera San Nicol 11/a,Hotel Russo Palace,"[stayed, four, nights, october, hotel, staff, ..."


In [17]:
df_nostop

Unnamed: 0,address,name,reviews_text_nostop
0,Riviera San Nicol 11/a,Hotel Russo Palace,"[pleasant, 10, min, walk, along, sea, front, w..."
1,Riviera San Nicol 11/a,Hotel Russo Palace,"[really, lovely, hotel, stayed, top, floor, su..."
2,Riviera San Nicol 11/a,Hotel Russo Palace,"[ett, mycket, bra, hotell, det, som, drog, ner..."
3,Riviera San Nicol 11/a,Hotel Russo Palace,"[stayed, four, nights, october, hotel, staff, ..."
4,Riviera San Nicol 11/a,Hotel Russo Palace,"[stayed, four, nights, october, hotel, staff, ..."
...,...,...,...
9995,3101 Coliseum Dr,Hampton Inn Hampton-newport News,"[friends, took, trip, hampton, basslights, sho..."
9996,3101 Coliseum Dr,Hampton Inn Hampton-newport News,"[check, departure, staff, friendly, profession..."
9997,3101 Coliseum Dr,Hampton Inn Hampton-newport News,"[hampton, located, quiet, street, across, hosp..."
9998,7886 Main Street,Roseberry's Inn,"[awesome, wings, favorite, garlic, parmesan, g..."


In [18]:
ps = nltk.PorterStemmer()

set up preview in panda like 100, so we can view more message

In [19]:
pd.set_option('display.max_colwidth', 100)

In [20]:
def stemming_function(nostop_text):
    text = " ".join([ps.stem(word) for word in nostop_text])
    return text

df_nostop['reviews_text_stemmed'] = df_nostop['reviews_text_nostop'].apply(lambda x: stemming_function(x))
df_nostop.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nostop['reviews_text_stemmed'] = df_nostop['reviews_text_nostop'].apply(lambda x: stemming_function(x))


Unnamed: 0,address,name,reviews_text_nostop,reviews_text_stemmed
0,Riviera San Nicol 11/a,Hotel Russo Palace,"[pleasant, 10, min, walk, along, sea, front, water, bus, restaurants, etc, hotel, comfortable, b...",pleasant 10 min walk along sea front water bu restaur etc hotel comfort breakfast good quit vari...
1,Riviera San Nicol 11/a,Hotel Russo Palace,"[really, lovely, hotel, stayed, top, floor, surprised, jacuzzi, bath, didnt, know, getting, staf...",realli love hotel stay top floor surpris jacuzzi bath didnt know get staff friendli help includ ...
2,Riviera San Nicol 11/a,Hotel Russo Palace,"[ett, mycket, bra, hotell, det, som, drog, ner, betyget, var, att, vi, fick, ett, rum, taksarna,...",ett mycket bra hotel det som drog ner betyget var att vi fick ett rum taksarna dr det endast var...
3,Riviera San Nicol 11/a,Hotel Russo Palace,"[stayed, four, nights, october, hotel, staff, welcoming, friendly, helpful, assisted, booking, t...",stay four night octob hotel staff welcom friendli help assist book ticket opera room clean comfo...
4,Riviera San Nicol 11/a,Hotel Russo Palace,"[stayed, four, nights, october, hotel, staff, welcoming, friendly, helpful, assisted, booking, t...",stay four night octob hotel staff welcom friendli help assist book ticket opera room clean comfo...


## Add some new features

We can add some new features to help the model to predict the right result. We can add a festure like the 'reviews_text_length'.

In [21]:
df_new = df_nostop[['address', 'name', 'reviews_text_stemmed']]
df_new.head()

Unnamed: 0,address,name,reviews_text_stemmed
0,Riviera San Nicol 11/a,Hotel Russo Palace,pleasant 10 min walk along sea front water bu restaur etc hotel comfort breakfast good quit vari...
1,Riviera San Nicol 11/a,Hotel Russo Palace,realli love hotel stay top floor surpris jacuzzi bath didnt know get staff friendli help includ ...
2,Riviera San Nicol 11/a,Hotel Russo Palace,ett mycket bra hotel det som drog ner betyget var att vi fick ett rum taksarna dr det endast var...
3,Riviera San Nicol 11/a,Hotel Russo Palace,stay four night octob hotel staff welcom friendli help assist book ticket opera room clean comfo...
4,Riviera San Nicol 11/a,Hotel Russo Palace,stay four night octob hotel staff welcom friendli help assist book ticket opera room clean comfo...


In [22]:
df_new['reviews_text_length'] = df_new['reviews_text_stemmed'].apply(lambda x: len(x) - x.count(" "))
df_new.head()

Unnamed: 0,address,name,reviews_text_stemmed,reviews_text_length
0,Riviera San Nicol 11/a,Hotel Russo Palace,pleasant 10 min walk along sea front water bu restaur etc hotel comfort breakfast good quit vari...,123
1,Riviera San Nicol 11/a,Hotel Russo Palace,realli love hotel stay top floor surpris jacuzzi bath didnt know get staff friendli help includ ...,131
2,Riviera San Nicol 11/a,Hotel Russo Palace,ett mycket bra hotel det som drog ner betyget var att vi fick ett rum taksarna dr det endast var...,99
3,Riviera San Nicol 11/a,Hotel Russo Palace,stay four night octob hotel staff welcom friendli help assist book ticket opera room clean comfo...,172
4,Riviera San Nicol 11/a,Hotel Russo Palace,stay four night octob hotel staff welcom friendli help assist book ticket opera room clean comfo...,172


## Vectorization

There is 3 popular way do the vectorization:
1. Count Vectorization
2. N-gram vectorizing
3. TF-IDF

I am going to use the Count Vectorization, which is more straight forward, the three method is quite same.
First, We just need the "address", "name", and "reviews_text_stemmed" from the dataframe.

In [23]:
count_vect = CountVectorizer(max_features=5000)
x_count = count_vect.fit_transform(df_new['reviews_text_stemmed'])

In [24]:
#x_features = pd.concat([df_new['reviews_text_length'], pd.DataFrame(x_count.toarray())], axis=1)
#x_features.head()

# This two line of code will let the kernel dead, 
# because the dataset is huge, and juypter notebook do not have that huge memory to handle it. 
# So we limit the max features number is 5000

In [25]:
# And then put all of that in a new dataframe.
bag_of_words = pd.DataFrame(x_count.toarray(), columns=count_vect.get_feature_names())
bag_of_words.head()

Unnamed: 0,10,100,1000,10000,101,1015,1030,10am,10min,10pm,...,zero,zimmer,zion,zip,zona,zone,zoo,zu,zum,zur
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
print(df_new.shape)
print(bag_of_words.shape)

(55912, 4)
(55912, 5000)


In [27]:
df_new.reset_index(drop=True, inplace=True)
bag_of_words.reset_index(drop=True, inplace=True)
x_features = pd.concat([df_new['reviews_text_length'], bag_of_words], axis=1)


In [28]:
print(x_features.shape)
x_features.head()

(55912, 5001)


Unnamed: 0,reviews_text_length,10,100,1000,10000,101,1015,1030,10am,10min,...,zero,zimmer,zion,zip,zona,zone,zoo,zu,zum,zur
0,123,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,131,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,99,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,172,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,172,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
x_features.isnull().sum()

reviews_text_length    0
10                     0
100                    0
1000                   0
10000                  0
                      ..
zone                   0
zoo                    0
zu                     0
zum                    0
zur                    0
Length: 5001, dtype: int64

## Explore RandomForestClassifier

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
print(dir(RandomForestClassifier))
print(RandomForestClassifier())

['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_n_features', '_estimator_type', '_get_param_names', '_get_tags', '_make_estimator', '_more_tags', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_set_oob_score', '_validate_X_predict', '_validate_data', '_validate_estimator', '_validate_y_class_weight', 'apply', 'decision_path', 'feature_importances_', 'fit', 'get_params', 'predict', 'predict_log_proba', 'predict_proba', 'score', 'set_params']
RandomForestClassifier()


Explore RandomForestClassifier through Cross-Validation

In [32]:
from sklearn.model_selection import KFold, cross_val_score

In [33]:
#rf = RandomForestClassifier(n_jobs=-1)
#k_fold = KFold(n_splits=5)
#cross_val_score(rf, x_features, df_new['name'], cv=k_fold, scoring='accuracy', n_jobs=-1)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(x_features, df_new['name'], test_size=0.20)

In [36]:
rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)

In [38]:
# Fit the model to the data.
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

print("Accuracy score: ",round((accuracy_score(y_test, y_pred)*100),2), "%")

Accuracy score:  12.57 %


The low accuracy is not surperising. We are predicting the exact hotel that customer want to go. We can get higher accuracy if we change the hotel name to some attritube of that hotel. So we can reduce the category of the hotel list.
However, the effort required for a single person to do that is beyond my limit. So we will add more steps to evalute the random forest model performance later. 


Now it is the time to using the model and make a prediction.

## Using the Model

In [127]:
test_review = 'I loved the beach, the nearby bars, the live music, and the walkable neighborhood. The weather was great and it was sunny.'

In [128]:
print(test_review)

I loved the beach, the nearby bars, the live music, and the walkable neighborhood. The weather was great and it was sunny.


In [129]:
test_review = test_review.lower()
# remove the punctuation
test_review = re.sub('[^A-Za-z0-9]+', ' ', test_review)

In [130]:
# tokenization
test_review = re.split('\W+', test_review)

In [131]:
# remove stopwords
test_review = [word for word in test_review if word not in stopwords]

In [132]:
# stemming
test_review = " ".join([ps.stem(word) for word in test_review])
test_review_copy = test_review

In [133]:
test_review = [test_review]

# Convert your test review into a vector.
X_test = count_vect.fit_transform(test_review)

In [134]:
print(test_review)

['love beach nearbi bar live music walkabl neighborhood weather great sunni ']


In [135]:
bag_of_word_predict = pd.DataFrame(X_test.toarray(), columns=count_vect.get_feature_names())
bag_of_word_predict.head()

Unnamed: 0,bar,beach,great,live,love,music,nearbi,neighborhood,sunni,walkabl,weather
0,1,1,1,1,1,1,1,1,1,1,1


In [136]:
# print "bag_of_word_predict" columns names
print(bag_of_word_predict.columns)

Index(['bar', 'beach', 'great', 'live', 'love', 'music', 'nearbi',
       'neighborhood', 'sunni', 'walkabl', 'weather'],
      dtype='object')


In [137]:
# print "bag_of_words" columns names
print(bag_of_words.columns)

Index(['10', '100', '1000', '10000', '101', '1015', '1030', '10am', '10min',
       '10pm',
       ...
       'zero', 'zimmer', 'zion', 'zip', 'zona', 'zone', 'zoo', 'zu', 'zum',
       'zur'],
      dtype='object', length=5000)


In [138]:
# add columns for "bag_of_word_predict" from "bag_of_words" and fill the missing data with 0.
col_list = (bag_of_words.append([bag_of_word_predict])).columns.tolist()

In [139]:
bag_of_word_predict = bag_of_word_predict.reindex(columns = col_list).fillna(0)
print(bag_of_word_predict.columns)
bag_of_word_predict

Index(['10', '100', '1000', '10000', '101', '1015', '1030', '10am', '10min',
       '10pm',
       ...
       'zero', 'zimmer', 'zion', 'zip', 'zona', 'zone', 'zoo', 'zu', 'zum',
       'zur'],
      dtype='object', length=5000)


Unnamed: 0,10,100,1000,10000,101,1015,1030,10am,10min,10pm,...,zero,zimmer,zion,zip,zona,zone,zoo,zu,zum,zur
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
reviews_text_length = len(test_review_copy) - test_review_copy.count(" ")
print(reviews_text_length)

63


In [165]:
bag_of_word_predict.insert(loc=0, column='reviews_text_length', value=reviews_text_length)

In [166]:
bag_of_word_predict

Unnamed: 0,reviews_text_length,10,100,1000,10000,101,1015,1030,10am,10min,...,zero,zimmer,zion,zip,zona,zone,zoo,zu,zum,zur
0,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [169]:
# Make a prediction of which hotel your review would be a review:
prediction = rf.predict(bag_of_word_predict)[0]

In [174]:
df[df['name'] == prediction][['name', 'address']].head(15)

Unnamed: 0,name,address
4744,"The Alexandrian, Autograph Collection",480 King St
4745,"The Alexandrian, Autograph Collection",480 King St
4746,"The Alexandrian, Autograph Collection",480 King St
4747,"The Alexandrian, Autograph Collection",480 King St
4748,"The Alexandrian, Autograph Collection",480 King St
4749,"The Alexandrian, Autograph Collection",480 King St
4750,"The Alexandrian, Autograph Collection",480 King St
4751,"The Alexandrian, Autograph Collection",480 King St
4752,"The Alexandrian, Autograph Collection",480 King St
4753,"The Alexandrian, Autograph Collection",480 King St


## Model review

This model is a little bit weak, because the accuracy is low. And different hotel might have different condition between different season. Using N-gram and TF-IDF might have different result. If you interesting with the project and you can check the link:
https://www.kaggle.com/gmayock/where-to-find-your-dream-vacation