<a href="https://colab.research.google.com/github/acbattles/Review_Topics/blob/master/NLP_Airbnb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Topic Modeling

Topic modeling on Denver Airbnb review data through 2018.

### Import Data and Clean

In [None]:
import numpy as np
import pandas as pd

# Load the raw Denver review export from the mounted Google Drive folder and preview it.
reviews_csv = '/content/drive/My Drive/DataScience_Projects/reviews_Denver.csv'
df = pd.read_csv(reviews_csv)
df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,177,99461681,2016-09-04,78348548,Brenna,"Great place to stay! Definitely a tiny home, b..."
1,177,101460574,2016-09-13,72125554,Rachael,Joe was very friendly and gave us all the info...
2,177,101681475,2016-09-14,21300329,Haley,Amazing property and Joe was a great host! Eve...
3,177,102963808,2016-09-19,53882399,Mike,"Joe was a great host, the description of this ..."
4,177,103447333,2016-09-22,94621370,Carol,He welcomes us when we first arrived and gave ...


In [None]:
# Count missing values per column.
df.isna().sum()

listing_id         0
id                 0
date               0
reviewer_id        0
reviewer_name      0
comments         145
dtype: int64

In [None]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Collect the index labels of reviews whose text is made up only of whitespace.
# The 'comments' column is addressed by name, so this cell keeps working when
# columns are added to or removed from the DataFrame.
blanks = [
    idx
    for idx, review in df['comments'].items()
    if isinstance(review, str) and review.isspace()   # non-string values (e.g. NaN) are skipped
]

print(len(blanks), 'blanks: ', blanks)

1 blanks:  [110981]


In [None]:
# Remove the whitespace-only reviews (row labels collected in `blanks`) in place.
df.drop(index=blanks, inplace=True)

In [None]:
# Chronological split: reviews dated through 2018 form the training set,
# reviews dated 2019 or later form the test set.
df['date'] = pd.to_datetime(df['date'])

review_year = df['date'].dt.year
df_train = df.loc[review_year <= 2018].copy()
df_test = df.loc[review_year >= 2019].copy()

In [None]:
# Report how many reviews landed on each side of the split.
print("training data size: ", df_train.shape[0])
print("testing data size: ", df_test.shape[0])

training data size:  126388
testing data size:  122343


### NLP Pre-processing

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Run every training review through the spaCy pipeline and keep the resulting Doc objects.
texts_train = df_train['comments'].tolist()
spacy_docs = [parsed for parsed in nlp.pipe(texts_train)]

In [None]:
# For each review keep the lower-cased lemma of every token that is not a
# stop word and whose original text is longer than two characters.
docs = []
for parsed in spacy_docs:
    lemmas = [tok.lemma_.lower() for tok in parsed if not tok.is_stop and len(tok.orth_) > 2]
    docs.append(lemmas)

In [None]:
#include bi-grams
import re
from gensim.models import Phrases


bigram = Phrases(docs, min_count=10)
tokens = []

# Append detected bigrams to each document (the unigrams stay in place).
# A bigram is recognised by the "_" that joins its individual words.
for doc_tokens in docs:
    joined = [tok for tok in bigram[doc_tokens] if '_' in tok]
    doc_tokens.extend(joined)
    tokens.extend(joined)



In [None]:
#create dictionary
from gensim.corpora import Dictionary

# Map every distinct token in the training documents to an integer id.
dictionary = Dictionary(docs)
vocab_before = len(dictionary)
print('Number of unique words in original documents:', vocab_before)

# Prune the vocabulary in place using the rare/common thresholds below.
dictionary.filter_extremes(no_below=5, no_above=0.1)
vocab_after = len(dictionary)
print('Number of unique words after removing rare and common words:', vocab_after)

# docs[2] is the third document; show its bag-of-words form.
example_bow = dictionary.doc2bow(docs[2])
print("Example representation of document 3:", example_bow)

Number of unique words in original documents: 31150
Number of unique words after removing rare and common words: 10368
Example representation of document 3: [(8, 1), (28, 1), (29, 1), (30, 1), (31, 1)]


In [None]:
# Convert every tokenised document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, docs))

### Run LDA model

In [None]:
#run LDA model
from gensim.models import LdaModel

# Settings for the LDA run; random_state is fixed so the fit is repeatable.
lda_settings = dict(
    num_topics=5,      # number of topics
    chunksize=500,
    passes=3,
    random_state=617,
)
model = LdaModel(corpus=corpus, id2word=dictionary, **lda_settings)

# Print the top words in each topic.
for topic, words in model.print_topics():
    print(topic, ":", words, '\n')

0 : 0.049*"check" + 0.028*"quick" + 0.021*"respond" + 0.015*"question" + 0.014*"communication" + 0.011*"quick_respond" + 0.010*"helpful" + 0.009*"apartment" + 0.009*"quickly" + 0.009*"trip" 

1 : 0.039*"highly" + 0.036*"highly_recommend" + 0.025*"amazing" + 0.021*"wonderful" + 0.018*"beautiful" + 0.016*"awesome" + 0.015*"cozy" + 0.014*"thank" + 0.013*"responsive" + 0.013*"helpful" 

2 : 0.029*"feel" + 0.019*"like" + 0.013*"airbnb" + 0.012*"amazing" + 0.009*"touch" + 0.008*"go" + 0.008*"come" + 0.008*"beautiful" + 0.008*"feel_like" + 0.007*"group" 

3 : 0.025*"distance" + 0.023*"walk_distance" + 0.017*"spot" + 0.017*"bar" + 0.015*"shop" + 0.014*"park" + 0.014*"street" + 0.014*"lot" + 0.012*"city" + 0.011*"away" 

4 : 0.031*"bed" + 0.017*"kitchen" + 0.014*"bathroom" + 0.010*"bedroom" + 0.010*"comfy" + 0.009*"night" + 0.008*"private" + 0.008*"sleep" + 0.008*"coffee" + 0.007*"parking" 



#### Topics:
The topics look like:

1. (0) Host Communication
2. (1) Recommend to others
3. (2) Good Feelings
4. (3) Location
5. (4) Accommodations

In [None]:
# Label every training review with the topic that has the largest value in the
# last column of the model output for that review.
topic_nums = []

for _text, doc in zip(texts_train, docs):
    topic_rows = np.array(model[dictionary.doc2bow(doc)])
    ascending = np.argsort(topic_rows[:, -1])       # row order sorted by the last column
    # Column 0 of the top-ranked row is the topic id (a float, since the array is float-typed).
    topic_nums.append(topic_rows[ascending[-1], 0])

df_train['topic'] = topic_nums

In [None]:
df_train.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,topic
0,177,99461681,2016-09-04,78348548,Brenna,"Great place to stay! Definitely a tiny home, b...",3.0
1,177,101460574,2016-09-13,72125554,Rachael,Joe was very friendly and gave us all the info...,1.0
2,177,101681475,2016-09-14,21300329,Haley,Amazing property and Joe was a great host! Eve...,1.0
3,177,102963808,2016-09-19,53882399,Mike,"Joe was a great host, the description of this ...",4.0
4,177,103447333,2016-09-22,94621370,Carol,He welcomes us when we first arrived and gave ...,0.0


In [None]:
# Save the topic-labelled training reviews as CSV.
# Run this cell only once the number of topics is finalized.
topics_csv = '/content/drive/My Drive/DataScience_Projects/reviews_DenverBefore2019_topics_1.0.csv'
df_train.to_csv(topics_csv)

# Classification

I want to predict the topic of reviews left in Denver in 2019, using a classifier trained on the reviews through 2018.


In [None]:
import numpy as np
import pandas as pd

# Reload the topic-labelled training reviews from the CSV on Google Drive.
topics_csv = '/content/drive/My Drive/DataScience_Projects/reviews_DenverBefore2019_topics_1.0.csv'
df_train = pd.read_csv(topics_csv)

In [None]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

# spacy.load is called below, so import spaCy here instead of relying on an
# import made in an earlier section of the notebook.
import spacy

# Punctuation characters to discard during tokenisation
punctuations = string.punctuation

# Load the same model name used in the pre-processing section
# ('en_core_web_sm'); the bare 'en' shortcut is not accepted by spaCy v3.
nlp = spacy.load('en_core_web_sm')

# Stop-word set, taken from the STOP_WORDS name imported above
stop_words = STOP_WORDS

# English language object that spacy_tokenizer uses to split text into tokens.
# NOTE(review): English() is not a trained model like the one loaded into `nlp`;
# confirm it supplies the lemmas spacy_tokenizer expects on the installed spaCy version.
parser = English()

# Tokenizer handed to the TF-IDF vectorizer
def spacy_tokenizer(sentence):
    """Split `sentence` into normalised tokens.

    Each token is replaced by its lower-cased, stripped lemma; tokens whose
    lemma is the "-PRON-" placeholder use the token's `lower_` form instead.
    Tokens found in `stop_words` or in `punctuations` are dropped.

    Note: `punctuations` is a plain string, so the punctuation check is a
    substring test. The empty string is therefore always dropped as well.

    Returns a list of token strings.
    """
    kept = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            text = token.lower_
        else:
            text = token.lemma_.lower().strip()

        # Skip stop words and punctuation
        if text in stop_words or text in punctuations:
            continue
        kept.append(text)

    return kept


# Pipeline step that normalises raw text before vectorisation
class predictors(TransformerMixin):
    """Stateless transformer that applies clean_text to every document."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each item in X."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """This transformer has no parameters, so the mapping is empty."""
        return dict()

# Minimal text normalisation shared by the pipeline's cleaner step
def clean_text(text):
    """Return `text` with leading/trailing whitespace removed and lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()

# TF-IDF vectorizer that delegates tokenisation to spacy_tokenizer
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer)

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; labels are the assigned topic numbers.
X = df_train['comments']
ylabels = df_train['topic']

# Hold out 30% of the labelled reviews for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    ylabels,
    test_size=0.3,
    random_state=617,
)

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest baseline.
# max_depth=None matches the value shown in the fitted pipeline's recorded output.
# random_state reuses the seed of the other stochastic steps so the forest is
# reproducible between runs.
classifier = RandomForestClassifier(n_estimators=1000,
                                    max_depth=None,
                                    random_state=617)

# Pipeline: clean the text, build TF-IDF features, then classify
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', classifier)])

# Fit on the training split
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x7f96328f2860>),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_wor...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_im

In [None]:
from sklearn import metrics
# Predict topics for the held-out split with the random forest pipeline
predicted = pipe.predict(X_test)

# Share of held-out reviews whose predicted topic equals the assigned topic
rf_accuracy = metrics.accuracy_score(y_test, predicted)
print("Random Forest:", rf_accuracy)

Random Forest: 0.6677743492364903


These results aren't bad, but let's see if we can do better

### SVM

In [None]:
from sklearn.svm import LinearSVC
# Linear SVM with class weights set to 'balanced'
classifier = LinearSVC(class_weight='balanced')

# Pipeline: clean the text, build TF-IDF features, then classify
svm_steps = [
    ("cleaner", predictors()),
    ('vectorizer', tfidf_vector),
    ('classifier', classifier),
]
pipe = Pipeline(svm_steps)

# Fit on the training split
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x7f9626b27ac8>),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_wor...
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x7f966d3c6f28>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 LinearSVC(C=1.0, class_wei

In [None]:
from sklearn import metrics
# Predict topics for the held-out split with the SVM pipeline
predicted = pipe.predict(X_test)

# Share of held-out reviews whose predicted topic equals the assigned topic
svm_accuracy = metrics.accuracy_score(y_test, predicted)
print("SVM Accuracy:", svm_accuracy)

SVM Accuracy: 0.8547089695914761


In [None]:
# Spot-check the SVM pipeline on one randomly drawn review.
# The review comes from df_train, so it may have been part of the fitted split.
x = np.random.randint(0, len(df_train))
sample_pred = df_train['comments'].iloc[x]   # positional lookup, independent of index labels
print(sample_pred)

# The pipeline expects an iterable of documents. A bare string would be iterated
# character by character by the 'cleaner' step, giving one prediction per letter,
# so the single review is wrapped in a list.
topic_scores = pipe.decision_function([sample_pred])[0]

# Rank the topics from highest to lowest decision score for this review.
topic_labels = pipe.named_steps['classifier'].classes_
prediction_test = topic_labels[np.argsort(topic_scores)[::-1]]

print("First guess:", prediction_test[0])
print("Second guess:", prediction_test[1])

Very warm & welcoming from the moment we got there. Equipt with all basics essentials one would need.
First guess: 4.0
Second guess: 2.0


We get much better accuracy with the SVM model.

In this random example, the model predicted topic 4 (Accommodations). We can also ask to see the next most likely topic.