In [2]:
import pandas as pd
# Load the reviews, parse the ISO-formatted date strings into datetimes,
# and keep only the rows that have a value in every column.
df = pd.read_csv('data/reviews.csv')
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140348 entries, 0 to 140454
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   listing_id     140348 non-null  int64         
 1   id             140348 non-null  int64         
 2   date           140348 non-null  datetime64[ns]
 3   reviewer_id    140348 non-null  int64         
 4   reviewer_name  140348 non-null  object        
 5   comments       140348 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 7.5+ MB


In [15]:
import string
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

# Shared WordNet lemmatizer instance and the set of punctuation characters
# used to filter out punctuation-only tokens.
punct = set(string.punctuation)
lemmatizer = WordNetLemmatizer()

# Merge the NLTK stopword lists for English, Norwegian and French into one set
stopwords = set().union(
    *(sw.words(language) for language in ('english', 'norwegian', 'french'))
)

def lemmatize(token, tag):
    """Lemmatize ``token`` with the WordNet part of speech implied by ``tag``.

    Only the first character of ``tag`` is inspected: 'N' selects noun,
    'V' verb, 'R' adverb and 'J' adjective. Any other leading character
    is treated as a noun.

    Returns whatever ``lemmatizer.lemmatize`` returns for the token and the
    selected WordNet part of speech.
    """
    leading = tag[0]

    if leading == 'V':
        wordnet_pos = wn.VERB
    elif leading == 'R':
        wordnet_pos = wn.ADV
    elif leading == 'J':
        wordnet_pos = wn.ADJ
    else:
        # 'N' and every unrecognised tag are lemmatized as nouns
        wordnet_pos = wn.NOUN

    return lemmatizer.lemmatize(token, wordnet_pos)

def cab_tokenizer(document):
    """Turn a document into a flat list of lemmas.

    The document is split into sentences, each sentence into tokens, and
    every token is part-of-speech tagged. Each token is lower-cased and
    stripped of surrounding whitespace, then of surrounding underscores,
    then of surrounding asterisks (in that order). Tokens that are
    stopwords, or that consist solely of punctuation characters (which
    includes tokens left empty by the stripping), are dropped. The
    remaining tokens are lemmatized using their tag.
    """
    lemmas = []

    for sentence in sent_tokenize(document):
        tagged_words = pos_tag(wordpunct_tokenize(sentence))

        for raw_token, pos in tagged_words:
            # Normalise case and trim decoration; the strip order is significant
            cleaned = raw_token.lower().strip().strip('_').strip('*')

            # Skip stopwords and punctuation-only tokens
            if cleaned in stopwords or all(ch in punct for ch in cleaned):
                continue

            lemmas.append(lemmatize(cleaned, pos))

    return lemmas

In [16]:
# Mean number of characters per review comment
comment_lengths = df['comments'].map(len)
print(comment_lengths.mean())

233.07958788155156


In [17]:
# Work on the first 1000 rows (by position) of the reviews frame
df_temp = df.iloc[:1000]

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser over unigrams and bigrams, tokenised with cab_tokenizer
tfidf_vec = TfidfVectorizer(tokenizer=cab_tokenizer, ngram_range=(1,2))
X = tfidf_vec.fit_transform(df_temp['comments'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(tfidf_vec, 'get_feature_names_out'):
    feature_names = tfidf_vec.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vec.get_feature_names()

# see the number of unique tokens produced by the vectorizer. Lots of them...
print(len(feature_names))
print(feature_names)

 accommodation', 'way allow', 'way around', 'way back', 'way could', 'way described', 'way ensure', 'way friendly', 'way get', 'way good', 'way help', 'way host', 'way inform', 'way know', 'way location', 'way make', 'way meet', 'way need', 'way overall', 'way possible', 'way property', 'way save', 'way sneak', 'way stay', 'way travel', 'wc', 'wc bedroom', 'wealth', 'wealth information', 'wealth knowledge', 'wealth suggestion', 'wear', 'wear comfortable', 'weather', 'weather enjoy', 'weather nice', 'weather outside', 'weather rest', 'weather take', 'weather warm', 'weather week', 'web', 'web site', 'website', 'website hide', 'wee', 'wee hour', 'weedy', 'weedy path', 'week', 'week absolutely', 'week case', 'week end', 'week gunn', 'week hospitable', 'week melissa', 'week place', 'week rain', 'week room', 'week sad', 'week sleep', 'week work', 'week working', 'weekend', 'weekend away', 'weekend book', 'weekend city', 'weekend getaway', 'weekend glenn', 'weekend issue', 'weekend katrine',

In [26]:
from sklearn.cluster import KMeans

rs = 42
# K means clustering using the term vector.
# n_init is pinned to 10 (the historical default): scikit-learn 1.4 changed
# the default to 'auto', which would silently alter the resulting clusters.
kmeans = KMeans(n_clusters=7, n_init=10, random_state=rs).fit(X)

In [25]:
# function to visualise text cluster. Useful for the assignment too :)
def visualise_text_cluster(n_clusters, cluster_centers, terms, num_word = 5):
    """Print the highest-weighted terms of each cluster centroid.

    -- Params --
    n_clusters: number of clusters to report; clusters 0 .. n_clusters-1 are printed
    cluster_centers: 2-D array of centroid weights (one row per cluster, one
        column per term), e.g. the cluster centers of a fitted KMeans or
        other centroid-based clustering
    terms: terms used for clustering, indexable by column position
    num_word: number of terms to show per cluster. Change as you please.

    Prints one line per cluster in the form
    ``Top terms for cluster <i>: term, term, ...`` and returns None.
    """
    # for every centroid, column indices sorted by descending weight
    ordered_centroids = cluster_centers.argsort()[:, ::-1]

    for cluster in range(n_clusters):
        print("Top terms for cluster {}:".format(cluster), end=" ")
        # honour num_word rather than a hard-coded 5
        for term_idx in ordered_centroids[cluster, :num_word]:
            print(terms[term_idx], end=', ')
        print()
        
# call it
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
cluster_terms = (
    tfidf_vec.get_feature_names_out()
    if hasattr(tfidf_vec, 'get_feature_names_out')
    else tfidf_vec.get_feature_names()
)
visualise_text_cluster(kmeans.n_clusters, kmeans.cluster_centers_, cluster_terms)

Top terms for cluster 0: view dining, leave thank, 
Top terms for cluster 1: apartement situate, norway highly, 
Top terms for cluster 2: great visit, place flat, 
Top terms for cluster 3: card room, rund, 
Top terms for cluster 4: und die, water could, 
Top terms for cluster 5: useful stay, go jogging, 
Top terms for cluster 6: gracious, apartement situate, 


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
# Bag-of-words and TF-IDF vectorisers that rely on scikit-learn's built-in
# word analyser and English stop-word list instead of the custom tokenizer.
shared_vectorizer_options = {'analyzer': 'word', 'stop_words': 'english'}
countvectorizer = CountVectorizer(**shared_vectorizer_options)
tfidfvectorizer = TfidfVectorizer(**shared_vectorizer_options)

# Raw term-count matrix for the sampled comments
count_wm = countvectorizer.fit_transform(df_temp['comments'])