### Cosine Similarity

Below is the code to calculate cosine similarity between property descriptions and customer reviews for each of the 4 groups we defined earlier. We will calculate the cosine similarity using the set of words which we used to define the 4 groups with the words that make up the description of a property.

We will classify each of the property into one of the 4 groups (Homesize, Surroundings, Amenities and Location) using their cosine similarity scores.

In [1]:
import csv
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import PunktSentenceTokenizer,RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
import warnings

In [2]:
#to ignore deprecation warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

#Please use xlsx file format to read the data
#I faced issues with csv
prop_df=pd.read_csv("C:/Users/bandi/Desktop/Text Analytics/Group Project/property_descriptions.csv")

#checking for nulls if present any
print("Number of rows with null values:")
print(prop_df.isnull().sum().sum())
prop_df=prop_df.dropna() 



Number of rows with null values:
19


In [3]:
# Removing stop words from property descriptions
from nltk.tokenize import sent_tokenize, word_tokenize
prop_df['headline'] = prop_df['headline'].apply(word_tokenize).apply(list)
stop_words = set(stopwords.words('english'))
def remove_stopwords(s):
    return [w for w in s if not w in stop_words] 
    
prop_df['headline'] =  prop_df['headline'].apply(remove_stopwords)

In [4]:
# Converting bag of words to sentence after removing stop words
def convert_to_words(s):
    return " ".join(s)
prop_df['headline'] =  prop_df['headline'].apply(convert_to_words)

## Topic 0 - Homesize

In [5]:
# Reading the attributes file
# Check into the "attributes.txt" file for the proper format
# Each attribute has to be listed in a new line.
attributes=list(line.strip() for line in open('attribute0.txt'))
attributes=" ".join(attributes)

In [6]:
# Merging attibutes to the review
# Restaurant_review is the name of the column with review text.

tempDataFrame=pd.DataFrame({'headline':[attributes]})
tempDataFrame=tempDataFrame.transpose()
description_list1=prop_df['headline']
frames = [tempDataFrame, description_list1]
result = pd.concat(frames)
result.columns = ['Property description']
result=result.reset_index()

In [7]:
# Building bag of words using frequency

vec_words = CountVectorizer(decode_error='ignore')
total_features_words = vec_words.fit_transform(result['Property description'])
print("The size of the vocabulary space:")
print(total_features_words.shape)

The size of the vocabulary space:
(189, 535)


In [8]:
# Calculating pairwise cosine similarity

subset_sparse = sparse.csr_matrix(total_features_words)
total_features_review=subset_sparse
total_features_attr=subset_sparse[0,]
similarity=1-pairwise_distances(total_features_attr,total_features_review, metric='cosine')


#Assigning the similarity score to dataframe
# similarity=np.array(similarities[0]).reshape(-1,).tolist()
similarity=pd.DataFrame(similarity)
similarity=similarity.transpose()
similarity.columns = ['similarity']
similarity=similarity.drop(similarity.index[[0]])
prop_df_0=prop_df.assign(similarity=similarity.values)

#writing to an output file
prop_df_0.to_excel("similarity_score0.xlsx",index=False)

## Topic 1 - Surroundings

In [252]:
attributes=list(line.strip() for line in open('attribute1.txt'))
attributes=" ".join(attributes)

In [253]:
tempDataFrame=pd.DataFrame({'headline':[attributes]})
tempDataFrame=tempDataFrame.transpose()
description_list1=prop_df['headline']
frames = [tempDataFrame, description_list1]
result = pd.concat(frames)
result.columns = ['Property description']
result=result.reset_index()
vec_words = CountVectorizer(decode_error='ignore')
total_features_words = vec_words.fit_transform(result['Property description'])
print("The size of the vocabulary space:")
print(total_features_words.shape)

#Calculating pairwise cosine similarity

subset_sparse = sparse.csr_matrix(total_features_words)
total_features_review=subset_sparse
total_features_attr=subset_sparse[0,]
similarity=1-pairwise_distances(total_features_attr,total_features_review, metric='cosine')


#Assigning the similarity score to dataframe
# similarity=np.array(similarities[0]).reshape(-1,).tolist()
similarity=pd.DataFrame(similarity)
similarity=similarity.transpose()
similarity.columns = ['similarity']
similarity=similarity.drop(similarity.index[[0]])
prop_df_1=prop_df.assign(similarity=similarity.values)

#writing to an output file
prop_df_1.to_excel("similarity_score1.xlsx",index=False)

The size of the vocabulary space:
(189, 535)


## Topic 2 - Amenities

In [254]:
attributes=list(line.strip() for line in open('attribute2.txt'))
attributes=" ".join(attributes)

tempDataFrame=pd.DataFrame({'headline':[attributes]})
tempDataFrame=tempDataFrame.transpose()
description_list1=prop_df['headline']
frames = [tempDataFrame, description_list1]
result = pd.concat(frames)
result.columns = ['Property description']
result=result.reset_index()
vec_words = CountVectorizer(decode_error='ignore')
total_features_words = vec_words.fit_transform(result['Property description'])
print("The size of the vocabulary space:")
print(total_features_words.shape)

#Calculating pairwise cosine similarity

subset_sparse = sparse.csr_matrix(total_features_words)
total_features_review=subset_sparse
total_features_attr=subset_sparse[0,]
similarity=1-pairwise_distances(total_features_attr,total_features_review, metric='cosine')


#Assigning the similarity score to dataframe
# similarity=np.array(similarities[0]).reshape(-1,).tolist()
similarity=pd.DataFrame(similarity)
similarity=similarity.transpose()
similarity.columns = ['similarity']
similarity=similarity.drop(similarity.index[[0]])
prop_df_2=prop_df.assign(similarity=similarity.values)

#writing to an output file
prop_df_2.to_excel("similarity_score2.xlsx",index=False)

The size of the vocabulary space:
(189, 543)


## Topic 3 - Location

In [255]:
attributes=list(line.strip() for line in open('attribute3.txt'))
attributes=" ".join(attributes)

tempDataFrame=pd.DataFrame({'headline':[attributes]})
tempDataFrame=tempDataFrame.transpose()
description_list1=prop_df['headline']
frames = [tempDataFrame, description_list1]
result = pd.concat(frames)
result.columns = ['Property description']
result=result.reset_index()
vec_words = CountVectorizer(decode_error='ignore')
total_features_words = vec_words.fit_transform(result['Property description'])
print("The size of the vocabulary space:")
print(total_features_words.shape)

#Calculating pairwise cosine similarity

subset_sparse = sparse.csr_matrix(total_features_words)
total_features_review=subset_sparse
total_features_attr=subset_sparse[0,]
similarity=1-pairwise_distances(total_features_attr,total_features_review, metric='cosine')


#Assigning the similarity score to dataframe
# similarity=np.array(similarities[0]).reshape(-1,).tolist()
similarity=pd.DataFrame(similarity)
similarity=similarity.transpose()
similarity.columns = ['similarity']
similarity=similarity.drop(similarity.index[[0]])
prop_df_3=prop_df.assign(similarity=similarity.values)

#writing to an output file
prop_df_3.to_excel("similarity_score3.xlsx",index=False)

The size of the vocabulary space:
(189, 533)
