In [1]:
!pip install --quiet --disable-pip-version-check sentence-transformers > /dev/null
!pip install --quiet --disable-pip-version-check afinn > /dev/null
!pip install --quiet --disable-pip-version-check vaderSentiment > /dev/null
!pip install --quiet --disable-pip-version-check textblob > /dev/null


In [2]:

# Standard Libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# NLP Libraries
import ast
import spacy
from gensim.models import Word2Vec
from nltk.corpus import wordnet

import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from afinn import Afinn
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# Download NLTK resources
nltk.download('wordnet')
nltk.download('sentiwordnet')

# Text Processing Libraries
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


# Additional Libraries
from tqdm.auto import tqdm
import time

from google.colab import drive
drive.mount('/content/drive')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


Mounted at /content/drive


___________________________________________________________________________________________________

# **Load Processed Dataset**
___________________________________________________________________________________________________


In [None]:
main_data_path = "/content/drive/MyDrive/Colab Notebooks/research/main_data/"


In [None]:
def load_list_from_file(file_path):
    """
    Load a list from a text file.

    Inputs:
        file_path (str): The path to the file containing the list.

    Returns:
        list: The loaded list.
    """
    with open(file_path, 'r') as file:
        lst = [line.strip() for line in file]
    return lst

# Load all_seed_words
all_seed_words = load_list_from_file(main_data_path + "all_seeds_list.txt")


In [None]:
len(all_seed_words),all_seed_words

(50,
 ['bungalow',
  'cottage',
  'apartment',
  'flat',
  'balcony',
  'abode',
  'condo',
  'townhouse',
  'pet-friendly',
  'smoke-free',
  'bedroom',
  'amenities',
  'toilet',
  'bathroom',
  'kitchen',
  'wifi',
  'wardrobe',
  'living room',
  'tv',
  'air conditioning',
  'neighborhood',
  'park',
  'restaurants',
  'shops',
  'city',
  'local shops',
  'gym',
  'view',
  'convenience store',
  'Community',
  'public transportation',
  'parking',
  'accessible parking',
  'bus stop',
  'train station',
  'tram station',
  'metro station',
  'airport',
  'laundry',
  'Mobility',
  'host',
  'cancellation',
  'hospitality',
  'communication',
  'check-in',
  'check-out',
  'instruction',
  'responsive',
  'approachable',
  'condition'])

___________________________________________________________________________________________________

# **Feature Engineering** <a name="feature"></a>
___________________________________________________________________________________________________


## **Sentiment Score Computation**
___________________________________________________________________________________________________

(toward each topic word)

In [None]:
# Initialize sentiment analyzers
afinn = Afinn()
vader_analyzer = SentimentIntensityAnalyzer()

def get_sentiment_sentiwordnet(word):
    """
    Get sentiment score using SentiWordNet lexicon.

    Inputs:
    - word (str): Input word.

    Returns:
    - float: Sentiment score between -1 (negative) and +1 (positive).
    """
    synsets = wn.synsets(word)
    if not synsets:
        return 0.0
    synset = synsets[0]  # Take the first synset
    swn_synset = swn.senti_synset(synset.name())
    return swn_synset.pos_score() - swn_synset.neg_score()

def get_sentiment_afinn(text):
    """
    Get sentiment score using AFINN lexicon.

    Inputs:
    - text (str): Input text.

    Returns:
    - float: Sentiment score between -1 (most negative) and +1 (most positive).
    """
    afinn_score = afinn.score(text)
    scaled_score = afinn_score / 5.0  # Scale the score to range from -1 to 1
    return scaled_score

def get_sentiment_vader(text):
    """
    Get sentiment score using VADER sentiment analyzer.

    Inputs:
    - text (str): Input text.

    Returns:
    - dict: Dictionary containing sentiment scores (positive, negative, neutral, compound).
    """
    return vader_analyzer.polarity_scores(text)

def get_sentiment_textblob(text):
    """
    Get sentiment score using TextBlob sentiment analyzer.

    Inputs:
    - text (str): Input text.

    Returns:
    - float: Sentiment score between -1 (negative) and +1 (positive).
    """
    return TextBlob(text).sentiment.polarity


def analyze_sentiment_hybrid(text, topic_word=None):
    """
    Function to detect sentiment towards a specific topic word. Returns -1, 0, or 1.

    Inputs:
        text (str): Input text to analyze sentiment.
        topic_word (str): The topic word for which sentiment is to be evaluated.

    Returns:
        int: Sentiment label (-1 for negative, 0 for neutral, 1 for positive).
    """
    # Hybrid sentiment analysis logic
    sentiment_scores = [
        get_sentiment_afinn(text),
        get_sentiment_vader(text)['compound'],
        get_sentiment_textblob(text)
    ]

    avg_score = sum(sentiment_scores) / len(sentiment_scores)

    return avg_score



## **Detecting Similar Words**
___________________________________________________________________________________________________


In [None]:


def find_similar_words_with_similarity_score(tokenized_column_elements, seed_words, min_similarity_score=0.8, topn=5, vector_size=100, window=5, min_count=1, sg=1):
    """
    Finds similar words, synonyms, and antonyms for each seed word from the tokenized comments in a DataFrame column.

    Inputs:
    - tokenized_column_elements (list): List of tokenized comments.
    - seed_words (list): List of seed words for which similar words will be found.
    - min_similarity_score (float): Minimum similarity score threshold for filtering similar words.
    - topn (int): Number of similar words to retrieve for each seed word.
    - vector_size (int): Dimensionality of the word vectors.
    - window (int): Maximum distance between the current and predicted word within a sentence.
    - min_count (int): Ignores all words with total frequency lower than this.
    - sg (int): Training algorithm: 1 for skip-gram; otherwise CBOW.

    Returns:
    - dict: Dictionary containing filtered similar words, synonyms, and antonyms for each seed word.
    """

    # Flatten the list of tokenized sentences into a single list of tokens
    tokenized_comments = [sublist for sublist in tokenized_column_elements]

    # print(tokenized_comments)

    # TrainING Word2Vec model on the tokenized comments
    model = Word2Vec(tokenized_comments, vector_size=vector_size, window=window, min_count=min_count, sg=sg)

    # Function to get similar words for a given word
    def get_similar_words(word, model, topn=topn):
        if word in model.wv:
            similar_words = model.wv.most_similar(word, topn=topn)
            return similar_words
        else:
            return []

    # Function to get synonyms and antonyms for a word
    def get_synonyms_antonyms(word):
        synonyms = set()
        antonyms = set()
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                synonym = lemma.name()
                if any(token in tokenized_column_elements for token in synonym.split('_')):
                    synonyms.add(synonym)
                    if lemma.antonyms():
                        antonym = lemma.antonyms()[0].name()
                        if any(token in tokenized_column_elements for token in antonym.split('_')):
                            antonyms.add(antonym)
        return list(synonyms), list(antonyms)

    # Function to filter similar words based on minimum similarity score
    def filter_similar_words(similar_words, similar_words_dict, min_similarity_score):
        filtered_similar_words = [(similar_word, score) for similar_word, score in similar_words
                                  if similar_word not in similar_words_dict and score >= min_similarity_score]
        return filtered_similar_words

    # Finding similar words, synonyms, and antonyms for each seed word in the list
    similar_words_dict = {}
    for word in seed_words:
        similar_words = get_similar_words(word, model)
        synonyms, antonyms = get_synonyms_antonyms(word)
        similar_words_with_similarity = [(synonym, model.wv.similarity(word, synonym)) for synonym in synonyms if synonym in model.wv] + [(antonym, model.wv.similarity(word, antonym)) for antonym in antonyms if antonym in model.wv] + [(similar_word, similarity) for similar_word, similarity in similar_words if similar_word in model.wv]

        # Filtering out the seed word from the similar words
        filtered_similar_words = filter_similar_words(similar_words_with_similarity, [word], min_similarity_score)

        similar_words_dict[word] = filtered_similar_words

    return similar_words_dict





## **Profile and Interaction Generator**
___________________________________________________________________________________________________


### **Detecting the topic words**


In [None]:
def extract_topic_sentences(review_sentences, filtered_similar_words_dict):
    """
    Extracts sentences containing topic words and their similar words from the review sentences.

    Inputs:
        review_sentences (list): List of sentences in the review.
        filtered_similar_words_dict (dict): Dictionary where keys are topic words and values are lists of similar words with similarity scores.

    Returns:
        dict: A dictionary where keys are topic words and values are lists of tuples containing the sentence and similarity score.
    """
    topic_sentences_dict = {word: [] for word in filtered_similar_words_dict.keys()}
    for sentence in review_sentences:
        for word, similar_words in filtered_similar_words_dict.items():
            if word in sentence:  # Prioritizing the topic word
                topic_sentences_dict[word].append((sentence, 1))
                break  # Move to the next sentence after finding the topic word

            elif similar_words:
                for similar_word, similarity_score in similar_words:
                    if similar_word in sentence:
                        topic_sentences_dict[word].append((sentence, similarity_score))
                        break  # Move to the next sentence after finding a similar word
    return topic_sentences_dict


### **Profile Generator**

In [None]:


def generate_profile(df, all_seed_words, is_user=True):
    """
    Generate a profile for users or properties based on detected topic words and sentiment analysis.

    Inputs:
    - df (DataFrame): DataFrame containing user or property data.
    - all_seed_words (list): List of seed words for detecting topic words.
    - is_user (bool): True if processing user data, False if processing property data.

    Returns:
    - DataFrame: Profile DataFrame containing sentiment scores for each topic word and user or property.
    """
    profile = {}  # to store profile information
    filtered_similar_words_dict = find_similar_words_with_similarity_score(df['preprocessed_tokenized_word'].tolist(), all_seed_words, min_similarity_score=0.9)

    if is_user:
        tokenized_sent_lemmatized_column = 'review_lemmatized_tokenized_sent'
        col_name = 'user_id'
    else:
        tokenized_sent_lemmatized_column = 'description_lemmatized_tokenized_sent'
        col_name = 'property_id'

    # Extracting all the entries of the 'user_id' column
    id_list = df[col_name]
    id_df = id_list.to_frame()

    # Renaming the column to atch the profile type
    id_df.columns = [col_name]


    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
        id = row[col_name]
        tokenized_description = row[tokenized_sent_lemmatized_column]

        # Add a new entry for the user or property if it doesn't exist
        if is_user == True:
            profile[index] = {}
        elif id not in profile:
            profile[index] = {}

        # Extracting topic words and their associated sentences
        dict_detected_topic_word_list_sentence = extract_topic_sentences(tokenized_description, filtered_similar_words_dict)

        for topic_word, sent_list in dict_detected_topic_word_list_sentence.items():
            if not sent_list:
                # Assigning sentiment score of 0 if no sentences are found
                sentiment = 0 # 0 is neutral stance toward the topic word

                # Update profile with sentiment
                if topic_word not in profile[index]:
                    profile[index][topic_word] = []
                profile[index][topic_word].append(sentiment)
            else:
                # Iterating over each sentence in the list of sentences
                for sent in sent_list:

                    # obtaining the similarity score for the detected word
                    similarity_score = sent[1]

                    # Analyzing sentiment and multiply by similarity score
                    sentiment = analyze_sentiment_hybrid(sent[0], topic_word) * similarity_score

                    # Update profile with sentiment
                    if topic_word not in profile[index]:
                        profile[index][topic_word] = []
                    profile[index][topic_word].append(sentiment)

    # Calculating the average sentiment score for each topic word in each user or property
    for idx, topics in profile.items():
        for topic, scores in topics.items():
            profile[idx][topic] = sum(scores) / len(scores)

    # Converting profile dictionary to DataFrame
    profile_df = pd.DataFrame.from_dict(profile, orient='index')
    profile_df.fillna(0, inplace=True)
    profile_df.rename_axis('index', inplace=True)

    # Concatenate id_df with existing_df, adding id column for index
    result_df = pd.concat([id_df, profile_df], axis=1)

    result_df.set_index([col_name], inplace=True)

    return result_df


In [None]:
def get_unique_counts(df):
    """
    Get the unique counts of user_id and property_id in a DataFrame.

    Inputs:
        df (DataFrame): DataFrame containing user reviews.

    Returns:
        dict: Dictionary containing the counts of unique user_id and property_id.
    """
    unique_counts = {
        'unique_user_ID': df['user_id'].nunique(),
        'unique_property_ID': df['property_id'].nunique()
    }

    return unique_counts

# ===================================================================================================
# ===================================================================================================
def count_unique_elements(df):
    """
    Count the unique elements in each column of a DataFrame.

    Inputs:
        df (DataFrame): The input DataFrame.

    Returns:
        dict: A dictionary where keys are column names and values are dictionaries
              containing the counts of unique elements in that column.
    """
    unique_counts = {}
    for column in df.columns:
        counts = df[column].value_counts().to_dict()
        unique_counts[column] = counts
    return unique_counts

# ===================================================================================================
# ===================================================================================================
def save_profile(profile_df, destination, mode='w', include_index=True):
    """
    Save the profile DataFrame to a CSV file.

    Inputs:
    - profile_df (DataFrame): Profile DataFrame to be saved.
    - destination (str): Destination path for the CSV file.
    - mode (str): File writing mode. Default is 'w' (overwrite), use 'a' to append.
    - include_index (bool): Whether to include the DataFrame index in the CSV file. Default is True.
    """
    profile_df.to_csv(destination, mode=mode, index=include_index)


def unique_counts(df):
    """
    Count the occurrences of each unique value in the DataFrame.

    Inputs:
        df (DataFrame): The input DataFrame.

    Returns:
        DataFrame: A DataFrame containing the count of each unique value.
    """
    flattened_df = df.stack()  # Flatten the DataFrame
    unique_value_counts = flattened_df.value_counts()  # Count occurrences of each unique value
    return unique_value_counts


# ===================================================================================================
# ===================================================================================================


def apply_literal_eval_to_column(df, column):
    """
    Apply ast.literal_eval() to a DataFrame column.

    Input Params:
    - df (DataFrame): DataFrame containing the column to apply literal_eval to.
    - column (str): Name of the column to apply literal_eval to.

    Returns:
    - None
    """
    df[column] = df[column].apply(ast.literal_eval)


### **Interaction Extractor**

In [None]:

def extract_user_property_interaction(user_reviews, unique_property_ids):
    """
    Detect user-property interactions based on user reviews.

    Inputs:
        user_reviews (DataFrame): DataFrame containing user reviews with 'user_id' and 'property_id' columns.
        unique_property_ids (list): List of unique property_ids to maintain uniformity.

    Returns:
        DataFrame: DataFrame representing user-property interactions.
    """
    # list to hold dictionaries representing user-property interactions
    interactions = []

    for _, row in user_reviews.iterrows():
        user_id = row['user_id']
        property_id = row['property_id']

        # Creating a dictionary representing the interaction and append it to the list
        interaction = {'user_id': user_id}
        for prop_id in unique_property_ids:
            if prop_id == property_id:
                interaction[prop_id] = 1
            else:
                interaction[prop_id] = 0
        interactions.append(interaction)

    # Creating DataFrame from the list of interactions
    user_property_df = pd.DataFrame(interactions)
    user_property_df.set_index('user_id', inplace=True)

    return user_property_df


## **Property Description**
___________________________________________________________________________________________________

Profile Creation, Backup and Clear it from RAM

In [None]:
# Apply ast.literal_eval() on the loaded  property_df  to the retain the list as list not string
property_reviews = pd.read_csv(os.path.join(main_data_path, "processed_property_df.csv"))

apply_literal_eval_to_column(property_reviews, 'preprocessed_tokenized_word')
apply_literal_eval_to_column(property_reviews, 'description_lemmatized_tokenized_sent')


property_reviews.head(10)

Unnamed: 0,property_id,description,ratings,preprocessed_description,description_lemmatized_tokenized_sent,preprocessed_tokenized_word
0,241032,make your self at home in this charming one-be...,95.0,self home charming bedroom apartment centrally...,[make your self at home in this charming one-b...,"[self, home, charming, bedroom, apartment, cen..."
1,953595,chemically sensitive? we've removed the irrita...,96.0,chemically sensitive ve remove irritants trigg...,"[chemically sensitive?, we've removed the irri...","[chemically, sensitive, ve, remove, irritants,..."
2,3308979,new modern house built in 2013. spectacular s...,97.0,new modern house build spectacular sunset wate...,[new modern house built in 2013. spectacular ...,"[new, modern, house, build, spectacular, sunse..."
3,7421966,a charming apartment that sits atop queen anne...,,charming apartment sit atop queen anne hill no...,[a charming apartment that sits atop queen ann...,"[charming, apartment, sit, atop, queen, anne, ..."
4,278830,cozy family craftman house in beautiful neighb...,92.0,cozy family craftman house beautiful neighborh...,[cozy family craftman house in beautiful neigh...,"[cozy, family, craftman, house, beautiful, nei..."
5,5956968,we're renting out a small private unit of one ...,95.0,rent small private unit seattle beautiful hous...,[we're renting out a small private unit of one...,"[rent, small, private, unit, seattle, beautifu..."
6,1909058,enjoy a quiet stay in our comfortable 1915 cra...,99.0,enjoy quiet stay comfortable craftsman bungalo...,[enjoy a quiet stay in our comfortable 1915 cr...,"[enjoy, quiet, stay, comfortable, craftsman, b..."
7,856550,"our tiny cabin is private , very quiet and com...",97.0,tiny cabin private quiet comfortable spot bus ...,"[our tiny cabin is private , very quiet and co...","[tiny, cabin, private, quiet, comfortable, spo..."
8,4948745,"nestled in the heart of the city, this space i...",97.0,nestle heart city space turn century charm mee...,"[nestled in the heart of the city, this space ...","[nestle, heart, city, space, turn, century, ch..."
9,2493658,"beautiful apartment in an extremely safe, quie...",97.0,beautiful apartment extremely safe quiet pedes...,"[beautiful apartment in an extremely safe, qui...","[beautiful, apartment, extremely, safe, quiet,..."


In [None]:
unique_property_ids = property_reviews['property_id'].unique().tolist()

In [None]:
property_profile =generate_profile(property_reviews,all_seed_words,is_user = False)

Processing rows: 100%|██████████| 3818/3818 [00:23<00:00, 165.35it/s]


In [None]:
property_profile

Unnamed: 0_level_0,bungalow,cottage,apartment,flat,balcony,abode,condo,townhouse,pet-friendly,smoke-free,...,host,cancellation,hospitality,communication,check-in,check-out,instruction,responsive,approachable,condition
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
241032,0.0,0.0,0.264358,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
953595,0.0,0.0,0.815333,0.737533,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
3308979,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
7421966,0.0,0.0,0.685822,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
278830,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.149888,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8101950,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.913615,0.0
8902327,0.0,0.0,0.322265,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
10267360,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0
9604740,0.0,0.0,0.000000,0.000000,0.425383,0.0,0.139867,0.0,0.0,0.0,...,0.0,0.094202,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0


In [None]:
profile_file_path = os.path.join(main_data_path, "property_profile.csv")

# Save property_profile to CSV and Clear property_profile from memory
property_profile.to_csv(profile_file_path, mode='w', index=True)
del property_profile,property_reviews

## **User_Reviews**
___________________________________________________________________________________________________


### **Testing Dataset:**

In [None]:
# Apply ast.literal_eval() on the loaded  test_review  to the retain the list as list not string
test_review = pd.read_csv(os.path.join(main_data_path, "processed_test_review.csv"))

apply_literal_eval_to_column(test_review, 'preprocessed_tokenized_word')
apply_literal_eval_to_column(test_review, 'review_lemmatized_tokenized_sent')

test_review

Unnamed: 0,property_id,user_id,review,preprocessed_review,review_lemmatized_tokenized_sent,preprocessed_tokenized_word
0,6044106,27099102,we loved staying at christine's place! the vie...,love stay christine place view phenomenal nice...,"[we loved staying at christine's place!, the v...","[love, stay, christine, place, view, phenomena..."
1,3143227,5098623,spring street center was the perfect place for...,spring street center perfect place stay seattl...,[spring street center was the perfect place fo...,"[spring, street, center, perfect, place, stay,..."
2,3968416,1168337,another wonderful stay with carolyn and martin...,wonderful stay carolyn martin,[another wonderful stay with carolyn and marti...,"[wonderful, stay, carolyn, martin]"
3,113951,767996,jill was wonderful. she called to make sure we...,jill wonderful call sure arrive ok sure need c...,"[jill was wonderful., she called to make sure ...","[jill, wonderful, call, sure, arrive, ok, sure..."
4,1432713,5295478,we had a great stay at mr. z's - the location ...,great stay mr z location residential capitol h...,[we had a great stay at mr. z's - the location...,"[great, stay, mr, z, location, residential, ca..."
...,...,...,...,...,...,...
16773,143580,5471216,jana's home was very close to a bus-stop that ...,jana home close bus stop take straight downtow...,[jana's home was very close to a bus-stop that...,"[jana, home, close, bus, stop, take, straight,..."
16774,5682,352959,"the place was great, it's very easily accessib...",place great easily accessible downtown clean h...,"[the place was great, it's very easily accessi...","[place, great, easily, accessible, downtown, c..."
16775,58503,28284877,terrific location and spacious place to stay. ...,terrific location spacious place stay hosts gr...,[terrific location and spacious place to stay....,"[terrific, location, spacious, place, stay, ho..."
16776,5020861,43454907,christen was a great host! the bed was comfort...,christen great host bed comfortable food coffe...,"[christen was a great host!, the bed was comfo...","[christen, great, host, bed, comfortable, food..."


In [None]:
test_user_profile =  generate_profile(test_review, all_seed_words)
test_user_profile

Processing rows: 100%|██████████| 16778/16778 [00:33<00:00, 497.04it/s]


Unnamed: 0_level_0,bungalow,cottage,apartment,flat,balcony,abode,condo,townhouse,pet-friendly,smoke-free,...,host,cancellation,hospitality,communication,check-in,check-out,instruction,responsive,approachable,condition
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27099102,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0
5098623,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0
1168337,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0
767996,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0
5295478,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5471216,0.0,0.0,0.0000,0.0,0.0,0.134742,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.8949,0.0,0.0,0.000000,0.0,0.0,0.0
352959,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0
28284877,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0
43454907,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.752933,0.0,0.0,0.0000,0.0,0.0,0.000000,0.0,0.0,0.0


In [None]:
# count_unique_elements(test_user_profile)

In [None]:
test_user_profile_file_path = os.path.join(main_data_path, "test_user_profile.csv")


# Save test_user_profile to CSV and Clear it from memory
save_profile(test_user_profile,test_user_profile_file_path, mode='w', include_index=True)
del test_user_profile

In [None]:
test_user_property_interaction = extract_user_property_interaction(test_review,unique_property_ids)
test_user_property_interaction

Unnamed: 0_level_0,241032,953595,3308979,7421966,278830,5956968,1909058,856550,4948745,2493658,...,1844791,6120046,262764,8578490,3383329,8101950,8902327,10267360,9604740,10208623
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27099102,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5098623,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1168337,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
767996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5295478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5471216,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
352959,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28284877,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43454907,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
get_unique_counts(test_review)

{'unique_user_ID': 16283, 'unique_property_ID': 2484}

In [None]:
unique_counts(test_user_property_interaction)


0    64041626
1       16778
dtype: int64

In [None]:
test_user_prop_interaction_file_path = os.path.join(main_data_path, "test_user_prop_interaction.csv")


# Save test_user_property_interaction to CSV and Clear it from memory
save_profile(test_user_property_interaction,test_user_prop_interaction_file_path, mode='w', include_index=True)
del test_user_property_interaction,test_review

### **Validation Dataset:**

In [None]:
# Apply ast.literal_eval() on the loaded  property_df  to the retain the list as list not string
valid_review = pd.read_csv(os.path.join(main_data_path, "processed_valid_review.csv"))
apply_literal_eval_to_column(valid_review, 'preprocessed_tokenized_word')
apply_literal_eval_to_column(valid_review, 'review_lemmatized_tokenized_sent')


In [None]:
valid_user_profile =  generate_profile(valid_review, all_seed_words)

Processing rows: 100%|██████████| 8390/8390 [00:24<00:00, 335.98it/s]


In [None]:
# count_unique_elements(eval_user_profile)

In [None]:
valid_user_profile_file_path = os.path.join(main_data_path, "valid_user_profile.csv")


# Save valid_user_profile to CSV and Clear it from memory
save_profile(valid_user_profile,valid_user_profile_file_path, mode='w', include_index=True)
del valid_user_profile

In [None]:
valid_user_property_interaction =extract_user_property_interaction(valid_review,unique_property_ids)
valid_user_property_interaction

Unnamed: 0_level_0,241032,953595,3308979,7421966,278830,5956968,1909058,856550,4948745,2493658,...,1844791,6120046,262764,8578490,3383329,8101950,8902327,10267360,9604740,10208623
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19136870,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8631871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
155626,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17639945,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3188802,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39176653,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20956757,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23180352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10433157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# unique_proportion(eval_user_property_interaction)

In [None]:
valid_user_prop_interaction_file_path = os.path.join(main_data_path, "eval_user_prop_interaction.csv")


# Save valid_user_property_interaction to CSV and Clear it from memory
save_profile(valid_user_property_interaction,valid_user_prop_interaction_file_path,include_index= True)
del valid_user_property_interaction,valid_review

### **Training Dataset:**

In [None]:
# Apply ast.literal_eval() on the loaded  property_df  to the retain the list as list not string
train_review = pd.read_csv(os.path.join(main_data_path, "processed_train_review.csv"))

apply_literal_eval_to_column(train_review, 'preprocessed_tokenized_word')
apply_literal_eval_to_column(train_review, 'review_lemmatized_tokenized_sent')
train_review

Unnamed: 0,property_id,user_id,review,preprocessed_review,review_lemmatized_tokenized_sent,preprocessed_tokenized_word
0,6793490,9129697,rob was very accommodating and accepted a book...,rob accommodate accept book minute helpful all...,[rob was very accommodating and accepted a boo...,"[rob, accommodate, accept, book, minute, helpf..."
1,1472532,10977089,sid and neha were awesome. they really helped ...,sid neha awesome help jam quick responsive go ...,"[sid and neha were awesome., they really helpe...","[sid, neha, awesome, help, jam, quick, respons..."
2,3449059,27450730,i was in seattle for business the week of marc...,seattle business week march time city want sta...,[i was in seattle for business the week of mar...,"[seattle, business, week, march, time, city, w..."
3,2737012,10922774,we loved our time here. the studio was so cozy...,love time studio cozy bed amazing sleep great ...,"[we loved our time here., the studio was so co...","[love, time, studio, cozy, bed, amazing, sleep..."
4,3970736,31037410,deborah made us feel very welcome and comforta...,deborah feel welcome comfortable apartment nic...,[deborah made us feel very welcome and comfort...,"[deborah, feel, welcome, comfortable, apartmen..."
...,...,...,...,...,...,...
58720,456040,2084798,we enjoyed our stay! it was easy to find ...,enjoy stay easy find close apartment roomy com...,"[we enjoyed our stay!, it was easy to find and...","[enjoy, stay, easy, find, close, apartment, ro..."
58721,7561333,254465,"jonathan was communicative and available, gave...",jonathan communicative available give perfect ...,"[jonathan was communicative and available, gav...","[jonathan, communicative, available, give, per..."
58722,7430926,7859809,audrey did a great job ensuring our stay was c...,audrey great job ensure stay comfortable accom...,[audrey did a great job ensuring our stay was ...,"[audrey, great, job, ensure, stay, comfortable..."
58723,1392332,20573874,we had a wonderful experience and loved our ti...,wonderful experience love time spend house ups...,[we had a wonderful experience and loved our t...,"[wonderful, experience, love, time, spend, hou..."


In [None]:
train_user_profile =  generate_profile(train_review, all_seed_words)
train_user_profile

Processing rows: 100%|██████████| 58725/58725 [01:53<00:00, 518.84it/s]


Unnamed: 0_level_0,bungalow,cottage,apartment,flat,balcony,abode,condo,townhouse,pet-friendly,smoke-free,...,host,cancellation,hospitality,communication,check-in,check-out,instruction,responsive,approachable,condition
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9129697,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
10977089,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.406511,0.0,0.0
27450730,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
10922774,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
31037410,0.0,0.0,0.618467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2084798,0.0,0.0,0.436867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
254465,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.546189,0.000000,0.0,0.0
7859809,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
20573874,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.508189,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0


In [None]:
train_user_profile_file_path = os.path.join(main_data_path, "train_user_profile.csv")


# Save train_user_profile to CSV and Clear it from memory
save_profile(train_user_profile,train_user_profile_file_path, mode='w', include_index=True)
del train_user_profile

In [None]:


get_unique_counts(train_review)

{'unique_user_ID': 53817, 'unique_property_ID': 3078}

In [None]:
train_user_property_interaction =extract_user_property_interaction(train_review,unique_property_ids)
train_user_property_interaction

Unnamed: 0_level_0,241032,953595,3308979,7421966,278830,5956968,1909058,856550,4948745,2493658,...,1844791,6120046,262764,8578490,3383329,8101950,8902327,10267360,9604740,10208623
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9129697,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10977089,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27450730,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10922774,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31037410,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2084798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
254465,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7859809,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20573874,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
unique_counts(train_user_property_interaction)

0    224153325
1        58725
dtype: int64

In [None]:
# unique_proportion(train_user_property_interaction)

In [None]:
train_user_prop_interaction_file_path = os.path.join(main_data_path, "train_user_prop_interaction.csv")


# Save train_user_property_interaction to CSV and Clear train_user_property_interaction from memory
save_profile(train_user_property_interaction,train_user_prop_interaction_file_path, mode='w', include_index= True)
del train_user_property_interaction,train_review