In [1]:
#
# Working with dataframes
import pandas as pd

# Maths; basic library
import numpy as np

# Let's just ignore warnings
import warnings
warnings.filterwarnings("ignore") 

# time helps us keep track of how fast our script is
import time

# This is good for working with system files and folders 
import os 

# For doing out-of-the-box sentiment analysis 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer() # Creating a reference to the object for easy use

# we use Python's langdetect to detect languages present in the comments 
from langdetect import detect

# Regex is always handy when working with text data 
import re

# API from Google for translating sentences to English, so we can use them in Vader
from googletrans import Translator
translator = Translator() # Creating a reference to the object for easy use 

In [55]:
#
def fill_nans(df,feature,filling="mean"):
    '''
    Fills NaNs in one column of a dataframe, modifying the dataframe in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to modify; the column ``feature`` is reassigned on this object.
    feature : str
        Name of the column whose missing values are replaced.
    filling : str or scalar, default "mean"
        "mean" or "median" fills with that statistic of the column;
        any other value is used directly as the fill value.

    Returns
    -------
    None
        The result is written back into ``df[feature]``.
    '''
    if filling == "median":
        fill_value = df[feature].median()
    elif filling == "mean":
        fill_value = df[feature].mean()
    else:
        fill_value = filling

    # Always assign the filled column back. Calling fillna(..., inplace=True) on
    # df[feature] is a chained in-place operation that acts on a temporary under
    # pandas Copy-on-Write and would leave df unchanged.
    df[feature] = df[feature].fillna(fill_value)
        
def language_detection(text):
    '''
    Tries to detect the language of a comment.

    Parameters
    ----------
    text : str
        The comment to inspect; it is passed straight to ``detect``.

    Returns
    -------
    The value returned by ``detect(text)``, or None when detection raises an error.
    '''
    try:
        return detect(text)
    except Exception:
        # Best effort: any detection failure yields None. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt / SystemExit propagate,
        # so a long-running .apply() can still be interrupted.
        return None
    
def print_sentiment_scores(sentence):
    '''
    Prints the sentence followed by every polarity score the analyzer returns for it.

    The sentence is left-aligned and padded with dashes to 40 characters,
    then the scores are printed on the same line.
    '''
    scores = analyzer.polarity_scores(sentence)
    line = "{:-<40} {}".format(sentence, str(scores))
    print(line)

def sentiment_score(text, sentiment):
    '''
    Returns one polarity score of a text.

    text      -- the text handed to the analyzer.
    sentiment -- which score to return: "neg", "neu", "pos" or "compound".

    For any other value of sentiment a message is printed, the analyzer is not
    called, and None is returned.
    '''
    valid_sentiments = ("neg", "neu", "pos", "compound")

    # Guard clause: reject unknown keys before doing any scoring work.
    if sentiment not in valid_sentiments:
        print("This is not a valid sentiment. Please refer to the function docstring.")
        return None

    all_scores = analyzer.polarity_scores(text)
    return all_scores[sentiment]
        
def translator_compound_scores(text, engl=True):
    '''
    Labels a text as positive (1), neutral (0) or negative (-1) from its compound score.

    This function does not detect the language itself; the caller states it via engl.

    text -- the text to score.
    engl -- True (default): the text is scored as-is.
            False: the text is first passed through translator.translate and the
            translated text is scored (presumably translated to English -- verify
            against the translator's default destination language).

    Returns 1 when the compound score is >= 0.05, 0 when it lies strictly between
    -0.05 and 0.05, and -1 otherwise.
    '''
    # Only texts flagged as non-English are translated.
    text_to_score = text if engl else translator.translate(text).text

    # The compound polarity score is the only one used for the label.
    compound = analyzer.polarity_scores(text_to_score)['compound']

    if compound >= 0.05:
        return 1
    if -0.05 < compound < 0.05:
        return 0
    return -1

def clean(doc):
    '''
    Basic text cleaning: lowercases the text, drops stop words, strips excluded
    characters and lemmatizes the remaining words.

    NOTE(review): relies on the globals `stop`, `exclude` and `lemma`, which are not
    defined in the visible part of this notebook -- confirm they are created before
    this function is called.
    '''
    # 1) lowercase, split on whitespace and keep only words that are not in `stop`
    kept_words = [w for w in doc.lower().split() if w not in stop]
    without_stop = " ".join(kept_words)

    # 2) remove every single character that appears in `exclude`
    kept_chars = [ch for ch in without_stop if ch not in exclude]
    without_punct = "".join(kept_chars)

    # 3) lemmatize word by word and glue the result back together
    lemmas = [lemma.lemmatize(w) for w in without_punct.split()]
    return " ".join(lemmas)

def get_sentiment(reviews_path,listings_path):
    '''
    Returns the per-listing averages, including the mean review and summary sentiment.

    Reads both CSV files, left-joins the listings onto the reviews, labels every review
    comment and every listing summary with translator_compound_scores (1, 0 or -1) and
    averages all numeric columns per listing (arithmetical mean).

    Parameters
    ----------
    reviews_path : str
        CSV with at least the columns listing_id, id, date, reviewer_id,
        reviewer_name and comments.
    listings_path : str
        CSV with at least the columns id, number_of_reviews and summary_text.

    Returns
    -------
    pandas.DataFrame
        One row per listing, indexed by listing_id (the ID is the index, not a column).
        Columns are the means of every numeric column, including reviews_sentiment
        and summary_sentiment. Rows containing any NaN are dropped.
    '''
    # Start time to keep a note of how long it took
    start_time = time.time()

    # Loading Data
    reviews = pd.read_csv(reviews_path)
    listings = pd.read_csv(listings_path)

    # Dropping unnecessary columns from the reviews dataset. The review's own "id"
    # goes too, so the only "id" after the merge is the listing's.
    reviews = reviews.drop(["date","reviewer_id","reviewer_name","id"], axis=1)

    # Left join: every review keeps its row and gains all columns of its listing.
    df = pd.merge(reviews, listings, left_on='listing_id', right_on='id', how='left')

    # Filling nans
    fill_nans(df,"number_of_reviews",0)
    fill_nans(df,"summary_text","")

    # Where there are no comments present, just replace with "", an empty string.
    fill_nans(df,"comments","")

    # The listing's "id" duplicates listing_id after the merge, so drop it.
    df = df.drop(["id"], axis=1)

    # THE ACTUAL WORK TAKING PLACE
    # NOTE: translator_compound_scores is applied with its default engl=True, so every
    # text is scored as-is; no language detection or translation happens here.
    df['reviews_sentiment'] = df['comments'].apply(translator_compound_scores)
    df['summary_sentiment'] = df['summary_text'].apply(translator_compound_scores)

    # This is the output. numeric_only=True is required because the frame still holds
    # text columns (comments, summary_text, ...); pandas >= 2.0 raises a TypeError
    # instead of silently skipping them.
    mean_sentiment = df.groupby('listing_id').mean(numeric_only=True)

    # End time
    end_time = time.time()
    print("Language detection, translation and sentiment analysis was succesfully finished in ",end_time-start_time," seconds.")

    mean_sentiment = mean_sentiment.dropna()
    return mean_sentiment

In [81]:
# Folder holding the Paris CSV exports. Both file paths are derived from it, so this
# is the only line to change when the data lives somewhere else.
root = r"C:\Users\aleen\Desktop\Reviews\paris"
reviews_path = os.path.join(root, "paris_reviews.csv")
listings_path = os.path.join(root, "paris_listings_root.csv")

# Raw frames for ad-hoc inspection; get_sentiment() reads the files again from the paths.
reviews = pd.read_csv(reviews_path)
listings = pd.read_csv(listings_path)

In [82]:
# Runs the full pipeline on the Paris files; the recorded run below took about 1038 seconds.
mean_sentiment = get_sentiment(reviews_path, listings_path)
# Show (rows, columns) of the per-listing result.
mean_sentiment.shape

Language detection, translation and sentiment analysis was succesfully finished in  1037.6510257720947  seconds.


(25474, 80)

In [83]:
# get_sentiment() returns a frame indexed by listing_id (result of the groupby), so the
# index has to be written out: with index=False the CSV would contain the averages but
# not which listing each row belongs to.
mean_sentiment.to_csv(r"C:\Users\aleen\Desktop\Reviews\paris\paris_listings_plus_sentiment.csv", index=True)