In [1]:
import pandas as pd
import numpy as np

from translate import Translator

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# LOAD CSV

In [2]:
scrap = pd.read_csv('../data/scrapping_booking_prisma.csv')

# TRANSLATION

In [3]:
"""
def translate_to_english(text):
    """
    The function translates a given text from Spanish to English using the Google Translate API.
    """
    translator = Translator(to_lang='en', from_lang='es')
    translation = translator.translate(text)
    return translation
"""

IndentationError: unexpected indent (1360331091.py, line 4)

In [4]:
"""
scrap['pos_review_en'] = scrap['pos_review'].apply(translate_to_english)
scrap['neg_review_en'] = scrap['neg_review'].apply(translate_to_english)
"""

"\nscrap['pos_review_en'] = scrap['pos_review'].apply(translate_to_english)\nscrap['neg_review_en'] = scrap['neg_review'].apply(translate_to_english)\n"

In [5]:
prisma = pd.read_csv('../data/prisma_eng_full.csv')
pd.set_option('display.max_rows', None)

# CREATE COLS

In [30]:
def create_cols (df):

    """
    Function that performs different operations to create new columns in a df.
    - Dates
    - Scores
    - Sentiment Analysis
    """
    
    # "YEAR - MONTH" - create columns
    df['date_review'] = pd.to_datetime(df['date_review'], dayfirst=True)
    df['year_review'] = df['date_review'].dt.year
    df['month_review'] = df['date_review'].dt.month

    # "LEN_REVIEW" - create columns
    df['len_pos_review'] = df['pos_review'].apply(lambda x: len(str(x)) if not pd.isnull(x) else 0)
    df['len_neg_review'] = df['neg_review'].apply(lambda x: len(str(x)) if not pd.isnull(x) else 0)

    # SCORES - create column
    #df['score'] = df['score'].str.replace(',', '.').astype(float)
    df['score'] = df['score'].astype(str).str.replace(',', '.').astype(float)


    def map_score(score):
        if score >= 9:
            return 'Wonderful'
        elif score >= 7:
            return 'Good'
        elif score >= 5:
            return 'Ok'
        elif score >= 3:
            return 'Poor'
        else:
            return 'Very poor'

    df['score_2'] = df['score'].apply(map_score)

    # SIA - create sentiment columns
    sia = SentimentIntensityAnalyzer() #nltk

    df["pos_compound"] = df.pos_review_en.apply(lambda x: sia.polarity_scores(x)["compound"])
    df['pos_comp_score'] = df['pos_compound'].apply(lambda x: "positive" if x >= 0.2 else ("neutral" if -0.2 <= x <= 0.2 else "negative"))

    df["neg_compound"] = df.neg_review_en.apply(lambda x: sia.polarity_scores(x)["compound"] if not pd.isna(x) else np.nan)
    df['neg_comp_score'] = df['neg_compound'].apply(lambda x: "positive" if x >= 0.2 else ("neutral" if -0.2 <= x <= 0.2 else "negative") if not pd.isna(x) else np.nan)

    return df




In [31]:
prisma = create_cols (prisma)

# CLEAN COLS

In [32]:
prisma.columns# CREATE COLS

Index(['client', 'nationality', 'title', 'pos_review', 'pos_review_en',
       'pos_compound', 'pos_comp_score', 'len_pos_review', 'neg_review',
       'neg_review_en', 'neg_compound', 'neg_comp_score', 'len_neg_review',
       'date_review', 'year_review', 'month_review', 'score', 'score_2',
       'travel_type', 'room_type'],
      dtype='object')

In [35]:
def clean_cols(df):
    
    """
    Function that performs various operations to clean and organize the columns of the df.
    """
     # DELETE - the rows without room
    df = df[df['room_type'].str.startswith('Habitación')]  
    # DELETE - reviews w/o comments
    df = df[df['pos_review'] != 'Esta entrada no tiene comentarios']
    # DELETE - NaN values
    df = df.dropna(subset=['travel_type'])
    
    # DELETE - Unnamed: 0
    #df = df.drop("Unnamed: 0", axis=1)

    column_order = ['client',
                    'nationality',
                    'title',
                    
                    'pos_review',
                    'pos_review_en',
                    'pos_compound',
                    'pos_comp_score',
                    'len_pos_review',
                    
                    'neg_review',
                    'neg_review_en',
                    'neg_compound',
                    'neg_comp_score',
                    'len_neg_review',
                    
                    'date_review',
                    'year_review',
                    'month_review',
                    
                    'score',
                    'score_2',
                    'travel_type',
                    'room_type']
    
    df = df.reindex(columns=column_order)
    
    return df


In [36]:
prisma = clean_cols (prisma)

In [38]:
prisma.isnull().sum()

client              0
nationality         0
title               0
pos_review          0
pos_review_en       0
pos_compound        0
pos_comp_score      0
len_pos_review      0
neg_review        248
neg_review_en     248
neg_compound      248
neg_comp_score    248
len_neg_review      0
date_review         0
year_review         0
month_review        0
score               0
score_2             0
travel_type         0
room_type           0
dtype: int64

# SAVE

In [39]:
#prisma.to_csv('../data/prisma_def.csv', index=False)

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('../data/prisma_def.csv')
df.head(3)

Unnamed: 0,client,nationality,title,pos_review,pos_review_en,pos_compound,pos_comp_score,len_pos_review,neg_review,neg_review_en,neg_compound,neg_comp_score,len_neg_review,date_review,year_review,month_review,score,score_2,travel_type,room_type
0,Sandra,España,Excepcional,Restaurante cena,La Cena Restaurant,0.0,neutral,16,,,,,0,2023-04-27,2023,4,10.0,Wonderful,En pareja,Habitación Familiar Deluxe
1,Augustbou,España,Muy bien,"Ubicación, aparcamiento, amabilidad del person...","Location, parking, friendly staff.\nCorrect br...",0.4939,positive,68,"No es un hotel con magníficas instalaciones, e...","It is not a hotel with magnificent facilities,...",-0.1144,neutral,65,2023-04-26,2023,4,8.0,Good,En pareja,Habitación Doble Deluxe
2,Jeep,España,Muy bien,Tamaño de habitacion,Room Size,0.0,neutral,20,Wi-fi colapsada,Wi-fi Collapsed,-0.2732,negative,15,2023-04-24,2023,4,8.0,Good,En pareja,Habitación Doble Deluxe


In [5]:
df.to_json('../data/prisma_def.json')