In [13]:
from transformers import pipeline # 30 seconds to load first time

In [14]:
import pandas as pd

Import your csv of reviews

In [15]:
# Load the resort reviews from the CSV file in the working directory
df = pd.read_csv("resort_reviews.csv")

print(df.shape)     # (rows, columns)


(270, 5)


In [16]:
# Show column names, non-null counts, and dtypes to check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   review_id       270 non-null    int64  
 1   resort          270 non-null    object 
 2   full_day_price  270 non-null    float64
 3   avg_rating      270 non-null    float64
 4   review          270 non-null    object 
dtypes: float64(2), int64(1), object(2)
memory usage: 10.7+ KB


In [17]:
# Create a new column called text_length holding each review's character count.
# The vectorized .str accessor replaces apply(len); for a missing review it
# yields NaN instead of raising TypeError.
df['text_length'] = df['review'].str.len()

In [18]:
# Summary statistics for the numeric columns, including the new text_length
df.describe()

Unnamed: 0,review_id,full_day_price,avg_rating,text_length
count,270.0,270.0,270.0,270.0
mean,135.5,115.342593,4.242593,426.07037
std,78.086491,61.129686,0.581819,426.651393
min,1.0,67.0,1.0,18.0
25%,68.25,67.0,4.0,157.25
50%,135.5,67.0,4.2,291.0
75%,202.75,169.5,4.6,572.75
max,270.0,235.0,5.0,3067.0


The model accepts at most 512 tokens per input, so longer reviews have to be truncated before sentiment analysis.

Select which model to use for sentiment analysis

In [19]:
# Checkpoint used for sentiment classification (DistilBERT fine-tuned on SST-2)
SENTIMENT_MODEL_NAME = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
sentiment_pipeline = pipeline(model=SENTIMENT_MODEL_NAME)

Device set to use mps:0


In [20]:
def add_sentiment_columns(
    df: pd.DataFrame,
    text_col: str = "review_text",
    max_chars: int = 512,
    classifier=None,
) -> pd.DataFrame:
    """
    Adds sentiment 'label' and 'score' columns to a DataFrame using a HuggingFace pipeline.

    Each text is cut to its first `max_chars` characters before being classified.
    The cut is computed from the text itself, so the DataFrame does not need a
    precomputed 'text_length' column.

    Args:
        df (pd.DataFrame): Input DataFrame containing a column of text reviews.
            It is modified in place.
        text_col (str): Name of the column with text to analyze.
        max_chars (int): Maximum number of characters of each text passed to
            the classifier. Defaults to 512.
        classifier (callable, optional): Called with one string; must return a
            sequence whose first item is a mapping with 'label' and 'score'
            keys. Defaults to the module-level `sentiment_pipeline`.

    Returns:
        pd.DataFrame: The same DataFrame object with new 'label' and 'score' columns.

    Raises:
        KeyError: If `text_col` is not a column of `df`.
    """
    if text_col not in df.columns:
        raise KeyError(f"Column {text_col!r} not found in DataFrame")

    if classifier is None:
        # Resolved at call time so the function can be defined before the pipeline.
        classifier = sentiment_pipeline

    labels = []
    scores = []
    for text in df[text_col]:
        # Slicing is a no-op for texts that are already short enough.
        result = classifier(text[:max_chars])[0]
        labels.append(result["label"])
        scores.append(result["score"])

    # Assign positionally (plain arrays) so nothing is re-aligned on the index;
    # explicit dtypes keep the columns well-typed even when df is empty.
    df["label"] = pd.Series(labels, dtype="object").to_numpy()
    df["score"] = pd.Series(scores, dtype="float64").to_numpy()

    return df


In [21]:
# Classify every review; the function adds 'label' and 'score' to df in place
# and returns the same DataFrame, which is what this cell displays.
add_sentiment_columns(df, text_col = "review")

Unnamed: 0,review_id,resort,full_day_price,avg_rating,review,text_length,label,score
0,1,Perisher Ski Resort,183.0,5.0,Highly recommended! Me and my family had a fan...,458,POSITIVE,0.999874
1,2,Perisher Ski Resort,183.0,5.0,"Visited Perisher valley, Smiggin Holes.. Which...",339,POSITIVE,0.991743
2,3,Perisher Ski Resort,183.0,3.0,Perisher Valley is absolutely magical – the sn...,871,NEGATIVE,0.989859
3,4,Perisher Ski Resort,183.0,2.0,We do the skiing once a year during the last 9...,1478,NEGATIVE,0.998968
4,5,Perisher Ski Resort,183.0,4.0,I think Perisher does a good job given the con...,194,NEGATIVE,0.997874
...,...,...,...,...,...,...,...,...
265,266,Selwyn,119.0,3.9,We visited on a Monday in school holidays and ...,775,POSITIVE,0.968631
266,267,Selwyn,119.0,3.9,Crowd management was poor!\n2 terminal out of ...,377,NEGATIVE,0.999516
267,268,Selwyn,119.0,3.9,"Small but amazing. Love the ""magic carpet"" for...",203,NEGATIVE,0.962833
268,269,Selwyn,119.0,3.9,Love this place! Perfect spot to learn to ski ...,237,POSITIVE,0.999855


In [23]:
# Count how many reviews received each sentiment label
label_counts = df['label'].value_counts()

print(label_counts)

label
POSITIVE    196
NEGATIVE     74
Name: count, dtype: int64


In [25]:
# Re-run the summary statistics, which now include the sentiment score column
df.describe()

Unnamed: 0,review_id,full_day_price,avg_rating,text_length,score
count,270.0,270.0,270.0,270.0,270.0
mean,135.5,115.342593,4.242593,426.07037,0.975566
std,78.086491,61.129686,0.581819,426.651393,0.075614
min,1.0,67.0,1.0,18.0,0.514583
25%,68.25,67.0,4.0,157.25,0.994523
50%,135.5,67.0,4.2,291.0,0.999452
75%,202.75,169.5,4.6,572.75,0.999777
max,270.0,235.0,5.0,3067.0,0.999887


In [28]:
# Isolate the predictions the model was least sure about (score below 0.52);
# the minimum score reported by describe(), 0.514583, falls in this range.
low_confidence_mask = df['score'].lt(0.52)
df_low_confidence = df.loc[low_confidence_mask]

df_low_confidence.head()

Unnamed: 0,review_id,resort,full_day_price,avg_rating,review,text_length,label,score
172,173,Mount Hotham,67.0,4.7,Exceeded my expectation. Even a month felt les...,73,NEGATIVE,0.514583


That prediction is incorrect: the review is clearly positive. Manually override the label and score for this row.

In [33]:
# Manually correct the misclassified review: select it by its exact text,
# then set its label to POSITIVE and its score to 1.
misclassified_review = "Exceeded my expectation. Even a month felt less. Can stay here all winter"
is_misclassified = df['review'] == misclassified_review
df.loc[is_misclassified, ['label', 'score']] = ['POSITIVE', 1]

# Display the corrected row to confirm the override
df[is_misclassified]

Unnamed: 0,review_id,resort,full_day_price,avg_rating,review,text_length,label,score
172,173,Mount Hotham,67.0,4.7,Exceeded my expectation. Even a month felt les...,73,POSITIVE,1.0


Let's check other negative rows 

In [34]:
# Keep only the reviews labelled NEGATIVE so they can be inspected
negative_mask = df['label'].eq('NEGATIVE')
df_negative = df.loc[negative_mask]

df_negative.head()

Unnamed: 0,review_id,resort,full_day_price,avg_rating,review,text_length,label,score
2,3,Perisher Ski Resort,183.0,3.0,Perisher Valley is absolutely magical – the sn...,871,NEGATIVE,0.989859
3,4,Perisher Ski Resort,183.0,2.0,We do the skiing once a year during the last 9...,1478,NEGATIVE,0.998968
4,5,Perisher Ski Resort,183.0,4.0,I think Perisher does a good job given the con...,194,NEGATIVE,0.997874
6,7,Perisher Ski Resort,183.0,1.0,My experience at Perisher was unfortunately th...,680,NEGATIVE,0.999798
10,11,Perisher Ski Resort,183.0,3.0,"I came to perisher to ski for the first time, ...",572,NEGATIVE,0.997934


Topic Modelling 

In [35]:
# Export df (reviews plus sentiment columns) as a CSV file, without the index
df.to_csv('resort_reviews_sentiment.csv', index=False)