**Step 1:** Download the dataset

In [12]:
import kagglehub
import shutil
import os

# Download the dataset; `path` is the local folder kagglehub reports for it
path = kagglehub.dataset_download("arushchillar/disneyland-reviews")

# Get your current working directory
current_dir = os.getcwd()

# Copy (instead of move) every entry from the downloaded path to the current
# directory. Moving would empty the folder kagglehub handed back, so re-running
# this cell could find nothing left to transfer.
# NOTE(review): assumes kagglehub reuses that folder as a cache - confirm.
for filename in os.listdir(path):
    source = os.path.join(path, filename)
    destination = os.path.join(current_dir, filename)
    if os.path.isdir(source):
        # Sub-folders are copied recursively; an existing target is merged into
        shutil.copytree(source, destination, dirs_exist_ok=True)
    else:
        # copy2 overwrites an existing file and keeps the file metadata
        shutil.copy2(source, destination)


**Step 2:** Load the dataset into a DataFrame

In [13]:
import pandas as pd

# Load the raw reviews file, decoding it as Latin-1
df = pd.read_csv(
    "DisneylandReviews.csv",
    encoding="latin1",  # or "ISO-8859-1"
)
df

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the ...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,"Its been a while since d last time we visit HK Disneyland .. Yet, this time we only stay in Tomo...",Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid when I was visiting the park otherwise it would be...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortunately there is quite a bit of maintenance work go...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1 hour from Kowlon, my kids like disneyland so much...",Disneyland_HongKong
...,...,...,...,...,...,...
42651,1765031,5,missing,United Kingdom,i went to disneyland paris in july 03 and thought it was brilliant. i visited all the hotels and...,Disneyland_Paris
42652,1659553,5,missing,Canada,2 adults and 1 child of 11 visited Disneyland Paris beginning of Feb 04 and had an absolute fant...,Disneyland_Paris
42653,1645894,5,missing,South Africa,My eleven year old daughter and myself went to visit my son in London and we decided to go on to...,Disneyland_Paris
42654,1618637,4,missing,United States,"This hotel, part of the Disneyland Paris complex, is a wonderful place for families. Since we ha...",Disneyland_Paris


**Step 3:** Validate that there are no duplicate `Review_ID` values, and drop any duplicates that are found.

In [14]:
from IPython.display import display
import pandas as pd

def _review_id_summary(frame):
    """Return a one-row table with the row count and the number of distinct Review_IDs."""
    return pd.DataFrame({
        "Total records": [len(frame)],
        "Unique Review_IDs": [frame["Review_ID"].nunique()]
    })


# Counts before any row is removed
summary_before = _review_id_summary(df)
display(summary_before)

# Deduplicate (and show a second summary) only when some Review_ID repeats
has_duplicates = len(df) != df["Review_ID"].nunique()
if has_duplicates:
    # Keep the first occurrence of each Review_ID and renumber the index
    df = df.drop_duplicates(subset=["Review_ID"]).reset_index(drop=True)

    # Counts after the duplicates are gone
    summary_after = _review_id_summary(df)
    display(summary_after)

Unnamed: 0,Total records,Unique Review_IDs
0,42656,42636


Unnamed: 0,Total records,Unique Review_IDs
0,42636,42636


**Step 4:** Apply **zero-shot classification** to assign categories to each review.  
This means classifying text into predefined labels without needing prior training on those specific categories.

In [None]:
from transformers import pipeline
import pandas as pd

# Load the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Custom categories every review is scored against
labels = [
    "food", "staff", "rides", "pricing", "shops", "crowds", "cleanliness",
    "planning", "accessibility", "weather", "family", "entertainment",
    "ambience", "experience", "comparison", "park", "transportation",
    "hotel", "nostalgia"
]

# Minimum score (inclusive) for a label to be kept as a tag; tune as needed
SCORE_THRESHOLD = 0.5

# One record per (review, accepted tag) pair, collected for df_tags
rows = []

# Walk the two required columns directly rather than building a Series per row
for review_id, text in zip(df["Review_ID"], df["Review_Text"]):
    # Run zero-shot classifier with multi-label support, so a single review
    # can receive several tags
    result = classifier(text, candidate_labels=labels, multi_label=True)

    # Keep every label whose score is at least SCORE_THRESHOLD
    for label, score in zip(result["labels"], result["scores"]):
        if score >= SCORE_THRESHOLD:
            rows.append({"Review_ID": review_id, "Tag": label})

# Create df_tags; naming the columns keeps the schema (and the exported CSV
# header) intact even when no label passes the threshold
df_tags = pd.DataFrame(rows, columns=["Review_ID", "Tag"])

df_tags

Unnamed: 0,Review_ID,Tag
0,670772142,rides
1,670772142,crowds
2,670772142,weather
3,670772142,experience
4,670772142,comparison
...,...,...
273889,1536786,family
273890,1536786,experience
273891,1536786,park
273892,1536786,transportation


**Step 5:** Convert the `Year_Month` column to the correct date data type.

In [16]:
# Swap the "missing" placeholder for a proper null before parsing
year_month_raw = df["Year_Month"].replace("missing", pd.NA)

# Parse with the year-month format; values that cannot be parsed are coerced to NaT
df["Year_Month"] = pd.to_datetime(year_month_raw, format="%Y-%m", errors="coerce")
df

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the ...,Disneyland_HongKong
1,670682799,4,2019-05-01,Philippines,"Its been a while since d last time we visit HK Disneyland .. Yet, this time we only stay in Tomo...",Disneyland_HongKong
2,670623270,4,2019-04-01,United Arab Emirates,Thanks God it wasn t too hot or too humid when I was visiting the park otherwise it would be...,Disneyland_HongKong
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortunately there is quite a bit of maintenance work go...,Disneyland_HongKong
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1 hour from Kowlon, my kids like disneyland so much...",Disneyland_HongKong
...,...,...,...,...,...,...
42631,1765031,5,NaT,United Kingdom,i went to disneyland paris in july 03 and thought it was brilliant. i visited all the hotels and...,Disneyland_Paris
42632,1659553,5,NaT,Canada,2 adults and 1 child of 11 visited Disneyland Paris beginning of Feb 04 and had an absolute fant...,Disneyland_Paris
42633,1645894,5,NaT,South Africa,My eleven year old daughter and myself went to visit my son in London and we decided to go on to...,Disneyland_Paris
42634,1618637,4,NaT,United States,"This hotel, part of the Disneyland Paris complex, is a wonderful place for families. Since we ha...",Disneyland_Paris


**Step 6:** List the unique `Branch` values so they can be replaced with the real park names.

In [17]:
# Distinct branch codes in order of first appearance, shown as a one-column table
df["Branch"].drop_duplicates().reset_index(drop=True).to_frame()

Unnamed: 0,Branch
0,Disneyland_HongKong
1,Disneyland_California
2,Disneyland_Paris


**Step 7:** Replace the branch codes with their corresponding real park titles using a mapping dictionary.

In [18]:
# Branch code found in the data -> park title to show instead
branch_map = dict([
    ("Disneyland_HongKong", "Hong Kong Disneyland"),
    ("Disneyland_California", "Disneyland Resort (California)"),
    ("Disneyland_Paris", "Disneyland Paris"),
])

# replace() only touches values that are keys of the mapping
renamed_branch = df["Branch"].replace(branch_map)
df["Branch"] = renamed_branch
df

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-04-01,Australia,If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the ...,Hong Kong Disneyland
1,670682799,4,2019-05-01,Philippines,"Its been a while since d last time we visit HK Disneyland .. Yet, this time we only stay in Tomo...",Hong Kong Disneyland
2,670623270,4,2019-04-01,United Arab Emirates,Thanks God it wasn t too hot or too humid when I was visiting the park otherwise it would be...,Hong Kong Disneyland
3,670607911,4,2019-04-01,Australia,HK Disneyland is a great compact park. Unfortunately there is quite a bit of maintenance work go...,Hong Kong Disneyland
4,670607296,4,2019-04-01,United Kingdom,"the location is not in the city, took around 1 hour from Kowlon, my kids like disneyland so much...",Hong Kong Disneyland
...,...,...,...,...,...,...
42631,1765031,5,NaT,United Kingdom,i went to disneyland paris in july 03 and thought it was brilliant. i visited all the hotels and...,Disneyland Paris
42632,1659553,5,NaT,Canada,2 adults and 1 child of 11 visited Disneyland Paris beginning of Feb 04 and had an absolute fant...,Disneyland Paris
42633,1645894,5,NaT,South Africa,My eleven year old daughter and myself went to visit my son in London and we decided to go on to...,Disneyland Paris
42634,1618637,4,NaT,United States,"This hotel, part of the Disneyland Paris complex, is a wonderful place for families. Since we ha...",Disneyland Paris


**Step 8:** Remove the `Review_Text` column from the dataframe, as it is not necessary for Tableau analysis.

In [19]:
# Remove the free-text column; the remaining columns are kept as they are
df = df.drop("Review_Text", axis=1)
df

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Branch
0,670772142,4,2019-04-01,Australia,Hong Kong Disneyland
1,670682799,4,2019-05-01,Philippines,Hong Kong Disneyland
2,670623270,4,2019-04-01,United Arab Emirates,Hong Kong Disneyland
3,670607911,4,2019-04-01,Australia,Hong Kong Disneyland
4,670607296,4,2019-04-01,United Kingdom,Hong Kong Disneyland
...,...,...,...,...,...
42631,1765031,5,NaT,United Kingdom,Disneyland Paris
42632,1659553,5,NaT,Canada,Disneyland Paris
42633,1645894,5,NaT,South Africa,Disneyland Paris
42634,1618637,4,NaT,United States,Disneyland Paris


**Step 9:** Get the two-letter ISO country code for each reviewer location, used to display country images in the dashboard.

In [21]:
!pip install pycountry
import pandas as pd
import pycountry

# Function to get ISO2 code from country name
def get_iso2(name):
    """Return the ``alpha_2`` code of the country pycountry finds for ``name``.

    Returns ``None`` when the lookup raises ``LookupError`` (unknown or
    misspelled country names).
    """
    try:
        match = pycountry.countries.lookup(name)
    except LookupError:
        return None
    return match.alpha_2

# Manual corrections for locations whose ISO2 code needs to be set by hand;
# an override always wins over the looked-up code
iso_overrides = {
    "U.S. Virgin Islands": "VI",
    "Turkey": "TR",
    "Russia": "RU",
    "The Bahamas": "BS",
    "Myanmar (Burma)": "MM",
    "Macau": "MO",
    "Falkland Islands (Islas Malvinas)": "FK",
    "Democratic Republic of the Congo": "CD",
    "Caribbean Netherlands": "BQ",
    "Brunei": "BN"
}

# Resolve each distinct location once instead of once per review row
iso_by_location = {
    location: get_iso2(location)
    for location in df["Reviewer_Location"].dropna().unique()
}
iso_by_location.update(iso_overrides)

# Apply ISO2 conversion in a single vectorized lookup; rows with a missing
# location, or a location that could not be resolved, end up with a null ISO2
df["ISO2"] = df["Reviewer_Location"].map(iso_by_location)
df



Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Branch,ISO2
0,670772142,4,2019-04-01,Australia,Hong Kong Disneyland,AU
1,670682799,4,2019-05-01,Philippines,Hong Kong Disneyland,PH
2,670623270,4,2019-04-01,United Arab Emirates,Hong Kong Disneyland,AE
3,670607911,4,2019-04-01,Australia,Hong Kong Disneyland,AU
4,670607296,4,2019-04-01,United Kingdom,Hong Kong Disneyland,GB
...,...,...,...,...,...,...
42631,1765031,5,NaT,United Kingdom,Disneyland Paris,GB
42632,1659553,5,NaT,Canada,Disneyland Paris,CA
42633,1645894,5,NaT,South Africa,Disneyland Paris,ZA
42634,1618637,4,NaT,United States,Disneyland Paris,US


**Step 10:** Export the DataFrames to CSV files.

In [22]:
# Write each table to its own CSV file, leaving the row index out
exports = [
    (df, "disneyland-reviews.csv"),
    (df_tags, "disneyland-reviews-tags.csv"),
]
for frame, csv_name in exports:
    frame.to_csv(csv_name, index=False)