## Package installation

In [1]:
#package installation
! pip install pandas numpy matplotlib seaborn openpyxl
! pip install matplotlib seaborn
! pip install pymongo



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from sklearn.preprocessing import StandardScaler

## Read raw data files

In [None]:
# CSV Files
reviews_df = pd.read_csv("../../data/all_countries_reviews.csv")
currency_df = pd.read_csv("../../data/southeast_asia_currency_monthly_avg.csv")
reddit_df = pd.read_csv("../../data/merged_reddit_data.csv")

# Excel File
visitors_df = pd.read_excel("../../data/monthly_num_visitors.xlsx")

# JSON Files

with open("../../data/sentiment_analysis(reddit).json", "r") as f:
    reddit_sentiment = json.load(f)

with open("../../data/sentiment_analysis_instagram_timed.json", "r") as f:
    instagram_sentiment = json.load(f)


FileNotFoundError: [Errno 2] No such file or directory: '../../data/all_countries_reviews.csv'

## Prepare reddit sentiment dataframe

In [None]:
#convert to dataframe
reddit_df = pd.DataFrame(reddit_df)

#keep only relevant columns
columns_to_keep = [
    "created_at",
    "score", #The number of upvotes minus the number of downvotes.
    "vader_compound", # From -1.0 (most negative) to +1.0 (most positive).
    "country",
]
reddit_df = reddit_df[columns_to_keep]

# Properly name the columns
reddit_df = reddit_df.rename(columns={
    'created_at': 'month_year',
    'vader_compound': 'reddit_sentiment', 
    'score': 'popularity'
})

# Convert 'month_year' to datetime, then format as 'YYYY-MM'
reddit_df['month_year'] = pd.to_datetime(reddit_df['month_year'], format='%m/%d/%y').dt.strftime('%Y-%m')

# weight the raw sentiment using popularity
total_popularity_per_group = reddit_df.groupby(['country', 'month_year'])['popularity'].transform('sum')
reddit_df['popularity_percentage'] = (reddit_df['popularity'] / total_popularity_per_group)
reddit_df['reddit_sentiment'] = (reddit_df['reddit_sentiment'] * reddit_df['popularity_percentage'])

# drop columns not in use
reddit_df = reddit_df.drop(columns=['popularity',  'popularity_percentage'])

# aggregate individual country's average sentiment score
reddit_df = reddit_df.groupby(["country", "month_year"]).agg({
    "reddit_sentiment": "sum"
}).reset_index()

#if missing data, backfill
# sort by country and month_year
reddit_df = reddit_df.sort_values(by=["country", "month_year"])

# interpolate, forward fill, then backward fill within each country group
reddit_df["reddit_sentiment"] = (
    reddit_df
    .groupby("country")["reddit_sentiment"]
    .apply(lambda group: (
        group.interpolate(method='linear', limit_direction='both')  # interpolate
             .fillna(method='ffill')                                # fill leading NaNs
             .fillna(method='bfill')                                # fill trailing NaNs
    ))
    .reset_index(level=0, drop=True)
)

#lag by 1 moth
reddit_df = reddit_df.sort_values(['country', 'month_year'])
reddit_df['reddit_sentiment_lag1'] = (
    reddit_df.groupby('country')['reddit_sentiment']
    .shift(periods=1)  # Shift down by 1 row
)



  .fillna(method='ffill')                                # fill leading NaNs
  .fillna(method='bfill')                                # fill trailing NaNs


In [None]:
reddit_df.head()

Unnamed: 0,country,month_year,reddit_sentiment
0,Brunei,2022-01,0.672472
1,Brunei,2022-02,0.515935
2,Brunei,2022-03,0.077097
3,Brunei,2022-04,0.116462
4,Brunei,2022-05,0.144865


## Prepare instagram sentiment dataframe

In [None]:
#convert to dataframe
ig_df = pd.DataFrame(instagram_sentiment)

# capitalise the first letter of countries
ig_df["country"] = ig_df["country"].str.title()

#change month_year format to align with other datasets
ig_df['date'] = pd.to_datetime(ig_df['date'], errors='coerce')
ig_df['month_year'] = ig_df['date'].dt.strftime('%Y-%m')

# weight the raw sentiment using popularity
total_likes_per_group = ig_df.groupby(['country', 'month_year'])['like_count'].transform('sum')
ig_df['like_percentage'] = (ig_df['like_count'] / total_likes_per_group)
ig_df['weighted_sentiment_score'] = (ig_df['sentiment_score'] * ig_df['like_percentage'])

#only keeping the sentiment score
columns_to_keep = [
    "month_year",
    "country",
    "sentiment_score",
    "weighted_sentiment_score"
]
ig_df = ig_df[columns_to_keep]

#agggregate sentiment score
ig_sentiment = ig_df.groupby(
    ["country", "month_year"]
)["weighted_sentiment_score"].sum().reset_index()
ig_sentiment.columns = ["country", "month_year", "ig_sentiment"]
ig_sentiment['month_year'] = pd.to_datetime(ig_sentiment['month_year'])

# interpolate, forward fill, then backward fill within each country group
ig_sentiment["ig_sentiment"] = (
    ig_sentiment
    .groupby("country")["ig_sentiment"]
    .apply(lambda group: (
        group.interpolate(method='linear', limit_direction='both')  # interpolate
             .fillna(method='ffill')                                # fill leading NaNs
             .fillna(method='bfill')                                # fill trailing NaNs
    ))
    .reset_index(level=0, drop=True)
)

#lag by 1 moth
ig_sentiment = ig_sentiment.sort_values(['country', 'month_year'])
ig_sentiment['ig_sentiment_lag1'] = (
    ig_sentiment.groupby('country')['ig_sentiment']
    .shift(periods=1)  # Shift down by 1 row
)

  .fillna(method='ffill')                                # fill leading NaNs
  .fillna(method='bfill')                                # fill trailing NaNs


In [None]:
ig_sentiment.head(5)

Unnamed: 0,country,month_year,ig_sentiment,reddit_sentiment
0,Brunei,2022-01-01,0.766694,0.766694
1,Brunei,2022-02-01,0.736188,0.736188
2,Brunei,2022-03-01,0.837473,0.837473
3,Brunei,2022-04-01,0.915479,0.915479
4,Brunei,2022-05-01,0.527822,0.527822


## Prepare tripadvisor review dataframe

In [None]:
reviews_df = pd.DataFrame(reviews_df)
# Convert trip_date to datetime and extract month-year
reviews_df["trip_date"] = pd.to_datetime(reviews_df["trip_date"], errors="coerce")
reviews_df["month_year"] = reviews_df["trip_date"].dt.strftime("%Y-%m")
columns_to_keep = [
    "month_year",
    "country",
    "rating"
]
reviews_df = reviews_df[columns_to_keep]

# aggregate to find average monthly rating
review_agg = reviews_df.groupby(["country", "month_year"])["rating"].mean().reset_index()
review_agg = review_agg.rename(columns={"rating": "trip_advisor_rating"})
review_agg['month_year'] = pd.to_datetime(review_agg['month_year'])

# interpolate, forward fill, then backward fill within each country group
review_agg["trip_advisor_rating"] = (
    review_agg
    .groupby("country")["trip_advisor_rating"]
    .apply(lambda group: (
        group.interpolate(method='linear', limit_direction='both')  # interpolate
             .fillna(method='ffill')                                # fill leading NaNs
             .fillna(method='bfill')                                # fill trailing NaNs
    ))
    .reset_index(level=0, drop=True)
)

  reviews_df["trip_date"] = pd.to_datetime(reviews_df["trip_date"], errors="coerce")


In [None]:
review_agg.head(5)

Unnamed: 0,country,month_year,trip_advisor_rating
0,Brunei,2015-09-01,4.5
1,Brunei,2015-10-01,5.0
2,Brunei,2016-01-01,5.0
3,Brunei,2016-02-01,5.0
4,Brunei,2016-04-01,4.5


## Prepare exchange rate dataframe

In [None]:
currency_df = pd.DataFrame(currency_df)
#ccolumn naming convention
currency_df = currency_df.rename(columns={
    "Currency": "country",
    "YearMonth": "month_year",
    "AverageRate": "avg_currency_rate"
})
# map currency to country
currency_to_country = {
    "BND": "Brunei",
    "IDR": "Indonesia",
    "KHR": "Cambodia",
    "LAK": "Laos",
    "MMK": "Myanmar",
    "MYR": "Malaysia",
    "PHP": "Philippines",
    "SGD": "Singapore",
    "THB": "Thailand",
    "VND": "Vietnam"
}
currency_df["country"] = currency_df["country"].map(currency_to_country)
#correct datetime format
currency_df['month_year'] = pd.to_datetime(currency_df['month_year']).dt.strftime("%Y-%m")
#log transform
currency_df['avg_currency_rate'] = np.log1p(currency_df['avg_currency_rate'])

#lag by 1 moth
currency_df = currency_df.sort_values(['country', 'month_year'])
currency_df['avg_currency_rate_lag1'] = (
    currency_df.groupby('country')['avg_currency_rate']
    .shift(periods=1)  # Shift down by 1 row
)

currency_df.head()

Unnamed: 0,month_year,country,avg_currency_rate
0,2022-01,Brunei,1.351418
1,2022-01,Indonesia,14327.056094
2,2022-01,Cambodia,4072.629258
3,2022-01,Laos,11272.253421
4,2022-01,Myanmar,1777.280733


## Prepare google trend dataframe

In [None]:
trend_df = pd.DataFrame(trend_df)
trend_df = google_trend_df.rename(columns={"value": "google_trend_score"})
trend_df['month_year'] = pd.to_datetime(trend_df['month_year'])
trend_df = trend_df.drop(columns=['_id'])
# interpolate, forward fill, then backward fill within each country group
trend_df["google_trend_score"] = (
    trend_df
    .groupby("country")["google_trend_score"]
    .apply(lambda group: (
        group.interpolate(method='linear', limit_direction='both')  # interpolate
             .fillna(method='ffill')                                # fill leading NaNs
             .fillna(method='bfill')                                # fill trailing NaNs
    ))
    .reset_index(level=0, drop=True)
)

#lag by 1 moth
trend_df = trend_df.sort_values(['country', 'month_year'])
trend_df['google_trend_score_lag1'] = (
    trend_df.groupby('country')['google_trend_score']
    .shift(periods=1)  # Shift down by 1 row
)

NameError: name 'trend_df' is not defined

## Prepare number of visitors dataframe

In [None]:
visitors_df = visitors_df.rename(columns={"value": "num_visitors"})
#log transform
visitors_df["num_visitors"] = np.log1p(visitors_df['num_visitors'])
# interpolate, forward fill, then backward fill within each country group
visitors_df["num_visitors"] = (
    visitors_df
    .groupby("country")["num_visitors"]
    .apply(lambda group: (
        group.interpolate(method='linear', limit_direction='both')  # interpolate
             .fillna(method='ffill')                                # fill leading NaNs
             .fillna(method='bfill')                                # fill trailing NaNs
    ))
    .reset_index(level=0, drop=True)
)
visitors_df.head(5)