In [None]:
import pandas as pd

In [None]:
# Read the three intermediate Parquet tables from the working directory.
# The unpacking order matches the order of the paths below.
vader_df, review_emotions_df, listing_emotions_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "after_vader_reviews.parquet",
        "after_emolabels_reviews.parquet",
        "after_listing_emotions.parquet",
    )
)

In [None]:
# Force the compound score to a numeric dtype; values that cannot be parsed
# become NaN (errors="coerce"). The column is overwritten on vader_df itself.
vader_df["vader_compound"] = pd.to_numeric(vader_df["vader_compound"], errors="coerce")

# One row per listing_id holding the average compound score of its rows,
# exposed under the column name "mean_vader".
mean_vader = (
    vader_df.groupby("listing_id")["vader_compound"]
    .mean()
    .rename("mean_vader")
    .reset_index()
)

In [None]:
def mode_or_na(series):
    """Return the most frequent non-missing value of ``series``.

    Parameters
    ----------
    series : pandas.Series
        Values to summarise; missing entries are ignored.

    Returns
    -------
    object or None
        The value with the highest count as reported by
        ``Series.value_counts``, or ``None`` when ``series`` contains no
        non-missing values (including when it is empty).
    """
    has_values = not series.dropna().empty
    return series.value_counts().idxmax() if has_values else None

In [None]:
# Most frequent review-level emotion label for each listing_id
# (mode_or_na yields None for a group with no non-missing label).
dominant_review = (
    review_emotions_df.groupby("listing_id")["dominant_emotion_review"].agg(mode_or_na).reset_index()
)

# Same aggregation for the listing-level emotion label.
dominant_listing = (
    listing_emotions_df.groupby("listing_id")["dominant_emotion_listing"].agg(mode_or_na).reset_index()
)

# Left joins anchored on mean_vader: every listing_id that has a mean VADER
# score is kept, and the two emotion columns are attached where available.
merged_listing_summary = pd.merge(
    pd.merge(mean_vader, dominant_review, how="left", on="listing_id"),
    dominant_listing,
    how="left",
    on="listing_id",
)

# Persist the per-listing summary without the index column.
merged_listing_summary.to_csv("final.csv", index=False)

In [None]:
# Display the (rows, columns) shape of the per-listing summary as the cell output.
merged_listing_summary.shape

(4182, 4)

In [None]:
# Count the distinct listing ids in the summary that was written to final.csv.
# This reads merged_listing_summary because `final` is only loaded in a later
# cell, so referencing it here fails on a fresh-kernel top-to-bottom run.
merged_listing_summary["listing_id"].nunique()

In [None]:
# Distinct listing ids present in each of the three source tables, unpacked in
# the same order as the frames are listed.
vader_ids, review_ids, listing_ids = (
    set(frame["listing_id"].unique())
    for frame in (vader_df, review_emotions_df, listing_emotions_df)
)

# Report how many distinct listings each table covers.
print("VADER dataset:", len(vader_ids))
print("Review Emotion dataset:", len(review_ids))
print("Listing Emotion dataset:", len(listing_ids))

VADER dataset: 4182
Review Emotion dataset: 4182
Listing Emotion dataset: 4182


In [None]:
# Load the raw listings table and the per-listing summary (final.csv is the
# file written earlier in this notebook) from the working directory.
listings, final = (
    pd.read_csv("listings.csv"),
    pd.read_csv("final.csv"),
)

(4182, 70)


In [None]:
def clean_id(x):
    """Normalise a raw identifier value to a plain ``int``.

    Floats are truncated directly, so large values whose text form uses
    exponent notation (for example ``1.5e+16``) keep their full magnitude
    instead of being cut at the decimal point of the mantissa. Any other
    value is converted to text, stripped of surrounding whitespace, and the
    part before the first ``.`` is parsed as an integer.

    Parameters
    ----------
    x : object
        Raw identifier, e.g. an int, a float, or a string such as ``"42.0"``.

    Returns
    -------
    int or None
        The integer identifier, or ``None`` when no integer can be derived
        (NaN, infinity, ``None``, empty or non-numeric text).
    """
    try:
        if isinstance(x, float):
            # int() raises ValueError for NaN and OverflowError for +/-inf;
            # both are reported as "no id" below.
            return int(x)
        return int(str(x).strip().split(".")[0])
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures are treated as a missing id; unrelated
        # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        return None


In [None]:
# Derive an integer join key on the listings table from its "id" column, then
# keep only the rows where clean_id produced a value (it returns None for
# anything it cannot parse).
listings["listing_id"] = listings["id"].apply(clean_id)
listings = listings[listings["listing_id"].notna()]

# Apply the same normalisation to the summary table's existing key column.
final["listing_id"] = final["listing_id"].apply(clean_id)
final = final[final["listing_id"].notna()]

In [None]:
# Left join: every row of `final` is kept and the columns of `listings` are
# attached wherever the listing_id matches.
merged = pd.merge(final, listings, how="left", on="listing_id")
print(merged.shape)

In [None]:
# Display the column labels of the merged frame as the cell output.
merged.columns

Index(['listing_id', 'mean_vader', 'dominant_emotion_review',
       'dominant_emotion_listing', 'id', 'scrape_id', 'last_scraped', 'source',
       'name', 'description', 'neighborhood_overview', 'picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'latitude', 'longitude', 'property_type',
       'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms',
       'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights',
       'has_availability', 'availability_30', 'availability_60',
       'availability_90', 'availability_365', 'calendar_last_scraped',
       '

In [None]:
# Write the merged frame to CSV in the working directory, omitting the index.
merged.to_csv("listings_reviews_final.csv", index=False)