In [1]:
!pip install faker pandas-stubs

Collecting faker
  Downloading faker-40.1.2-py3-none-any.whl.metadata (16 kB)
Collecting pandas-stubs
  Downloading pandas_stubs-2.3.3.260113-py3-none-any.whl.metadata (10 kB)
Collecting types-pytz>=2022.1.1 (from pandas-stubs)
  Using cached types_pytz-2025.2.0.20251108-py3-none-any.whl.metadata (1.7 kB)
Downloading faker-40.1.2-py3-none-any.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ---------------------------------------- 2.0/2.0 MB 37.5 MB/s eta 0:00:00
Downloading pandas_stubs-2.3.3.260113-py3-none-any.whl (168 kB)
Using cached types_pytz-2025.2.0.20251108-py3-none-any.whl (10 kB)
Installing collected packages: types-pytz, faker, pandas-stubs

   ------------- -------------------------- 1/3 [faker]
   ------------- -------------------------- 1/3 [faker]
   ------------- -------------------------- 1/3 [faker]
   ------------- -------------------------- 1/3 [faker]
   ------------- -------------------------- 1/3 [faker]
   ------------- ----


[notice] A new release of pip is available: 25.1.1 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import random
import pandas as pd
from faker import Faker

# Shared Faker instance; the generator functions below use it for city names,
# rental dates, and the occasional filler sentence.
fake = Faker()

# Categorical value pools sampled when generating synthetic reviews.

# Kinds of bikes a customer can rent.
bike_types = [
    "mountain bike",
    "road bike",
    "e-bike",
    "tandem bike",
    "kids bike",
]

# Customer personas mentioned in the review opener.
customer_types = [
    "tourist",
    "commuter",
    "student",
    "family",
    "casual rider",
]

# Review topics; each one is a key in the sentiment snippet dictionaries.
topics = [
    "bike_quality",
    "staff",
    "price",
    "process",
    "location",
    "safety",
    "other",
]

# Weather conditions recorded for the rental day.
weathers = [
    "sunny",
    "cloudy",
    "rainy",
    "windy",
    "snowy",
]

# Phrases used for positive reviews, keyed by topic (three phrases per topic).
positive_snippets = dict(
    bike_quality=[
        "the bike was in excellent condition",
        "gears shifted smoothly and brakes were responsive",
        "the bike felt brand new and well maintained",
    ],
    staff=[
        "the staff was incredibly friendly and helpful",
        "they explained everything clearly and with a smile",
        "check-in and check-out were handled professionally",
    ],
    price=[
        "the pricing was very reasonable",
        "great value for the money",
        "I felt the rates were fair for the quality",
    ],
    process=[
        "the rental process was quick and easy",
        "booking and pickup were seamless",
        "paperwork was minimal and straightforward",
    ],
    location=[
        "the shop is in a perfect location near the trails",
        "easy to find and close to popular bike paths",
        "great starting point for exploring the city",
    ],
    safety=[
        "they provided helmets and safety instructions",
        "I felt very safe riding their bikes",
        "the staff double-checked everything before I left",
    ],
    other=[
        "overall I had a fantastic experience",
        "I would definitely rent here again",
        "I highly recommend this bike rental company",
    ],
)

# Phrases used for neutral reviews, keyed by topic (three phrases per topic).
neutral_snippets = dict(
    bike_quality=[
        "the bike was decent and did the job",
        "some minor wear but nothing serious",
        "the bike was okay for a short ride",
    ],
    staff=[
        "the staff was fine, not particularly memorable",
        "interaction with staff was neutral",
        "service was acceptable",
    ],
    price=[
        "the price felt average compared to other places",
        "not cheap but not too expensive either",
        "pricing seemed standard for the area",
    ],
    process=[
        "the rental process took a bit of time",
        "some waiting but nothing too bad",
        "overall the process was acceptable",
    ],
    location=[
        "the location was alright",
        "a bit out of the way but manageable",
        "not the most convenient spot",
    ],
    safety=[
        "basic safety gear was available",
        "they offered helmets if needed",
        "safety was addressed but not emphasized",
    ],
    other=[
        "overall it was an average experience",
        "nothing special but it worked for my needs",
        "I might consider renting here again",
    ],
)

# Phrases used for negative reviews, keyed by topic (three phrases per topic).
negative_snippets = dict(
    bike_quality=[
        "the bike was in poor condition",
        "gears slipped and the brakes felt weak",
        "the bike clearly needed maintenance",
    ],
    staff=[
        "the staff seemed uninterested and unhelpful",
        "customer service was disappointing",
        "I felt rushed and not listened to",
    ],
    price=[
        "the rental was overpriced for what I got",
        "I felt the rates were too high",
        "definitely not worth the money",
    ],
    process=[
        "the rental process was confusing and slow",
        "paperwork took far too long",
        "there was a lot of unnecessary waiting",
    ],
    location=[
        "the shop was hard to find",
        "location is inconvenient and poorly marked",
        "parking and access were a hassle",
    ],
    safety=[
        "they did not seem to care about safety",
        "no proper safety checks were done",
        "I felt unsafe riding the bike",
    ],
    other=[
        "overall I had a bad experience",
        "I would not rent here again",
        "I cannot recommend this bike rental company",
    ],
)

def build_review_text(sentiment, topic, bike_type, customer_type, city, weather, duration_hours):
    """Compose one synthetic review sentence pair from the snippet pools.

    The review consists of a randomly chosen opener, a statement about the
    rental, one snippet for ``topic`` and one additional snippet for a randomly
    chosen topic, both taken from the pool matching ``sentiment``.

    Args:
        sentiment: "positive" or "neutral"; any other value selects the
            negative snippet pool.
        topic: Key into the snippet dictionaries for the main snippet.
        bike_type: Bike description inserted into the rental sentence.
        customer_type: Customer persona used by some openers.
        city: City name used by the opener.
        weather: Weather word used by one of the openers.
        duration_hours: Rental length; pluralises "hour" unless it equals 1.

    Returns:
        The review text with surrounding whitespace stripped.

    Raises:
        KeyError: If ``topic`` is not a key of the selected snippet dictionary.
    """
    if sentiment == "positive":
        base_snippets = positive_snippets
    elif sentiment == "neutral":
        base_snippets = neutral_snippets
    else:
        base_snippets = negative_snippets

    main_snippet = random.choice(base_snippets[topic])
    extra_topic = random.choice(topics)
    # The extra topic may coincide with the main topic. Exclude the snippet that
    # was already used so the review never states the same sentence twice
    # ("I felt that X. Additionally, X."). Fall back to the full pool if
    # filtering would leave nothing to choose from.
    extra_candidates = [
        snippet for snippet in base_snippets[extra_topic] if snippet != main_snippet
    ] or base_snippets[extra_topic]
    extra_snippet = random.choice(extra_candidates)

    opener_options = [
        f"As a {customer_type} visiting {city}",
        f"As a local {customer_type} in {city}",
        f"During my recent ride in {city}",
        f"On a {weather} day in {city}"
    ]
    opener = random.choice(opener_options)

    duration_phrase = f"for about {duration_hours} hour{'s' if duration_hours != 1 else ''}"

    review = (
        f"{opener}, I rented a {bike_type} {duration_phrase}. "
        f"I felt that {main_snippet}. Additionally, {extra_snippet}. "
    )

    # Light noise: occasionally append a random Faker sentence, and
    # independently introduce a mild typo by turning every "the " into "th ".
    if random.random() < 0.15:
        review += fake.sentence()
    if random.random() < 0.1:
        review = review.replace("the ", "th ")

    return review.strip()

def sentiment_and_rating():
    """Draw a sentiment label and a star rating consistent with it.

    The label is sampled with weights 0.6 / 0.25 / 0.15 for positive, neutral
    and negative. The rating is then drawn uniformly from an inclusive range
    tied to the label: 4-5 for positive, 2-4 for neutral, 1-3 for negative.

    Returns:
        A ``(sentiment, rating)`` tuple of ``str`` and ``int``.
    """
    labels = ["positive", "neutral", "negative"]
    (label,) = random.choices(labels, weights=[0.6, 0.25, 0.15])

    # Inclusive rating bounds per label; anything else uses the negative range.
    rating_bounds = {"positive": (4, 5), "neutral": (2, 4)}
    low, high = rating_bounds.get(label, (1, 3))

    return label, random.randint(low, high)

def generate_dataset(n=50000, seed=42):
    """Generate a DataFrame of synthetic bike-rental reviews.

    Seeds both the ``random`` module and Faker with ``seed`` before sampling.
    NOTE(review): the rental date window is given relative to "today", so
    rental dates (and ``is_weekend``) presumably shift with the run date even
    under a fixed seed - confirm if strict reproducibility matters.

    Args:
        n: Number of review rows to generate; ``review_id`` runs from 1 to n.
        seed: Seed passed to ``random.seed`` and ``Faker.seed``.

    Returns:
        A ``pandas.DataFrame`` with one row per review and the columns
        review_id, review_text, rating, sentiment, topic, bike_type,
        customer_type, rental_duration_hours, rental_city, rental_date
        (ISO-formatted string), weather and is_weekend.
    """
    random.seed(seed)
    Faker.seed(seed)

    duration_options = [1, 2, 3, 4, 5]
    records = []
    for review_id in range(1, n + 1):
        # Sampling order is kept fixed so a given seed yields the same draws.
        label, stars = sentiment_and_rating()
        subject = random.choice(topics)
        bike = random.choice(bike_types)
        persona = random.choice(customer_types)
        town = fake.city()
        sky = random.choice(weathers)
        hours = random.choice(duration_options)
        rented_on = fake.date_between(start_date="-2y", end_date="today")

        text = build_review_text(label, subject, bike, persona, town, sky, hours)

        records.append(dict(
            review_id=review_id,
            review_text=text,
            rating=stars,
            sentiment=label,
            topic=subject,
            bike_type=bike,
            customer_type=persona,
            rental_duration_hours=hours,
            rental_city=town,
            rental_date=rented_on.isoformat(),
            weather=sky,
            # weekday() is 0-6 with Saturday=5 and Sunday=6.
            is_weekend=rented_on.weekday() in (5, 6),
        ))

    return pd.DataFrame(records)

# Driver: build the full dataset, write it to CSV, and preview the first rows.
# These statements must sit at top level; left indented beneath a
# commented-out guard they are parsed as unreachable code following the
# return statement of generate_dataset and never execute.
df = generate_dataset(50000)
df.to_csv("synthetic_bike_rental_reviews.csv", index=False)
print(df.head())


   review_id                                        review_text  rating  \
0          1  As a commuter visiting North Judithbury, I ren...       2   
1          2  On a snowy day in Lake Curtis, I rented a kids...       4   
2          3  As a family visiting New Roberttown, I rented ...       2   
3          4  During my recent ride in Robinsonshire, I rent...       4   
4          5  As a tourist visiting Port Lindachester, I ren...       3   

  sentiment     topic      bike_type customer_type  rental_duration_hours  \
0   neutral    safety         e-bike      commuter                      2   
1  positive  location      kids bike       tourist                      2   
2   neutral     other      road bike        family                      3   
3   neutral     price  mountain bike        family                      1   
4  negative     staff  mountain bike       tourist                      3   

         rental_city rental_date weather  is_weekend  
0   North Judithbury  2024-07-2