In [29]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [17]:
# Load the London points-of-interest table from the project-relative data folder.
df = pd.read_csv(filepath_or_buffer="./london_data/london_pois_all.csv")

In [25]:
# 2. Restrict the data to POIs with enough reviews
# Rows whose review count is below the threshold (or missing, since a NaN
# comparison is False) are discarded; .copy() detaches the result from the
# original frame so later column assignments do not hit a view.
min_reviews = 5  # Minimum reviews threshold for reliability
df = df.loc[df['user_ratings_total'] >= min_reviews].copy()

In [30]:
# Min-max scale the review count. Despite its name, 'normalised_rating' holds
# the scaled `user_ratings_total`, not the star rating.
# The fitted scaler is kept in `scaler` so it stays available afterwards.
scaler = MinMaxScaler().fit(df[['user_ratings_total']])
df['normalised_rating'] = scaler.transform(df[['user_ratings_total']])

### Creating a popularity score

In [32]:
# Popularity = star rating weighted by the min-max scaled review count.
df['popularity_score'] = df['rating'].mul(df['normalised_rating'])
df.head(n=5)


Unnamed: 0,name,area,type_queried,latitude,longitude,types,rating,user_ratings_total,vicinity,normalised_rating,popularity_score
0,Old War Office Building,Central London,tourist_attraction,51.505419,-0.12597,"tourist_attraction,point_of_interest,establish...",4.6,505.0,"Whitehall, London",0.002706,0.012445
1,Theatre Royal Drury Lane,Central London,tourist_attraction,51.512854,-0.120372,"tourist_attraction,point_of_interest,store,est...",4.7,9781.0,"Catherine Street, London",0.052898,0.248623
2,The British Museum,Central London,tourist_attraction,51.519413,-0.126957,"museum,tourist_attraction,point_of_interest,es...",4.7,160134.0,"Great Russell Street, London",0.866466,4.072391
3,Churchill War Rooms,Central London,tourist_attraction,51.502159,-0.129357,"tourist_attraction,museum,point_of_interest,es...",4.6,13886.0,"King Charles Street, London",0.075111,0.34551
4,His Majesty's Theatre,Central London,tourist_attraction,51.508243,-0.131863,"tourist_attraction,point_of_interest,store,est...",4.7,14074.0,"Haymarket, London",0.076128,0.357802


In [33]:
# 3. Extract POI types for features
# `types` is a comma-separated string of tags per POI. Split it into a list.
# A missing value becomes an empty list instead of raising AttributeError
# (NaN is a float and has no .split), and blank / whitespace-padded tokens are
# dropped so they can never be counted as a "type" later on.
df['types_list'] = df['types'].fillna('').apply(
    lambda raw: [tag.strip() for tag in str(raw).split(',') if tag.strip()]
)


In [37]:
# Get the most common 20 types
# Flatten every per-row tag list into one long Series, count occurrences and
# keep the labels of the 20 most frequent tags (most frequent first).
top_types = (
    pd.Series([tag for tags in df['types_list'] for tag in tags])
    .value_counts()
    .head(20)
    .index
    .tolist()
)

#### One Hot Encoding

In [39]:
# Create binary features for top types
# One 0/1 indicator column per frequent tag, named `is_<tag>`: 1 when the tag
# appears in the row's tag list, 0 otherwise.
for poi_type in top_types:
    df[f'is_{poi_type}'] = df['types_list'].apply(lambda tags: int(poi_type in tags))

In [42]:
# Remove the helper list column now that the indicator columns exist.
# Reassigning with errors="ignore" (instead of an in-place drop) keeps this
# cell re-runnable: executing it a second time no longer raises KeyError.
df = df.drop(columns=["types_list"], errors="ignore")

#### Creating features

In [48]:
# 4. Prepare features and target
# The model inputs are the `is_<tag>` indicator columns, in `top_types` order;
# the regression target is the popularity score.
feature_cols = [f'is_{tag}' for tag in top_types]

X = df.loc[:, feature_cols]
y = df.loc[:, 'popularity_score']

#### Model Training

In [49]:
# 5. Split data
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 6. Create and train model
# A 100-tree random forest, seeded so repeated runs build the same forest.
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

In [50]:
# 7. Evaluate
# Score the fitted model on the training rows and on the held-out rows; a
# large gap between the two would indicate over-fitting.
train_score, test_score = (
    model.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)
print(f"Train R² score: {train_score:.4f}")
print(f"Test R² score: {test_score:.4f}")

Train R² score: 0.1330
Test R² score: 0.1131


In [51]:
# 8. Feature importance
# Pair each indicator column with the forest's importance value, order from
# most to least important, and show the ten strongest.
feature_importance = pd.DataFrame(
    dict(Feature=feature_cols, Importance=model.feature_importances_)
).sort_values(by='Importance', ascending=False)
print(feature_importance.iloc[:10])

                  Feature  Importance
2   is_tourist_attraction    0.350382
10         is_art_gallery    0.110452
12               is_store    0.090617
11              is_museum    0.080194
5                  is_bar    0.069168
3                 is_food    0.062259
6     is_place_of_worship    0.059350
4           is_restaurant    0.045921
8                 is_park    0.045279
13                is_cafe    0.042385
