In [None]:
import pandas as pd

# Load the dataset
data = pd.read_csv('./data/dataset_TSMC2014_NYC.csv')

# Display the first few rows
data.head()

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,Tue Apr 03 18:02:41 +0000 2012
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,Tue Apr 03 18:03:00 +0000 2012


In [None]:
# Convert the UTC timestamps to datetime format
data['utcTimestamp'] = pd.to_datetime(data['utcTimestamp'], format='%a %b %d %H:%M:%S +0000 %Y')

# Extract useful time features
data['year'] = data['utcTimestamp'].dt.year
data['month'] = data['utcTimestamp'].dt.month
data['day'] = data['utcTimestamp'].dt.day
data['hour'] = data['utcTimestamp'].dt.hour

In [None]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="venue_locator")

In [None]:
# Example latitude and longitude from your data
latitude = 40.71981038
longitude = -74.00258103

# Reverse geocoding to get the location name
location = geolocator.reverse((latitude, longitude), exactly_one=True)

# Print the address
if location:
    print(f"Exact location: {location.address}")
else:
    print("Location not found.")

Exact location: 308, Canal Street, Manhattan Community Board 1, Manhattan, New York County, City of New York, New York, 10013, United States


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Normalizing latitude and longitude
scaler = MinMaxScaler()
data[['latitude', 'longitude']] = scaler.fit_transform(data[['latitude', 'longitude']])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode userId and venueId
le_user = LabelEncoder()
le_venue = LabelEncoder()

data['userId'] = le_user.fit_transform(data['userId'])
data['venueId'] = le_venue.fit_transform(data['venueId'])

## Building the LSTM Model

This model will predict the next venue a user might visit based on their check-in history.

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Prepare sequences for LSTM
def create_sequences(data, user_col, venue_col, seq_length=10):
    sequences = []
    targets = []
    user_groups = data.groupby(user_col)

    for user_id, group in user_groups:
        venues = group[venue_col].values
        for i in range(len(venues) - seq_length):
            seq = venues[i:i + seq_length]
            target = venues[i + seq_length]
            sequences.append(seq)
            targets.append(target)

    return np.array(sequences), np.array(targets)

seq_length = 10
X, y = create_sequences(data, 'userId', 'venueId', seq_length=seq_length)

# Pad sequences to ensure they have the same length
X = pad_sequences(X, maxlen=seq_length)

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Define the LSTM model
model = Sequential([
    Embedding(input_dim=len(le_venue.classes_), output_dim=50, input_length=seq_length),
    LSTM(128, return_sequences=False),
    Dense(64, activation='relu'),
    Dense(len(le_venue.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10




[1m5415/5415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 52ms/step - accuracy: 0.0084 - loss: 9.5530 - val_accuracy: 0.0074 - val_loss: 11.0494
Epoch 2/10
[1m5415/5415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m308s[0m 57ms/step - accuracy: 0.0692 - loss: 7.8271 - val_accuracy: 0.0109 - val_loss: 11.7918
Epoch 3/10
[1m5415/5415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 46ms/step - accuracy: 0.1353 - loss: 6.4702 - val_accuracy: 0.0149 - val_loss: 12.2384
Epoch 4/10
[1m5415/5415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m303s[0m 56ms/step - accuracy: 0.1712 - loss: 5.5525 - val_accuracy: 0.0129 - val_loss: 12.9545
Epoch 5/10
[1m5415/5415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m334s[0m 62ms/step - accuracy: 0.1995 - loss: 4.8737 - val_accuracy: 0.0117 - val_loss: 13.7795
Epoch 6/10
[1m5415/5415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 46ms/step - accuracy: 0.2263 - loss: 4.3214 - val_accuracy: 0.0097 - val_loss: 14.7360
Epo

<keras.src.callbacks.history.History at 0x1dc4c4d1250>

## Building the KNN Model

The KNN model will be used to find similar users or venues based on geographical data.



In [9]:
from sklearn.neighbors import NearestNeighbors

# Features for KNN (latitude, longitude, budget)
X_knn = data[['latitude', 'longitude']]

# Initialize and fit the KNN model
knn = NearestNeighbors(n_neighbors=5, algorithm='auto').fit(X_knn)

## Extracting a Sample User's Data

In [10]:
# Select a sample user
sample_user_id = data['userId'].iloc[1]
sample_user_data = data[data['userId'] == sample_user_id]

# Display the sample user's data
sample_user_data.head()

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,year,month,day,hour
1,978,3921,4bf58dd8d48988d1df941735,Bridge,0.127885,0.390219,-240,2012-04-03 18:00:25,2012,4,3,18
712,978,3715,4bf58dd8d48988d146941735,Deli / Bodega,0.383874,0.477134,-240,2012-04-04 00:03:53,2012,4,4,0
2004,978,17576,4bf58dd8d48988d1e2941735,Beach,0.052763,0.322333,-240,2012-04-04 17:21:27,2012,4,4,17
2012,978,28383,4bf58dd8d48988d1e2941735,Beach,0.090808,0.35293,-240,2012-04-04 17:24:08,2012,4,4,17
2052,978,3668,4bf58dd8d48988d1df941735,Bridge,0.354384,0.481883,-240,2012-04-04 17:46:09,2012,4,4,17


In [11]:
# Function to get exact location from latitude and longitude
def get_exact_location(latitude, longitude):
    location = geolocator.reverse((latitude, longitude), exactly_one=True)
    return location.address if location else "Location not found"

## Testing the LSTM Model with Venue Category

In [12]:
# Prepare the last 10 venues visited by the sample user
sample_sequence = sample_user_data['venueId'].values[-seq_length:]
sample_sequence = pad_sequences([sample_sequence], maxlen=seq_length)

# Predict the next venue
lstm_prediction = model.predict(sample_sequence).argmax(axis=1)[0]
predicted_venue_id = le_venue.inverse_transform([lstm_prediction])[0]

# Find the venue category and location
predicted_venue_row = data[data['venueId'] == lstm_prediction].iloc[0]
predicted_venue_category = predicted_venue_row['venueCategory']
predicted_venue_latitude = predicted_venue_row['latitude']
predicted_venue_longitude = predicted_venue_row['longitude']

# Reverse transform the scaled latitude and longitude to get original values
original_location = scaler.inverse_transform([[predicted_venue_latitude, predicted_venue_longitude]])
original_latitude = original_location[0][0]
original_longitude = original_location[0][1]

# Get the exact location using reverse geocoding
exact_location = get_exact_location(original_latitude, original_longitude)

print(f"LSTM Model Prediction: Next venue for user {sample_user_id} is likely '{predicted_venue_category}' (Venue ID: {predicted_venue_id})")
print(f"Destination: {exact_location}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 442ms/step
LSTM Model Prediction: Next venue for user 978 is likely 'Cupcake Shop' (Venue ID: 4b26a833f964a5200e7f24e3)
Destination: Garment District, Manhattan, New York County, City of New York, New York, 10018, United States


## Testing the KNN Model

In [13]:
# Use the last known location of the sample user
last_location = sample_user_data[['latitude', 'longitude']].values[-1]

# Find the nearest neighbors
knn_distances, knn_indices = knn.kneighbors([last_location])

# Get the recommended venues
recommended_venue_ids = data.iloc[knn_indices[0]]['venueId'].unique()

# Get venue categories for the recommended venues
recommended_venue_categories = data[data['venueId'].isin(recommended_venue_ids)]['venueCategory'].unique()

print(f"KNN Model Recommendations: Nearby venues for user {sample_user_id} are {recommended_venue_categories}.")

# Iterate through recommended venue IDs and print their exact locations
for venue_id in recommended_venue_ids:
    venue_row = data[data['venueId'] == venue_id].iloc[0]
    venue_latitude = venue_row['latitude']
    venue_longitude = venue_row['longitude']

    # Reverse transform the scaled latitude and longitude to get original values
    original_location = scaler.inverse_transform([[venue_latitude, venue_longitude]])
    original_latitude = original_location[0][0]
    original_longitude = original_location[0][1]

    exact_location = get_exact_location(original_latitude, original_longitude)
    print(f"Venue ID: {venue_id}, Location: {exact_location}")

KNN Model Recommendations: Nearby venues for user 978 are ['Beach'].




Venue ID: 28383, Location: Father Capodanno Boulevard, South Beach, Staten Island, Richmond County, City of New York, New York, 10305, United States


## Retraining the model after Rating

In [14]:
# Get user feedback (rating) for the LSTM prediction
user_rating = int(input(f"Rate the LSTM recommendation (1-5, where 5 is excellent): "))

# Adjust model weights based on user feedback (simplified example)
if user_rating >= 4:
  # Positive feedback: Slightly increase the weight of the predicted venue in the output layer
  predicted_venue_index = le_venue.transform([predicted_venue_id])[0]
  model.layers[-1].weights[0][:, predicted_venue_index] *= 1.1  # Increase weight by 10%
elif user_rating <= 2:
  # Negative feedback: Slightly decrease the weight of the predicted venue in the output layer
  predicted_venue_index = le_venue.transform([predicted_venue_id])[0]
  model.layers[-1].weights[0][:, predicted_venue_index] *= 0.9  # Decrease weight by 10%

# Recompile the model with the updated weights
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Retrain the model with the updated weights
model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)

# Predict the next venue again after incorporating feedback
lstm_prediction_updated = model.predict(sample_sequence).argmax(axis=1)[0]
predicted_venue_id_updated = le_venue.inverse_transform([lstm_prediction_updated])[0]

# Find the venue category and location
predicted_venue_row = data[data['venueId'] == lstm_prediction_updated].iloc[0]
predicted_venue_category = predicted_venue_row['venueCategory']
predicted_venue_latitude = predicted_venue_row['latitude']
predicted_venue_longitude = predicted_venue_row['longitude']

# Reverse transform the scaled latitude and longitude to get original values
original_location = scaler.inverse_transform([[predicted_venue_latitude, predicted_venue_longitude]])
original_latitude = original_location[0][0]
original_longitude = original_location[0][1]

# Get the exact location using reverse geocoding
exact_location = get_exact_location(original_latitude, original_longitude)

print(f"LSTM Model Prediction After Rating: Next venue for user {sample_user_id} is likely '{predicted_venue_category}' (Venue ID: {predicted_venue_id_updated})")
print(f"Destination: {exact_location}")

TypeError: 'Variable' object does not support item assignment

In [15]:
import pickle

# Save the LSTM model
with open('./modelv2//lstm_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Save the KNN model
with open('./modelv2/knn_model.pkl', 'wb') as file:
    pickle.dump(knn, file)

In [16]:
# Save the scaler
with open('modelv2/scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Save the user label encoder
with open('modelv2/le_user.pkl', 'wb') as le_user_file:
    pickle.dump(le_user, le_user_file)

# Save the venue label encoder
with open('modelv2/le_venue.pkl', 'wb') as le_venue_file:
    pickle.dump(le_venue, le_venue_file)
