<a href="https://colab.research.google.com/github/aaubs/ds-master/blob/main/notebooks/M1_Recap_Recommender_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing necessary libraries.
import pandas as pd
import scipy.sparse as ss
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_distances

In [2]:
# Load the trips dataset from the hosted CSV (needs network access on every fresh run).
# NOTE(review): the preview of this frame shows 'Unnamed: 0.1' and 'Unnamed: 0' columns,
# which look like index columns written out by earlier saves — confirm whether they can be dropped.
df_trips = pd.read_csv('https://sds-aau.github.io/SDS-master/M1/data/trips.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df_trips.head()

Unnamed: 0.1,Unnamed: 0,username,country,country_code,country_slug,date_end,date_start,latitude,longitude,place,place_slug
0,0,@lewellenmichael,Mexico,MX,mexico,2018-06-15,2018-06-04,21,-101,Guanajuato,mexico
1,1,@lewellenmichael,Mexico,MX,mexico,2018-06-03,2018-05-31,19,-99,Mexico City,mexico-city-mexico
2,2,@lewellenmichael,Mexico,MX,mexico,2017-11-05,2017-11-01,21,-86,Cancun,cancun-mexico
3,3,@lewellenmichael,Jordan,JO,jordan,2017-08-07,2017-07-24,31,35,Amman,amman-jordan
4,4,@waylandchin,China,CN,china,2017-03-18,2017-02-17,40,122,Yingkou,china


In [4]:
# ------------------------------------------------------------------
# Step 1: integer-encode users and places, build the trip matrix
# ------------------------------------------------------------------
# Separate encoders for usernames and place slugs; each maps its labels to 0..n-1.
le_user, le_place = LabelEncoder(), LabelEncoder()
df_trips['username_id'] = le_user.fit_transform(df_trips['username'])
df_trips['place_slug_id'] = le_place.fit_transform(df_trips['place_slug'])

# Every trip row contributes a 1 at position (username_id, place_slug_id),
# giving a sparse users-by-places matrix.
# NOTE(review): repeated (user, place) pairs become duplicate COO entries; scipy
# sums duplicates when the matrix is converted to CSR/CSC, so cells may end up
# holding visit counts rather than 0/1 — confirm this is intended.
ones = np.ones(df_trips.shape[0], dtype=np.uint64)
matrix = ss.coo_matrix(
    (ones, (df_trips['username_id'], df_trips['place_slug_id']))
)

# ------------------------------------------------------------------
# Step 2: compress each place into 5 latent features
# ------------------------------------------------------------------
# Transposing puts places on the rows, so the SVD output has one row per place.
# These latent features could potentially capture characteristics like how
# popular a place is. random_state is fixed so reruns give the same components.
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
matrix_places = svd.fit_transform(matrix.transpose())

# ------------------------------------------------------------------
# Step 3: pairwise cosine distances between places
# ------------------------------------------------------------------
# This is a distance, not a similarity: 0 means the two places point in the
# same direction in latent space, larger values mean less alike.
cosine_distance_matrix_places = cosine_distances(matrix_places)

In [5]:
# Sanity check: the distance matrix is square, with one row and one column per encoded place.
cosine_distance_matrix_places.shape

(961, 961)

In [6]:
# Inspect the pairwise distances: entry [i, j] is the cosine distance between the places
# encoded as ids i and j by le_place. The diagonal is 0 (each place vs. itself); values can
# exceed 1 because the latent feature vectors may point in opposing directions.
cosine_distance_matrix_places

array([[0.        , 0.93737257, 0.94042055, ..., 0.55101577, 0.94479183,
        0.66700793],
       [0.93737257, 0.        , 0.2219977 , ..., 0.55865144, 1.01683665,
        0.55894041],
       [0.94042055, 0.2219977 , 0.        , ..., 0.42547908, 0.90915149,
        0.12785817],
       ...,
       [0.55101577, 0.55865144, 0.42547908, ..., 0.        , 0.35913659,
        0.36126287],
       [0.94479183, 1.01683665, 0.90915149, ..., 0.35913659, 0.        ,
        0.89817704],
       [0.66700793, 0.55894041, 0.12785817, ..., 0.36126287, 0.89817704,
        0.        ]])