In [28]:
import sys
import os
sys.path.append(os.path.abspath('../utilities'))
import global_utils

# Data Manipulation and Analysis
import numpy as np  
import pandas as pd 

# Data Visualization
import matplotlib.pyplot as plt 
import seaborn as sns  
import plotly.express as px

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import joblib
from scipy import sparse

In [11]:
df = global_utils.import_csv('./../data/customer1.csv')
global_utils.define_df_settings()

In [13]:
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   customer_ID  500 non-null    int64 
 1   first_name   500 non-null    object
 2   last_name    500 non-null    object
 3   email        500 non-null    object
 4   question_1   500 non-null    object
 5   question_2   500 non-null    object
 6   question_3   500 non-null    object
 7   question_4   500 non-null    object
 8   question_5   500 non-null    object
dtypes: int64(1), object(8)
memory usage: 35.3+ KB


Let's PreProcess the data and and combine all of our questions column into one.

In [15]:
q_cols = [f"question_{i}" for i in range(1, 6)]

df["questions_concat"] = (
    df[q_cols]
      .fillna("")                                             
      .apply(lambda row: " ".join(str(v).replace(",", " ").strip() 
                                  for v in row), axis=1)      
      .str.replace(r"\s+", " ", regex=True)                   
      .str.strip()                                            
)

df.head()

Unnamed: 0,customer_ID,first_name,last_name,email,question_1,question_2,question_3,question_4,question_5,questions_concat
0,1000001,Abigail,Lewis,abigail.lewis1000001@example.com,"Walking, Jogging, Pilates",2–3 days,Lunch,Medium,"Weight Loss, Improve Health, Training",Walking Jogging Pilates 2–3 days Lunch Medium Weight Loss Improve Health Training
1,1000002,Mason,Brown,mason.brown1000002@example.com,"Jogging, Walking, Running",0–1 days,Mid-Morning,High,"Social, Reduce Stress, Build Strength",Jogging Walking Running 0–1 days Mid-Morning High Social Reduce Stress Build Strength
2,1000003,David,Jones,david.jones1000003@example.com,"Hiking, Cycling, Walking",2–3 days,Lunch,Medium,"Social, Weight Loss, Build Strength",Hiking Cycling Walking 2–3 days Lunch Medium Social Weight Loss Build Strength
3,1000004,Liam,Martin,liam.martin1000004@example.com,"Running, Group Fitness Class, Hiking",4–5 days,Lunch,Low,"Improve Health, Training, Social",Running Group Fitness Class Hiking 4–5 days Lunch Low Improve Health Training Social
4,1000005,Samuel,Perez,samuel.perez1000005@example.com,"Running, Swimming, Jogging",4–5 days,Lunch,High,"Training, Build Strength, Social",Running Swimming Jogging 4–5 days Lunch High Training Build Strength Social


Now we will vectorize our newly created combined coulmn using the TF-IDF vectorizer. It will help us to compare the coine similarity between the new user and the existing user.

In [17]:
vectorizer = TfidfVectorizer(stop_words = "english")
TF_IDF_matrix = vectorizer.fit_transform(df['questions_concat'])

In [19]:
TF_IDF_matrix.shape

(500, 31)

As we can see after vectorizing process we have 31 features to work with.

Let's print the cosine-similarity between our existing users as a summary.

In [23]:
similarity = cosine_similarity(TF_IDF_matrix,dense_output=False)
print(similarity)

  (0, 499)	0.3705929112443899
  (0, 496)	0.18985349864472761
  (0, 493)	0.13163467550385777
  (0, 486)	0.27857192728721647
  (0, 478)	0.41405733929484523
  (0, 475)	0.1651826263479633
  (0, 474)	0.3447717399500473
  (0, 461)	0.4982390235957516
  (0, 458)	0.2281801712827467
  (0, 457)	0.38278131213864647
  (0, 455)	0.5567777876227006
  (0, 454)	0.19137238340199075
  (0, 453)	0.28773565397146295
  (0, 450)	0.3075841389087709
  (0, 437)	0.3375334564327407
  (0, 435)	0.46178131350020424
  (0, 433)	0.07547989632032162
  (0, 432)	0.22546100473459718
  (0, 426)	0.1479200283954869
  (0, 422)	0.3679363272178139
  (0, 418)	0.42356526270648087
  (0, 417)	0.22387126650374126
  (0, 413)	0.07318285736133794
  (0, 409)	0.3144668803255763
  (0, 407)	0.24911537267447703
  :	:
  (499, 24)	0.6023774059886153
  (499, 23)	0.5261575490396135
  (499, 22)	0.41217493144443346
  (499, 21)	0.47181749357103336
  (499, 20)	0.2904441780867202
  (499, 19)	0.1723906514821607
  (499, 18)	0.3998964333093258
  (499, 17)

Also try to compare the couple of users with their using the cosine-similarity to check how our matrix is working.

In [46]:
user_1 = TF_IDF_matrix[(df['customer_ID'] == 1000001).values,]
user_2 = TF_IDF_matrix[(df['customer_ID'] == 1000002).values,]

print("Similarity:", cosine_similarity(user_1, user_2))
                                                         

Similarity: [[0.27819005]]


In [48]:
user_1 = TF_IDF_matrix[(df['customer_ID'] == 1000001).values,]
user_3 = TF_IDF_matrix[(df['customer_ID'] == 1000003).values,]

print("Similarity:", cosine_similarity(user_1, user_3))                                                  

Similarity: [[0.50057243]]


In [56]:
# Get the column based upon the index
customer_index = df[df['customer_ID'] == 1000001].index

# Create a dataframe with the movie titles
sim_df = pd.DataFrame({'customer_ID':df['customer_ID'],
                       'similarity': np.array(similarity[customer_index, :].todense()).squeeze()})

In [64]:
sim_df[sim_df['similarity'] > 0.7]

Unnamed: 0,customer_ID,similarity
0,1000001,1.0
62,1000063,0.750298
92,1000093,0.708946
161,1000162,0.714471
200,1000201,0.759645
245,1000246,0.701282
307,1000308,0.884253
319,1000320,0.747453
424,1000425,0.759921


Now it is the time to export our data and vectorizer file to use it to compare our new user.

In [32]:
joblib.dump(vectorizer, "./../models/tfidf_vectorizer.pkl")
sparse.save_npz("./../models/tfidf_matrix.npz", TF_IDF_matrix)
df.to_pickle("./../models/users_df.pkl")

In [39]:
pip install geopy

Collecting geopy
  Obtaining dependency information for geopy from https://files.pythonhosted.org/packages/e5/15/cf2a69ade4b194aa524ac75112d5caac37414b20a3a03e6865dfe0bd1539/geopy-2.4.1-py3-none-any.whl.metadata
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Obtaining dependency information for geographiclib<3,>=1.52 from https://files.pythonhosted.org/packages/9f/5a/a26132406f1f40cf51ea349a5f11b0a46cec02a2031ff82e391c2537247a/geographiclib-2.0-py3-none-any.whl.metadata
  Downloading geographiclib-2.0-py3-none-any.whl.metadata (1.4 kB)
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.4/125.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading geographiclib-2.0-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pa

In [2]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

import csv 
import time

geolocator = Nominatim(user_agent="geo_app", timeout=10)  # Increased timeout
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=2)  # 2-second delay

with open('./../data/postalcodes.csv', 'r') as f_in, open('./../data/postalcodes_new.csv', 'w') as f_out:
    reader = csv.reader(f_in)
    writer = csv.writer(f_out)
    writer.writerow(['postal_code', 'lat', 'lng'])
    next(reader)  # Skip header

    for row in reader:
        try:
            # Clean the postal code
            postal_code = row[0].replace('\xa0', '').replace(' ', '')  # Fix spaces
            location = geocode(f"{postal_code}, Canada")
            
            if location:
                writer.writerow([postal_code, location.latitude, location.longitude])
            else:
                writer.writerow([postal_code, "Not found", "Not found"])
        
        except Exception as e:
            print(f"Error for {postal_code}: {str(e)}")
            writer.writerow([postal_code, "Error", "Error"])
            time.sleep(5)  # Wait longer if an error occurs

RateLimiter caught an error, retrying (0/2 tries). Called with (*('M9A1M9, Canada',), **{}).
Traceback (most recent call last):
  File "/opt/anaconda3/envs/heart_prediction_env/lib/python3.11/site-packages/urllib3/connectionpool.py", line 466, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/opt/anaconda3/envs/heart_prediction_env/lib/python3.11/site-packages/urllib3/connectionpool.py", line 461, in _make_request
    httplib_response = conn.getresponse()
                       ^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/heart_prediction_env/lib/python3.11/http/client.py", line 1378, in getresponse
    response.begin()
  File "/opt/anaconda3/envs/heart_prediction_env/lib/python3.11/http/client.py", line 318, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/heart_prediction_env/lib/python3.11/http/client.py", line 279, in _read_status
    line = str(sel