In [8]:
import sys
import os
sys.path.append(os.path.abspath('../utilities'))
import global_utils

# Data Manipulation and Analysis
import numpy as np  
import pandas as pd 

# Data Visualization
import matplotlib.pyplot as plt 
import seaborn as sns  
import plotly.express as px

In [10]:
df = global_utils.import_csv('./../data/customer1.csv')
global_utils.define_df_settings()

In [12]:
df.head()

Unnamed: 0,customer_ID,first_name,last_name,email,question_1,question_2,question_3,question_4,question_5
0,1000001,Abigail,Lewis,abigail.lewis1000001@example.com,"Walking, Jogging, Pilates",2–3 days,Lunch,Medium,"Weight Loss, Improve Health, Training"
1,1000002,Mason,Brown,mason.brown1000002@example.com,"Jogging, Walking, Running",0–1 days,Mid-Morning,High,"Social, Reduce Stress, Build Strength"
2,1000003,David,Jones,david.jones1000003@example.com,"Hiking, Cycling, Walking",2–3 days,Lunch,Medium,"Social, Weight Loss, Build Strength"
3,1000004,Liam,Martin,liam.martin1000004@example.com,"Running, Group Fitness Class, Hiking",4–5 days,Lunch,Low,"Improve Health, Training, Social"
4,1000005,Samuel,Perez,samuel.perez1000005@example.com,"Running, Swimming, Jogging",4–5 days,Lunch,High,"Training, Build Strength, Social"


In [18]:
q_cols = [f"question_{i}" for i in range(1, 6)]

df["questions_concat"] = (
    df[q_cols]
      .fillna("")                                             
      .apply(lambda row: " ".join(str(v).replace(",", " ").strip() 
                                  for v in row), axis=1)      
      .str.replace(r"\s+", " ", regex=True)                   
      .str.strip()                                            
)

df.head()

Unnamed: 0,customer_ID,first_name,last_name,email,question_1,question_2,question_3,question_4,question_5,questions_concat
0,1000001,Abigail,Lewis,abigail.lewis1000001@example.com,"Walking, Jogging, Pilates",2–3 days,Lunch,Medium,"Weight Loss, Improve Health, Training",Walking Jogging Pilates 2–3 days Lunch Medium Weight Loss Improve Health Training
1,1000002,Mason,Brown,mason.brown1000002@example.com,"Jogging, Walking, Running",0–1 days,Mid-Morning,High,"Social, Reduce Stress, Build Strength",Jogging Walking Running 0–1 days Mid-Morning High Social Reduce Stress Build Strength
2,1000003,David,Jones,david.jones1000003@example.com,"Hiking, Cycling, Walking",2–3 days,Lunch,Medium,"Social, Weight Loss, Build Strength",Hiking Cycling Walking 2–3 days Lunch Medium Social Weight Loss Build Strength
3,1000004,Liam,Martin,liam.martin1000004@example.com,"Running, Group Fitness Class, Hiking",4–5 days,Lunch,Low,"Improve Health, Training, Social",Running Group Fitness Class Hiking 4–5 days Lunch Low Improve Health Training Social
4,1000005,Samuel,Perez,samuel.perez1000005@example.com,"Running, Swimming, Jogging",4–5 days,Lunch,High,"Training, Build Strength, Social",Running Swimming Jogging 4–5 days Lunch High Training Build Strength Social


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words = "english")

TF_IDF_matrix = vectorizer.fit_transform(df['questions_concat'])

In [26]:
TF_IDF_matrix.shape

(500, 31)

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(TF_IDF_matrix,dense_output=False)

print(similarity)

  (0, 487)	0.2837485065238119
  (0, 404)	0.1543489844486175
  (0, 333)	0.2838601846661028
  (0, 321)	0.15402097193477632
  (0, 237)	0.022385855388238486
  (0, 208)	0.42111718414094
  (0, 178)	0.021888115516049873
  (0, 54)	0.12659343645870988
  (0, 15)	0.024704567310010563
  (0, 1)	0.27819004710182704
  (0, 390)	0.400634790513381
  (0, 365)	0.2647829808286474
  (0, 289)	0.12920602081482938
  (0, 189)	0.13558747323295817
  (0, 172)	0.40230454469028415
  (0, 120)	0.22623927334885496
  (0, 11)	0.120247310407996
  (0, 449)	0.23456725194922307
  (0, 302)	0.11653983537586782
  (0, 140)	0.1175664149786507
  (0, 125)	0.38456887904210235
  (0, 56)	0.12491565979382968
  (0, 493)	0.13163467550385777
  (0, 490)	0.23666236302749766
  (0, 489)	0.3953932705008196
  :	:
  (499, 107)	0.6233027960353765
  (499, 105)	0.5611181800488493
  (499, 103)	0.41151808015083363
  (499, 99)	0.5709368656730043
  (499, 98)	0.6049769210058578
  (499, 97)	0.42919311124463266
  (499, 94)	0.6324884784355861
  (499, 90)	0

In [35]:
similarity.shape

(500, 500)

In [46]:
user_1 = TF_IDF_matrix[(df['customer_ID'] == 1000001).values,]
user_2 = TF_IDF_matrix[(df['customer_ID'] == 1000002).values,]

print("Similarity:", cosine_similarity(user_1, user_2)) # Notice the result is a 2D 1X1 array, so to grab
                                                          # the number we will need to index

Similarity: [[0.27819005]]


In [48]:
user_1 = TF_IDF_matrix[(df['customer_ID'] == 1000001).values,]
user_3 = TF_IDF_matrix[(df['customer_ID'] == 1000003).values,]

print("Similarity:", cosine_similarity(user_1, user_3)) # Notice the result is a 2D 1X1 array, so to grab
                                                          # the number we will need to index

Similarity: [[0.50057243]]


In [56]:
# Get the column based upon the index
customer_index = df[df['customer_ID'] == 1000001].index

# Create a dataframe with the movie titles
sim_df = pd.DataFrame({'customer_ID':df['customer_ID'],
                       'similarity': np.array(similarity[customer_index, :].todense()).squeeze()})

In [64]:
sim_df[sim_df['similarity'] > 0.7]

Unnamed: 0,customer_ID,similarity
0,1000001,1.0
62,1000063,0.750298
92,1000093,0.708946
161,1000162,0.714471
200,1000201,0.759645
245,1000246,0.701282
307,1000308,0.884253
319,1000320,0.747453
424,1000425,0.759921
