### Installing Required Libraries

In [39]:
import pandas as pd

In [40]:
from  sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

In [41]:
from datetime import datetime

### Loading the Dataset

In [42]:
dataset = pd.read_csv('SocialMediaUsersDataset.csv')
dataset = dataset.head(10000)

In [43]:
dataset

Unnamed: 0,UserID,Name,Gender,DOB,Interests,City,Country
0,1,Charles Anderson,Female,1993-01-21,"['Outdoor activities', 'Photography', 'Books',...",Anak,North Korea
1,2,Tammy Kuykendall,Female,2002-11-13,"['Fitness', 'Gaming']",Tha Bo,Thailand
2,3,Juan Ross,Male,1992-06-18,['Food and dining'],Mohyliv-Podilskyy,Ukraine
3,4,Vernetta Aycock,Male,1971-12-28,"['Nature', 'Education and learning', 'Health a...",Highland Park,United States
4,5,Anne Miyamoto,Female,1964-05-12,"['History', 'Fashion', 'Parenting and family',...",Sorel-Tracy,Canada
...,...,...,...,...,...,...,...
9995,9996,James Jackson,Female,2000-09-28,"['Finance and investments', 'Gardening', 'Tech...",Malinau,Indonesia
9996,9997,Candice Dixon,Female,1965-04-25,['Pets'],Falkenhagener Feld,Germany
9997,9998,Heather Wages,Male,1980-11-30,"['Education and learning', 'Gardening', 'Techn...",Milford,United States
9998,9999,Luis Baley,Male,1987-09-05,"['Parenting and family', 'Fitness', 'Music']",Flint,United States


# Feature Extraction

### Interests Feature Extraction

In [44]:
interests = dataset['Interests'].str.get_dummies(', ')  # One-hot encode interests
interests.fillna(0, inplace=True)  # Replace NaN values with 0

In [45]:
interests

Unnamed: 0,'Art','Art'],'Beauty','Beauty'],'Books','Books'],'Business and entrepreneurship','Business and entrepreneurship'],'Cars and automobiles','Cars and automobiles'],...,['Science',['Science'],['Social causes and activism',['Social causes and activism'],['Sports',['Sports'],['Technology',['Technology'],['Travel',['Travel']
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Calculating Age and adding it as a column

In [46]:
dob = pd.to_datetime(dataset['DOB'])
current_date = datetime.now()
dataset['Age'] = (current_date - dob).astype('<m8[Y]')
dataset

Unnamed: 0,UserID,Name,Gender,DOB,Interests,City,Country,Age
0,1,Charles Anderson,Female,1993-01-21,"['Outdoor activities', 'Photography', 'Books',...",Anak,North Korea,30.0
1,2,Tammy Kuykendall,Female,2002-11-13,"['Fitness', 'Gaming']",Tha Bo,Thailand,20.0
2,3,Juan Ross,Male,1992-06-18,['Food and dining'],Mohyliv-Podilskyy,Ukraine,30.0
3,4,Vernetta Aycock,Male,1971-12-28,"['Nature', 'Education and learning', 'Health a...",Highland Park,United States,51.0
4,5,Anne Miyamoto,Female,1964-05-12,"['History', 'Fashion', 'Parenting and family',...",Sorel-Tracy,Canada,59.0
...,...,...,...,...,...,...,...,...
9995,9996,James Jackson,Female,2000-09-28,"['Finance and investments', 'Gardening', 'Tech...",Malinau,Indonesia,22.0
9996,9997,Candice Dixon,Female,1965-04-25,['Pets'],Falkenhagener Feld,Germany,58.0
9997,9998,Heather Wages,Male,1980-11-30,"['Education and learning', 'Gardening', 'Techn...",Milford,United States,42.0
9998,9999,Luis Baley,Male,1987-09-05,"['Parenting and family', 'Fitness', 'Music']",Flint,United States,35.0


### Encoded Gender Feature Extraction

In [47]:
gender = dataset[['Gender']]
gender_encoded = pd.get_dummies(gender)

In [48]:
gender_encoded

Unnamed: 0,Gender_Female,Gender_Male
0,1,0
1,1,0
2,0,1
3,0,1
4,1,0
...,...,...
9995,1,0
9996,1,0
9997,0,1
9998,0,1


In [49]:
age = dataset[['Age']]

In [50]:
age

Unnamed: 0,Age
0,30.0
1,20.0
2,30.0
3,51.0
4,59.0
...,...
9995,22.0
9996,58.0
9997,42.0
9998,35.0


### Encoded Location Feature Extraction

In [51]:
location = dataset[['City', 'Country']]
location_encoded = pd.get_dummies(location)

In [52]:
location_encoded

Unnamed: 0,City_'s-Gravenzande,City_A Coruña,City_Aalborg,City_Aarschot,City_Aba,City_Abakaliki,City_Abasolo,City_Abdulino,City_Aberdare,City_Aberdeen,...,Country_United Kingdom,Country_United States,Country_Uruguay,Country_Uzbekistan,Country_Venezuela,Country_Vietnam,Country_Western Sahara,Country_Yemen,Country_Zambia,Country_Zimbabwe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


# User similarity calculation

In [53]:
features = pd.concat([interests], axis=1)

In [54]:
features

Unnamed: 0,'Art','Art'],'Beauty','Beauty'],'Books','Books'],'Business and entrepreneurship','Business and entrepreneurship'],'Cars and automobiles','Cars and automobiles'],...,['Science',['Science'],['Social causes and activism',['Social causes and activism'],['Sports',['Sports'],['Technology',['Technology'],['Travel',['Travel']
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Calculating cosine similarity between users based on their extracted features

In [55]:
similarity_matrix = cosine_similarity(features)

In [56]:
user_id = 5
similar_users_indices = similarity_matrix[user_id - 1].argsort()[::-1]  # Sort indices in descending order
similar_users = similar_users_indices[1:6]  # Get top 5 similar users (excluding the user itself)

In [57]:
# Print the similar users
for similar_user_index in similar_users:
    print(f"Similar User ID: {similar_user_index + 1}")

Similar User ID: 2067
Similar User ID: 1189
Similar User ID: 3672
Similar User ID: 8487
Similar User ID: 1835
