# recommendation system

In [2]:
# Step 1: Import libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## 📊 Creating a Sample Dataset for User-Based Content Recommendation System

In [6]:
data = { "user_id" : [1,2,3,4,5,6,7,8,9,10],
        
         "Name" : ["Eva","Enaya","Yukti","Bhavika","Bhavya","Shivi","Amit","Astha","Ravi","Anubhav"],
        
        "profession": ["Data Scientist", "Chef", "Sports Coach", "Blockchain Developer", "Film Critic",
                       "Business Analyst", "Artist", "Software Engineer", "Fitness Trainer", "Author"],
        
        "Interests":[ "machine learning, AI, data science",
                      "cooking, baking, recipes",
                      "football, cricket, sports",
                      "blockchain, cryptocurrency, web3",
                      "movies, drama, emotions",
                      "Business, Marketing, Trading",
                      "Artist, Panting, Sketching",
                      "python, programming, AI",
                      "fitness, gym, health",
                      "books, writing, literature"],
                   
         "rating":[5, 4, 3, 4, 2, 5, 4, 5, 3, 4]
       }
df_users = pd.DataFrame(data)
df_users

Unnamed: 0,user_id,Name,profession,Interests,rating
0,1,Eva,Data Scientist,"machine learning, AI, data science",5
1,2,Enaya,Chef,"cooking, baking, recipes",4
2,3,Yukti,Sports Coach,"football, cricket, sports",3
3,4,Bhavika,Blockchain Developer,"blockchain, cryptocurrency, web3",4
4,5,Bhavya,Film Critic,"movies, drama, emotions",2
5,6,Shivi,Business Analyst,"Business, Marketing, Trading",5
6,7,Amit,Artist,"Artist, Panting, Sketching",4
7,8,Astha,Software Engineer,"python, programming, AI",5
8,9,Ravi,Fitness Trainer,"fitness, gym, health",3
9,10,Anubhav,Author,"books, writing, literature",4


In [8]:
df_users["combined"] = df_users["profession"] + "," + df_users["Interests"]

# Convert combined text into TF-IDF matrix

In [17]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df_users["combined"])
tfidf

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


# Meaning: 10 users, 42 unique terms across all profiles.

In [18]:
print(tfidf_matrix.shape)

(10, 42)


# To see the feature names (words) and their positions

In [16]:
print(tfidf.get_feature_names_out())

['ai' 'analyst' 'artist' 'author' 'baking' 'blockchain' 'books' 'business'
 'chef' 'coach' 'cooking' 'cricket' 'critic' 'cryptocurrency' 'data'
 'developer' 'drama' 'emotions' 'engineer' 'film' 'fitness' 'football'
 'gym' 'health' 'learning' 'literature' 'machine' 'marketing' 'movies'
 'panting' 'programming' 'python' 'recipes' 'science' 'scientist'
 'sketching' 'software' 'sports' 'trading' 'trainer' 'web3' 'writing']


# To view matrix content as a DataFrame:

In [15]:
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())
tfidf_df.head()

Unnamed: 0,ai,analyst,artist,author,baking,blockchain,books,business,chef,coach,...,recipes,science,scientist,sketching,software,sports,trading,trainer,web3,writing
0,0.287833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.338591,0.338591,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377964,...,0.0,0.0,0.0,0.0,0.0,0.755929,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.755929,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377964,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Function to recommend top N users based on similarity of interests and profession using TF-IDF and cosine similarity:

In [12]:
def recommend(input_text, top_n=3):
    input_vec = tfidf.transform([input_text])
    similarity = cosine_similarity(input_vec, tfidf_matrix).flatten()

    # Multiply similarity with user rating to improve recommendations
    weighted_similarity = similarity * df_users["rating"]

    top_indices = weighted_similarity.argsort()[-top_n:][::-1]
    recommendations = df_users.iloc[top_indices].copy()
    recommendations["similarity_score"] = similarity[top_indices]
    recommendations["weighted_score"] = weighted_similarity[top_indices]
    return recommendations


In [13]:
input_interest = "I'm interested in AI, Python, and Data Science"
recommend(input_interest)


Unnamed: 0,user_id,Name,profession,Interests,rating,combined,similarity_score,weighted_score
0,1,Eva,Data Scientist,"machine learning, AI, data science",5,"Data Scientist,machine learning, AI, data science",0.653284,3.26642
7,8,Astha,Software Engineer,"python, programming, AI",5,"Software Engineer,python, programming, AI",0.410846,2.054229
8,9,Ravi,Fitness Trainer,"fitness, gym, health",3,"Fitness Trainer,fitness, gym, health",0.0,0.0


# # Summary of Key Functions and Concepts

# Import Libraries

numpy and pandas: For handling arrays and tabular data.

TfidfVectorizer: Converts text to numerical vectors based on importance of words.

cosine_similarity: Measures similarity between text vectors.

# Combine Text Columns

Combines profession and Interests into a single string for better text vectorization.

Helps TF-IDF understand context more accurately.

# Create TF-IDF Matrix

TfidfVectorizer(): Initializes the TF-IDF engine.

fit_transform(): Converts the combined text into numerical vectors (importance-based).

# View Feature Names and Matrix

Shows the unique words/features extracted.

Displays the shape of the TF-IDF matrix (rows = users, columns = terms).

# Convert to DataFrame (Optional) 

Converts sparse TF-IDF matrix to a readable DataFrame.

Useful for visualizing how each word contributes to each user profile.

# Recommendation Function

Takes user input and recommends top n similar user profiles.

Transforms user input into TF-IDF vector using the same vocabulary.

Calculates similarity between input and each user profile (0 to 1 scale).

Enhances similarity score by multiplying with rating.

Sorts users by highest weighted similarity.

Returns top recommendations with similarity and weighted score for better interpretability.

# Make a Recommendation

Input your interest as a string.

The system returns the top 3 matching user profiles based on content similarity and rating.   

