In [None]:
!pip install nltk scikit-learn




In [None]:
import pandas as pd
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer

# Fetch the NLTK data packages this notebook relies on (stopword list,
# tokenizer models, WordNet). Order matches the original download sequence.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

# Read the product descriptions into a DataFrame.
# NOTE(review): the path is a Colab upload location — confirm it exists in
# the environment where this runs.
df = pd.read_csv("/content/NikeProductDescriptions (1).csv")

# Text preprocessing function
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache: the stopword corpus and the lemmatizer are created on
# first use and then shared by every subsequent preprocess() call.
_NLP_CACHE = {}


def _nlp_resources():
    """Return the shared (stop_words, lemmatizer) pair, creating it on first use."""
    if not _NLP_CACHE:
        _NLP_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _NLP_CACHE['stop_words'], _NLP_CACHE['lemmatizer']


def preprocess(text):
    """Normalize one description for TF-IDF and Jaccard comparison.

    Steps: lowercase, delete punctuation, tokenize, drop English stopwords,
    lemmatize the remaining tokens.

    Args:
        text: The raw description. Anything that is not a ``str`` (for
            example the NaN pandas uses for a missing cell, or ``None``)
            is treated as an empty description instead of raising.

    Returns:
        A ``(clean_text, token_set)`` tuple: the processed tokens joined by
        single spaces, and the set of distinct processed tokens.
    """
    if not isinstance(text, str):
        return '', set()

    stop_words, lemmatizer = _nlp_resources()

    # NOTE(review): punctuation is deleted (not replaced by a space) before
    # stopword filtering, so "all-day" becomes "allday" and contractions lose
    # their apostrophe before being compared with the stopword list — confirm
    # this is the intended normalization.
    cleaned = text.lower().translate(_PUNCT_TABLE)

    # Stopwords are filtered on the raw token, then survivors are lemmatized,
    # preserving the original order of operations.
    tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(cleaned)
        if token not in stop_words
    ]
    return ' '.join(tokens), set(tokens)

# Apply preprocessing: preprocess() yields (clean_text, token_set) per row;
# zip(*) transposes those pairs into the two new columns.
df['clean_text'], df['token_set'] = zip(*df['Product Description'].map(preprocess))

# ----- TF-IDF + Cosine Similarity -----
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['clean_text'])

# Pairwise similarity between every pair of descriptions.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Example: show top 5 most similar descriptions (excluding itself) for product 0
query_idx = 0
query_scores = cosine_sim_matrix[query_idx]
# Sort descending with a stable sort on the negated scores so ties are broken
# by ascending product index, giving a deterministic ranking.
ranked = (-query_scores).argsort(kind='stable')
# Exclude the query by index rather than by dropping the first ranked entry:
# a duplicate description (tied score) or an empty query vector (score 0 with
# itself) means the query is not guaranteed to be ranked first.
top_cosine_sim = ranked[ranked != query_idx][:5]
print(f"Top 5 similar (TF-IDF + Cosine) to product {query_idx}:")
for i in top_cosine_sim:
    print(f"- Product {i} | Similarity: {query_scores[i]:.3f}")

# ----- Jaccard Similarity -----
# Use sets of tokens and compute Jaccard index
def jaccard_sim(set1, set2):
    """Return the Jaccard index |set1 ∩ set2| / |set1 ∪ set2| of two sets.

    Args:
        set1: First set of tokens.
        set2: Second set of tokens.

    Returns:
        A float in [0, 1]. When both sets are empty the index is undefined;
        0.0 is returned in that case instead of raising ZeroDivisionError.
    """
    union_size = len(set1 | set2)
    if union_size == 0:
        return 0.0
    return len(set1 & set2) / union_size

# Example: compute Jaccard similarity between product 0 and all others
query_idx = 0
# Look the query's token set up once instead of on every iteration.
query_tokens = df['token_set'].iloc[query_idx]
# One score per row, in row order (the query's own score is included so the
# list stays aligned with the DataFrame's row positions).
jaccard_scores = [jaccard_sim(query_tokens, tokens) for tokens in df['token_set']]

# sorted() is stable, so equal scores keep ascending product order.
ranked_jaccard = sorted(enumerate(jaccard_scores), key=lambda pair: pair[1], reverse=True)
# Exclude the query by index rather than assuming it is ranked first.
top_jaccard_sim = [(i, score) for i, score in ranked_jaccard if i != query_idx][:5]
print(f"\nTop 5 similar (Jaccard) to product {query_idx}:")
for i, score in top_jaccard_sim:
    print(f"- Product {i} | Jaccard: {score:.3f}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Top 5 similar (TF-IDF + Cosine) to product 0:
- Product 14 | Similarity: 0.285
- Product 159 | Similarity: 0.230
- Product 343 | Similarity: 0.159
- Product 6 | Similarity: 0.155
- Product 163 | Similarity: 0.149

Top 5 similar (Jaccard) to product 0:
- Product 14 | Jaccard: 0.220
- Product 159 | Jaccard: 0.164
- Product 335 | Jaccard: 0.125
- Product 37 | Jaccard: 0.121
- Product 7 | Jaccard: 0.119
