In [1]:
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents: a toy corpus of three sentences about pets followed by
# three about Python programming.
documents = [
    "Cats and dogs are popular pets.",
    "Dogs bark and chase cats.",
    "Cats climb trees and sleep a lot.",
    "Python is a popular programming language.",
    "Programming in Python is fun and powerful.",
    "Many developers enjoy coding in Python."
]

# Step 1: Convert the documents to a TF-IDF matrix, dropping English stop words
vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(documents)

# Step 2: Apply NMF (random_state is fixed so the factorization is repeatable)
n_topics = 2  # Number of topics
nmf_model = NMF(n_components=n_topics, random_state=42)
W = nmf_model.fit_transform(tfidf)  # Document-topic matrix
H = nmf_model.components_           # Topic-term matrix

# Step 3: Display top terms per topic
feature_names = vectorizer.get_feature_names_out()
n_top_words = 5

print("Topics discovered by NMF:\n")
for topic_idx, topic in enumerate(H):
    # argsort() is ascending, so reverse it and keep the first n_top_words
    # indices to get the highest-weighted terms, strongest first.
    top_indices = topic.argsort()[::-1][:n_top_words]
    top_features = [feature_names[i] for i in top_indices]
    print(f"Topic #{topic_idx + 1}: {', '.join(top_features)}")

# Optional: Show topic distribution per document (one row per document,
# one column per topic; pandas is imported with the other imports at the top).
doc_topic_df = pd.DataFrame(W, columns=[f"Topic {i+1}" for i in range(n_topics)])
print("\nDocument-topic distribution:")
print(doc_topic_df)


Topics discovered by NMF:

Topic #1: python, programming, language, powerful, fun
Topic #2: cats, dogs, pets, chase, bark

Document-topic distribution:
    Topic 1   Topic 2
0  0.049596  0.717892
1  0.000000  0.718985
2  0.000000  0.411200
3  0.543100  0.070814
4  0.545700  0.000000
5  0.341266  0.000000


In [2]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import pandas as pd

# NOTE: the `surprise` module used in this cell is distributed on PyPI as
# `scikit-surprise`; it must be installed in the kernel's environment.

# Step 1: Create sample user-item rating data (one row per observed rating)
ratings_dict = {
    "userID": ["A", "A", "A", "B", "B", "C", "C", "D", "D", "E"],
    "itemID": ["Item1", "Item2", "Item3", "Item1", "Item2", "Item2", "Item3", "Item1", "Item3", "Item2"],
    "rating": [5, 3, 4, 4, 2, 5, 3, 3, 5, 4]
}
df = pd.DataFrame(ratings_dict)

# Step 2: Load into Surprise (columns must be in user, item, rating order)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)

# Step 3: Train-test split
# A single seed is shared by the split and the model so that the reported
# RMSE and the predicted rating are the same on every run.
RANDOM_STATE = 42
trainset, testset = train_test_split(data, test_size=0.25, random_state=RANDOM_STATE)

# Step 4: Apply SVD (a type of matrix factorization)
# SVD initializes its factors randomly and trains with SGD; without a seed
# the results below would differ between runs.
model = SVD(random_state=RANDOM_STATE)
model.fit(trainset)

# Step 5: Make predictions and evaluate (accuracy.rmse also prints the value)
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)

# Step 6: Predict a specific rating
pred = model.predict(uid="A", iid="Item1")
print(f"\nPredicted rating of user A for Item1: {pred.est:.2f}")


ModuleNotFoundError: No module named 'surprise'