In [3]:
import torch
import transformers
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.decomposition import LatentDirichletAllocation, NMF

# Set device for PyTorch (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained sentiment analysis model (e.g., BERT or RoBERTa)
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
model = transformers.AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
model.to(device)
model.eval()

# Example text
text = "I loved the movie. It was fantastic!"

# Tokenize and encode the text for sentiment analysis
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
inputs = {key: val.to(device) for key, val in inputs.items()}

# Perform sentiment analysis
with torch.no_grad():
    outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    predicted_sentiment = "positive" if predicted_class == 1 else "negative"

print("Sentiment:", predicted_sentiment)

# TF-IDF vectorization
corpus = ["I loved the movie", "It was fantastic", "I disliked the film"]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())

# Word2Vec modeling
sentences = [text.split(), "It was a great film".split(), "The acting was superb".split()]
word2vec_model = Word2Vec(sentences, min_count=1)
print("Word2Vec Similarity:")
print(word2vec_model.wv.similarity("movie", "film"))

# Topic modeling using LDA
lda = LatentDirichletAllocation(n_components=2)
lda.fit(tfidf_matrix)
print("LDA Topics:")
for topic_idx, topic in enumerate(lda.components_):
    print("Topic {}: {}".format(topic_idx, ", ".join(tfidf_vectorizer.get_feature_names()[i] for i in topic.argsort()[:-6:-1])))

# Topic modeling using NMF
nmf = NMF(n_components=2)
nmf.fit(tfidf_matrix)
print("NMF Topics:")
for topic_idx, topic in enumerate(nmf.components_):
    print("Topic {}: {}".format(topic_idx, ", ".join(tfidf_vectorizer.get_feature_names()[i] for i in topic.argsort()[:-6:-1])))



ValueError: numpy.ndarray size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Example texts
texts = [
    "I loved the movie. It was fantastic!",
    "The acting was great, but the plot was weak.",
    "The film was a disappointment. I wouldn't recommend it.",
    "The movie had an amazing soundtrack.",
]

# Random text input
random_text = "The movie was captivating and the performances were outstanding."

# Combine texts with random text
combined_texts = texts + [random_text]

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_texts)

# Topic modeling using LDA
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(tfidf_matrix[:-1])  # Exclude the last row (random text)

# Get the topic probabilities for the random text
random_text_tfidf = tfidf_matrix[-1]
random_text_topics = lda.transform(random_text_tfidf.reshape(1, -1))

# Get the top topic for the random text
top_topic_idx = np.argmax(random_text_topics)

# Get the top words for the top topic
top_topic_words = lda.components_[top_topic_idx].argsort()[:-6:-1]
feature_names = tfidf_vectorizer.get_feature_names()
top_words = [feature_names[idx] for idx in top_topic_words]

# Print the results
print("Random Text: ", random_text)
print("Top Topic: Topic", top_topic_idx)
print("Top Words:", ", ".join(top_words))


Random Text:  The movie was captivating and the performances were outstanding.
Top Topic: Topic 0
Top Words: fantastic, loved, it, movie, was




In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Example text
text = "I really enjoyed the movie!"

# Tokenize and encode the text
inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

# Make predictions
outputs = model(**inputs)
predictions = torch.argmax(outputs.logits, dim=1).item()

# Print the predicted label
print("Predicted Label:", predictions)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Predicted Label: 1


In [7]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Example longer text
text = "I watched the movie last night and it was absolutely amazing! The story, the acting, and the cinematography were all top-notch. Highly recommended!"

# Tokenize and encode the text
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
inputs = torch.tensor([token_ids])

# Get BERT embeddings
with torch.no_grad():
    embeddings = model(inputs)[0]

# Average the embeddings to get sentence-level representation
sentence_embedding = torch.mean(embeddings, dim=1)

# Convert embeddings to numpy array
sentence_embedding = sentence_embedding.detach().numpy()

# Convert embeddings back to tokens
token_embeddings = torch.squeeze(embeddings, dim=0)
token_embeddings = token_embeddings.transpose(0, 1).numpy()

# Get the top-n important words based on TF-IDF
n_top_words = 5
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([text])
feature_names = tfidf_vectorizer.get_feature_names()
top_word_indices = tfidf_matrix.toarray()[0].argsort()[-n_top_words:][::-1]
top_words = [feature_names[idx] for idx in top_word_indices]

# Get the salient words using attention scores from BERT
n_salient_words = 5
attention_scores = torch.mean(torch.abs(token_embeddings), dim=2)[0]
salient_word_indices = attention_scores.argsort()[-n_salient_words:][::-1]
salient_words = [tokens[idx] for idx in salient_word_indices]

print("Top Important Words (TF-IDF):", top_words)
print("Salient Words (BERT Attention):", salient_words)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TypeError: abs(): argument 'input' (position 1) must be Tensor, not numpy.ndarray