In [1]:
from transformers import pipeline

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk import sent_tokenize
from nltk.corpus import stopwords
from transformers import pipeline
from typing import List, Tuple
from sklearn.decomposition import NMF
import numpy as np

# Download the required NLTK data packages one after another (same packages,
# same order as before).
for _nltk_package in ('stopwords', 'punkt_tab'):
    nltk.download(_nltk_package)

class DreamAnalyzer:
    """Analyze a free-text dream description.

    Combines four analyses: named-entity extraction and sentiment
    classification (Hugging Face pipelines), TF-IDF keyword ranking, and
    NMF topic extraction over the TF-IDF matrix.
    """

    def __init__(
        self,
        ner_model: str = "dbmdz/bert-large-cased-finetuned-conll03-english",
        sentiment_model: str = "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
        max_features: int = 5000,
    ) -> None:
        """Create the two pipelines and the shared TF-IDF vectorizer.

        Args:
            ner_model: Model identifier passed to the token-classification pipeline.
            sentiment_model: Model identifier passed to the text-classification pipeline.
            max_features: Upper bound on the TF-IDF vocabulary size.
        """
        # `aggregation_strategy="simple"` is the non-deprecated spelling of the
        # former `grouped_entities=True` flag (merges word pieces into entities).
        self.ner_pipeline = pipeline(
            "token-classification",
            model=ner_model,
            aggregation_strategy="simple",
        )
        # Naming the task explicitly avoids relying on task inference from the model id.
        self.sentiment_pipeline = pipeline("text-classification", model=sentiment_model)
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)

    def extract_named_entity(self, text: str) -> List[str]:
        """Return the distinct entity strings found in ``text``.

        Duplicates are removed while keeping the order of first appearance,
        so the result is deterministic for a given pipeline output.
        """
        entities = self.ner_pipeline(text)
        # dict.fromkeys de-duplicates and, unlike set(), preserves insertion order.
        return list(dict.fromkeys(entity['word'] for entity in entities))

    def analyze_sentiment(self, text: str) -> Tuple[str, float]:
        """Return ``(label, score)`` from the first result of the sentiment pipeline."""
        sentiment = self.sentiment_pipeline(text)[0]
        return sentiment['label'], sentiment['score']

    def extract_keywords(self, text: List[str], num_keywords: int = 10) -> List[str]:
        """Rank vocabulary terms by their summed TF-IDF weight across ``text``.

        Note: this refits ``self.vectorizer`` on ``text``.

        Args:
            text: Documents (here: sentences) to fit the vectorizer on.
            num_keywords: Maximum number of keywords to return.

        Returns:
            Terms ordered from highest to lowest summed weight; ties keep the
            vectorizer's feature order. Empty list when ``text`` is empty.
        """
        if len(text) == 0:
            return []
        tfidf_matrix = self.vectorizer.fit_transform(text)
        feature_names = self.vectorizer.get_feature_names_out()
        # Column sums give one aggregate weight per vocabulary term.
        scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
        # Stable sort on the negated scores == descending order with ties left in place.
        ranked = np.argsort(-scores, kind="stable")[:num_keywords]
        return [str(feature_names[index]) for index in ranked]

    def get_themes(
        self,
        text: List[str],
        num_topics: int = 2,
        num_words: int = 10,
    ) -> List[List[str]]:
        """Extract ``num_topics`` themes from ``text`` with NMF over TF-IDF.

        The vectorizer is always refitted on ``text`` so that the vocabulary
        cannot be a stale one left over from a different document.

        Args:
            text: Documents (here: sentences) to factorize.
            num_topics: Number of NMF components (themes).
            num_words: Maximum number of terms reported per theme.

        Returns:
            One list of terms per theme. Within a theme, terms are ordered by
            ascending component weight, i.e. the strongest term comes last.
            Empty list when ``text`` is empty.

        Raises:
            ValueError: If ``num_topics`` or ``num_words`` is smaller than 1.
        """
        if num_topics < 1 or num_words < 1:
            raise ValueError("num_topics and num_words must both be at least 1")
        if len(text) == 0:
            return []
        tfidf_matrix = self.vectorizer.fit_transform(text)
        # Fixed random_state keeps the random initialization reproducible.
        nmf_model = NMF(n_components=num_topics, init='random', random_state=42)
        nmf_model.fit(tfidf_matrix)
        feature_names = np.asarray(self.vectorizer.get_feature_names_out())
        # argsort is ascending, so the last `num_words` columns are the heaviest terms.
        top_feature_indices = np.argsort(nmf_model.components_, axis=1)[:, -num_words:]
        return [feature_names[indices].tolist() for indices in top_feature_indices]

    def analyze_dream(self, text: str) -> Tuple[List[str], Tuple[str, float], List[str], List[List[str]]]:
        """Run every analysis on ``text``.

        Returns:
            ``(named_entities, (sentiment_label, sentiment_score), keywords, themes)``.
            Keywords and themes are computed over the sentences of ``text``.
        """
        named_entity = self.extract_named_entity(text)
        sentiment = self.analyze_sentiment(text)
        # Each sentence is treated as one document for TF-IDF / NMF.
        document = sent_tokenize(text)
        keywords = self.extract_keywords(document)
        themes = self.get_themes(document)
        return named_entity, sentiment, keywords, themes


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\avane\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\avane\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [55]:
# Build the analyzer once: its constructor creates the NER pipeline, the
# sentiment pipeline and the TF-IDF vectorizer, so reuse this instance.
dream_analyser  = DreamAnalyzer()

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [56]:
# Sample dream text. The irregular spacing and missing sentence punctuation
# are part of the value and are reproduced exactly.
dream = (
    "I dreamt about flying in the sky with Navi  The ocean was calm, "
    "and the sun was shining.felt a sense of freedom while soaring above "
    "the clouds    There were strange creatures in the forest."
    "I was exploring an ancient castle."
)

In [57]:
# Run the full analysis; the displayed tuple is
# (named entities, sentiment, keywords, themes).
dream_analyser.analyze_dream(dream)

  return np.sqrt(res * 2)


(['Navi'],
 ('POSITIVE', 0.999600350856781),
 ['ancient',
  'calm',
  'castle',
  'clouds',
  'creatures',
  'dreamt',
  'exploring',
  'felt',
  'flying',
  'forest'],
 array([['sense', 'clouds', 'exploring', 'strange', 'castle', 'freedom',
         'flying', 'dreamt', 'forest', 'navi'],
        ['sun', 'ocean', 'calm', 'creatures', 'ancient', 'soaring',
         'felt', 'shining', 'sky', 'navi']], dtype=object))