In [7]:
import json 
import spacy
from transformers import pipeline, DistilBertTokenizer

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Load the sentiment analysis pipeline
sentiment_analyzer = pipeline('sentiment-analysis', model="distilbert-base-uncased-finetuned-sst-2-english")

# Initialize the DistilBert tokenizer for text truncation
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Text Preprocessing: Tokenization, Lemmatization, and Stopword Removal
def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Named Entity Recognition (NER)
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Sentiment Analysis with text truncation
def analyze_sentiment(text):
    # Tokenize and truncate the text to the max length (512)
    encoded_input = tokenizer(text, max_length=512, truncation=True, return_tensors='tf')

    # Pass the encoded input to the sentiment analyzer
    result = sentiment_analyzer(encoded_input['input_ids'])
    
    return result[0]['label'], result[0]['score']

# Function to summarize events from the JSON input
def summarize_events_from_json(json_filename):
    try:
        # Load JSON data
        with open(json_filename, 'r') as f:
            post_data = json.load(f)
        
        # Initialize an empty list to store summarized events
        summarized_events = []

        # Loop through each post in the JSON file
        for post in post_data:
            title = post['Title']
            body = post['Body/Content']
            link = post['Link']

            # Combine title and body for processing
            full_text = f"{title}. {body}" if body else title

            # Preprocess the text
            preprocessed_text = preprocess_text(full_text)

            # Perform NER to extract entities
            entities = extract_entities(full_text)

            # Perform sentiment analysis
            sentiment_label, sentiment_score = analyze_sentiment(full_text)

            # Append the summarized event to the list
            summarized_events.append({
                'Title': title,
                'Sentiment': f"{sentiment_label} ({sentiment_score:.2f})",
                'Entities': entities,
                'Link': link
            })

        # Print or return the summarized events
        print("Summarized Events:")
        for event in summarized_events:
            print(f"- {event['Title']}")
            print(f"  Sentiment: {event['Sentiment']}")
            print(f"  Entities: {', '.join([f'{entity[0]} ({entity[1]})' for entity in event['Entities']])}")
            print(f"  Link: {event['Link']}")
            print()

    except Exception as e:
        print(f"Error occurred: {e}")

# Main function to take JSON filename as input and run the summarization
if __name__ == "__main__":
    json_filename = input("Enter the name of the JSON file: ")
    summarize_events_from_json(json_filename)


All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Error occurred: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).


In [9]:
import spacy
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import json
import nltk

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

# Load stopwords
stop_words = set(stopwords.words('english'))

# Function to preprocess text (tokenization, lemmatization, stopword removal)
def preprocess_text(text):
    doc = nlp(text)
    # Tokenize, lemmatize, remove stopwords and non-alphabetic tokens
    tokens = [token.lemma_ for token in doc if token.text.isalpha() and token.text.lower() not in stop_words]
    return ' '.join(tokens)

# Function to extract entities and sentiment
def extract_info(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]  # Extract named entities
    sentiment = TextBlob(text).sentiment.polarity  # Sentiment analysis (-1 to 1)
    return entities, sentiment

# Function to summarize events from posts
def summarize_events(posts):
    summaries = []
    for post in posts:
        title = post.get('Title', '')
        body = post.get('Body/Content', '')
        link = post.get('Link', '')
        
        # Preprocess the text
        full_text = f"{title} {body}"
        preprocessed_text = preprocess_text(full_text)
        
        # Extract entities and sentiment
        entities, sentiment = extract_info(full_text)
        
        # Create a summary dictionary
        summary = {
            'Title': title,
            'Link': link,
            'Sentiment': sentiment,
            'Entities': entities
        }
        summaries.append(summary)
    
    return summaries

# Load JSON data from file
def load_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

# Main function to process the file and display summaries
def main():
    file_path = r'C:\Users\yashswi shukla\Desktop\Narrative\Code\reddit_top_posts_indian school.json'  # Replace with your file path
    posts = load_json(file_path)
    
    # Summarize events
    summaries = summarize_events(posts)
    
    # Display summaries
    for summary in summaries:
        print(f"Title: {summary['Title']}")
        print(f"Link: {summary['Link']}")
        print(f"Sentiment: {'Positive' if summary['Sentiment'] > 0 else 'Negative' if summary['Sentiment'] < 0 else 'Neutral'}")
        print(f"Entities: {summary['Entities']}")
        print("\n")

if __name__ == '__main__':
    main()


[nltk_data] Downloading package punkt to C:\Users\yashswi
[nltk_data]     shukla\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to C:\Users\yashswi
[nltk_data]     shukla\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Title: TIL that a school in Poland is named after an Indian Maharaja, in honour of him providing shelter and free education to hundreds of Polish women and children after they had to escape Poland due to the World War 2.
Link: http://www.thehindu.com/features/magazine/a-maharaja-in-warsaw/article3360283.ece
Sentiment: Positive
Entities: [('TIL', 'ORG'), ('Poland', 'GPE'), ('Indian', 'NORP'), ('hundreds', 'CARDINAL'), ('Polish', 'NORP'), ('Poland', 'GPE'), ('the World War 2', 'EVENT')]


Title: List of the shit we've been through (so far) in 2020
Link: https://www.reddit.com/r/teenagers/comments/iva8jo/list_of_the_shit_weve_been_through_so_far_in_2020/
Sentiment: Positive
Entities: [('2020', 'DATE'), ('Wikipedia', 'ORG'), ('2020', 'DATE'), ('June', 'DATE'), ('RIP Ruth Ginsberg', 'PERSON'), ('January 1st', 'DATE'), ('Annual', 'DATE'), ('Hong Kong', 'GPE'), ('more than one million', 'CARDINAL'), ('Indonesian', 'NORP'), ('Jakarta', 'GPE'), ('at least 66', 'CARDINAL'), ('about 60,000', 'CAR

In [11]:
import spacy
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import json
import nltk
from gensim.summarization import summarize  # For overall summarization

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

# Load stopwords
stop_words = set(stopwords.words('english'))

# Function to preprocess text (tokenization, lemmatization, stopword removal)
def preprocess_text(text):
    doc = nlp(text)
    # Tokenize, lemmatize, remove stopwords and non-alphabetic tokens
    tokens = [token.lemma_ for token in doc if token.text.isalpha() and token.text.lower() not in stop_words]
    return ' '.join(tokens)

# Function to extract entities and sentiment
def extract_info(text):
    if isinstance(text, list):
        text = ' '.join(text)  # Combine list of strings into a single string
    elif not isinstance(text, str):
        raise ValueError("Input must be a string or a list of strings.")

    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]  # Extract named entities
    sentiment = TextBlob(text).sentiment.polarity  # Sentiment analysis (-1 to 1)
    return entities, sentiment

# Function to summarize events from posts
def summarize_events(posts):
    summaries = []
    combined_text = ""  # For overall summarization
    for post in posts:
        title = post.get('Title', '')
        body = post.get('Body/Content', '')
        link = post.get('Link', '')
        
        # Preprocess the text
        full_text = f"{title} {body}"
        preprocessed_text = preprocess_text(full_text)
        
        # Extract entities and sentiment
        entities, sentiment = extract_info(full_text)
        
        # Create a summary dictionary
        summary = {
            'Title': title,
            'Link': link,
            'Sentiment': sentiment,
            'Entities': entities
        }
        summaries.append(summary)
        
        # Combine text for overall summary
        combined_text += full_text + ' '  # Add to combined text
    
    return summaries, combined_text

# Load JSON data from file
def load_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

# Function to generate overall summary
def generate_overall_summary(combined_text):
    try:
        overall_summary = summarize(combined_text, ratio=0.1)  # Adjust ratio for summary length
    except ValueError:
        overall_summary = "Summary could not be generated due to insufficient text length."
    return overall_summary

# Main function to process the file and display summaries
def main():
    file_path = '/mnt/data/reddit_top_posts_indian school.json'  # Replace with your file path
    posts = load_json(file_path)
    
    # Summarize events
    summaries, combined_text = summarize_events(posts)
    
    # Generate overall summary
    overall_summary = generate_overall_summary(combined_text)
    
    # Display individual summaries
    for summary in summaries:
        print(f"Title: {summary['Title']}")
        print(f"Link: {summary['Link']}")
        print(f"Sentiment: {'Positive' if summary['Sentiment'] > 0 else 'Negative' if summary['Sentiment'] < 0 else 'Neutral'}")
        print(f"Entities: {summary['Entities']}")
        print("\n")
    
    # Display overall summary
    print("Overall Summary:")
    print(overall_summary)

if __name__ == '__main__':
    main()


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\yashswi shukla\anaconda3\lib\site-packages\gensim\matutils.py", line 1356, in <module>
    from gensim.corpora._mmreader import MmReader  # noqa: F401
  File "gensim\corpora\_mmreader.pyx", line 11, in init gensim.corpora._mmreader
ImportError: cannot import name utils

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\yashswi shukla\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3369, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\yashswi shukla\AppData\Local\Temp\ipykernel_21740\923213520.py", line 7, in <cell line: 7>
    from gensim.summarization import summarize  # For overall summarization
  File "c:\Users\yashswi shukla\anaconda3\lib\site-packages\gensim\__init__.py", line 11, in <module>
    from gensim import parsing, corpora, matutils, interfaces, models, similarities, utils  # noqa:F401
  File "c:\