In [1]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from heapq import nlargest

# Initialize Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

# Define stopwords for text preprocessing
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    return text

def tfidf_summarize(article_text, num_sentences=3):
    # Tokenize the article into sentences
    sentences = sent_tokenize(article_text)
    
    # Initialize TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    
    # Fit and transform the article text
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    
    # Calculate sentence scores based on TF-IDF
    sentence_scores = {}
    for i in range(len(sentences)):
        score = sum(tfidf_matrix[i].toarray()[0])
        sentence_scores[sentences[i]] = score
    
    # Select top N sentences with highest scores
    summarized_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    
    return summarized_sentences

def predict_category(article_text):
    # Convert article text to lowercase for case-insensitive matching
    article_text_lower = article_text.lower()

    # Define keywords or phrases indicative of different categories
    categories = {
        'Politics': ['election', 'government', 'parliament', 'minister', 'president','MLA'],
        'Sports': ['football', 'soccer', 'basketball', 'tennis', 'olympics', 'cricket','players','game'],
        'Technology': ['tech', 'innovation', 'digital', 'internet', 'software','smartphone'],
        'Entertainment': ['movie', 'music', 'celebrity', 'entertainment', 'film'],
        'Education': ['books','exams','students','teacher','school','college','university'],
        'Business': ['economy', 'finance', 'stock', 'market', 'business','money'],
        'Weather' : ['climate'],
        'Health'  : ['hospitals', 'medication', 'cholestrol','drug','disease']
    }

    # Count occurrences of keywords or phrases from each category
    category_counts = {category: sum(keyword in article_text_lower for keyword in keywords) 
                       for category, keywords in categories.items()}

    # Predict the category with the highest count
    predicted_category = max(category_counts, key=category_counts.get)

    return predicted_category

def process_input(user_input):
    # Sentiment analysis
    preprocessed_input = preprocess_text(user_input)
    sentiment_scores = analyzer.polarity_scores(preprocessed_input)
    if sentiment_scores['compound'] >= 0.05:
        overall_sentiment = "Positive"
    elif sentiment_scores['compound'] <= -0.05:
        overall_sentiment = "Negative"
    else:
        overall_sentiment = "Neutral"
    
    summarized_text = tfidf_summarize(user_input)
    predicted_category = predict_category(user_input)
    
    return summarized_text, overall_sentiment, predicted_category

# Take input from the user
user_input = input("Enter your text: ")

# Process the input
summarized_text, sentiment, category = process_input(user_input)

# Display the results
print("\nSummarized Text:")
for sentence in summarized_text:
    print("-", sentence)
print("\nSentiment:", sentiment)
print("Predicted Category:", category)


Enter your text:  I kill a girl



Summarized Text:
- I kill a girl

Sentiment: Negative
Predicted Category: Politics
