In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk


# Load dataset
file_path = r"C:\Users\Vaishnavi B\Downloads\news_sentiment_analysis.csv"
df = pd.read_csv(file_path)


In [2]:
df.head()

Unnamed: 0,Source,Author,Title,Description,URL,Published At,Sentiment,Type
0,stgnews,Bridger Palmer,Pine View High teacher wins Best in State awar...,"ST. GEORGE — Kaitlyn Larson, a first-year teac...",https://www.stgeorgeutah.com/news/archive/2024...,2024-07-12T23:45:25+00:00,positive,Business
1,Zimbabwe Mail,Staff Reporter,Businesses Face Financial Strain Amid Liquidit...,"Harare, Zimbabwe – Local businesses are grappl...",https://www.thezimbabwemail.com/business/busin...,2024-07-12T22:59:42+00:00,neutral,Business
2,4-traders,,Musk donates to super pac working to elect Tru...,(marketscreener.com) Billionaire Elon Musk has...,https://www.marketscreener.com/business-leader...,2024-07-12T22:52:55+00:00,positive,Business
3,4-traders,,US FTC issues warning to franchisors over unfa...,(marketscreener.com) A U.S. trade regulator on...,https://www.marketscreener.com/quote/stock/MCD...,2024-07-12T22:41:01+00:00,negative,Business
4,PLANET,,Rooftop solar's dark side,4.5 million households in the U.S. have solar ...,https://www.npr.org/2024/07/12/1197961036/roof...,2024-07-12T22:28:19+00:00,positive,Business


In [3]:
df.shape

(3500, 8)

In [4]:
df.columns

Index(['Source', 'Author', 'Title', 'Description', 'URL', 'Published At',
       'Sentiment', 'Type'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Source        3500 non-null   object
 1   Author        2512 non-null   object
 2   Title         3500 non-null   object
 3   Description   3500 non-null   object
 4   URL           3500 non-null   object
 5   Published At  3500 non-null   object
 6   Sentiment     3500 non-null   object
 7   Type          3500 non-null   object
dtypes: object(8)
memory usage: 218.9+ KB


In [6]:
# Combine Title and Description for better context
df['Text'] = df['Title'].astype(str) + ' ' + df['Description'].astype(str)


In [7]:
# Convert Sentiment labels to numerical values
df['Sentiment'] = df['Sentiment'].map({'positive': 1, 'neutral': 0, 'negative': -1})


In [8]:
# Keep only required columns
dfnew = df[['Text', 'Sentiment']]


In [9]:
dfnew

Unnamed: 0,Text,Sentiment
0,Pine View High teacher wins Best in State awar...,1
1,Businesses Face Financial Strain Amid Liquidit...,0
2,Musk donates to super pac working to elect Tru...,1
3,US FTC issues warning to franchisors over unfa...,-1
4,Rooftop solar's dark side 4.5 million househol...,1
...,...,...
3495,"Arrow Electronics, Inc. (NYSE:ARW) Shares Purc...",1
3496,"3,120 Shares in NICE Ltd. (NASDAQ:NICE) Bought...",1
3497,"QRG Capital Management Inc. Has $857,000 Stock...",1
3498,Biotechnology Market: Surging Investments and ...,0


In [10]:
dfnew.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       3500 non-null   object
 1   Sentiment  3500 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 54.8+ KB


In [11]:
# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\Vaishnavi
[nltk_data]     B\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Vaishnavi
[nltk_data]     B\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# 1. Text Cleaning (basic preprocessing)
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatize and remove stopwords
    return ' '.join(words)

dfnew['Cleaned_Text'] = dfnew['Text'].apply(clean_text)
dfnew = dfnew.drop('Text', axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfnew['Cleaned_Text'] = dfnew['Text'].apply(clean_text)


In [13]:
dfnew.head()

Unnamed: 0,Sentiment,Cleaned_Text
0,1,pine view high teacher win best state award bu...
1,0,business face financial strain amid liquidity ...
2,1,musk donates super pac working elect trump blo...
3,-1,u ftc issue warning franchisors unfair busines...
4,1,rooftop solars dark side million household u s...


In [14]:
# 2. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dfnew['Cleaned_Text'], dfnew['Sentiment'],
                                                    test_size=0.2, random_state=42)


In [15]:
# 3. Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Added bigrams (1,2)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [16]:
# 4. Train a Multinomial Naive Bayes model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

In [17]:
# 5. Predict and evaluate the model
y_pred = model.predict(X_test_tfidf)

In [18]:
# 6. Print performance metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7471428571428571
              precision    recall  f1-score   support

          -1       0.91      0.40      0.56       131
           0       0.76      0.49      0.60       147
           1       0.73      0.94      0.82       422

    accuracy                           0.75       700
   macro avg       0.80      0.61      0.66       700
weighted avg       0.77      0.75      0.73       700



In [45]:
# Sample News Article
news_article = "Tesla unveils new EV model."

# Preprocess the news article (cleaning the text)
news_article_cleaned = clean_text(news_article)

# Transform the cleaned news article using the same vectorizer (cv) used for training
input_data = [news_article_cleaned]
input_data = vectorizer.transform(input_data).toarray()

# Predict sentiment (assuming classifier is the trained model)
input_pred = model.predict(input_data)

# Check the prediction and print sentiment
if input_pred[0] == 1:
    print("News article has a positive sentiment.")
elif input_pred[0] == -1:
    print("News article has a negative sentiment.")
else:
    print("News article has a neutral sentiment.")


News article has a positive sentiment.
