In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv("../datasets/all-data.csv",delimiter=',',encoding='latin-1')
df.head()
x = df.iloc[:,-1].values #news content
y = df.iloc[:,0].values #sentiment

## Data Preprocessing

In [3]:
df = df.rename(columns={'neutral':'Sentiment','According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .':'Message'})
df.head()

Unnamed: 0,Sentiment,Message
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [4]:
df["Sentiment"].unique()
df["Sentiment"].replace({"neutral":0,"negative":-1,"positive":1},inplace=True) #replacing char values
df["Sentiment"].unique()

array([ 0, -1,  1], dtype=int64)

## Text Cleaning

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

def text_cleaner(text):
    stemmer = WordNetLemmatizer()
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join([i for i in nopunc if not i.isdigit()])
    nopunc = [word.lower() for word in nopunc.split()]
    return [stemmer.lemmatize(word) for word in nopunc]

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:

corpus = []
for i in range(0,len(df["Message"])):
    news = text_cleaner(df["Message"][i])
    news = ' '.join(news)
    corpus.append(news)
#print(corpus)

In [7]:
#bag of words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=4800) #converts text to numerical data - sparse matrix
x = cv.fit_transform(corpus).toarray()
y=df.iloc[:,0].values


In [8]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42) 

In [9]:
#training_naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)
accuracy_score(y_test,y_pred)

0.5356037151702786

In [10]:
#training_logistic regression
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0,max_iter=4000)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
accuracy_score(y_test,y_pred)


0.7657378740970072

In [19]:
sentence = text_cleaner("Blinkit's revenue zooms 207% to Rs 724 cr in FY23, loss widens to Rs 1,190 cr")
sentence = ' '.join(sentence)

input = cv.transform([sentence]).toarray()
classifier.predict(input)

array([1], dtype=int64)