In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the CSV; because `names` is given, no row is consumed as a header
# and the two columns are labelled explicitly.
data = pd.read_csv(
    'financial_news_sentiment.csv',
    names=['Sentiments', 'Text'],
    encoding='latin-1',
)

In [None]:
# Peek at the first rows to confirm both named columns were populated.
data.head()

Unnamed: 0,Sentiments,Text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [None]:
# (n_rows, n_columns) of the raw frame.
data.shape

(4846, 2)

In [None]:
# Keep `data` as the untouched raw frame; later edits are made on this copy.
df = data.copy()

In [None]:
# Full text of the headline at row label 0 (a neutral example).
df.loc[0, 'Text']

'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .'

In [None]:
# Full text of the headline at row label 2 (a negative example).
df.loc[2, 'Text']

'The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .'

In [None]:
# Full text of the headline at row label 3 (a positive example).
df.loc[3, 'Text']

'With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .'

In [None]:
# Class balance as proportions. `normalize` expects a boolean; the string
# 'True' only worked because any non-empty string is truthy.
df['Sentiments'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Sentiments,Unnamed: 1_level_1
neutral,0.594098
positive,0.281263
negative,0.124639


In [None]:
# Label-encode the sentiment strings as integers for the classifier.
# The mapping reads from the untouched raw frame `data`, which keeps this cell
# idempotent: re-running it against the already-encoded `df` column would turn
# every value into NaN.
sentiment_codes = {'neutral': 0, 'positive': 1, 'negative': 2}
df['Sentiments'] = data['Sentiments'].map(sentiment_codes)
# Fail loudly if the file contains a label outside the three expected classes.
assert df['Sentiments'].notna().all(), "unexpected sentiment label in source data"

In [None]:
# Same proportions after encoding, confirming the mapping kept every row.
# `normalize` takes a boolean rather than the string 'True'.
df['Sentiments'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Sentiments,Unnamed: 1_level_1
0,0.594098
1,0.281263
2,0.124639


In [None]:
# Separate the independent variable (headline text) from the dependent
# variable (encoded sentiment).
X, Y = df['Text'], df['Sentiments']


In [None]:
# First few raw headlines, before any text cleaning.
X.head()

Unnamed: 0,Text
0,"According to Gran , the company has no plans t..."
1,Technopolis plans to develop in stages an area...
2,The international electronic industry company ...
3,With the new production plant the company woul...
4,According to the company 's updated strategy f...


In [None]:
# Matching encoded targets: 0 = neutral, 1 = positive, 2 = negative.
Y.head()

Unnamed: 0,Sentiments
0,0
1,0
2,2
3,1
4,1


In [None]:
# preprocessing of text data
import string

In [None]:
# Lower-case every headline; from here on X is a plain Python list of strings.
X = list(map(str.lower, X))

In [None]:
# Show a small sample instead of dumping the whole corpus into the output.
X[:5]

['according to gran , the company has no plans to move all production to russia , although that is where the company is growing .',
 'technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .',
 'the international electronic industry company elcoteq has laid off tens of employees from its tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily postimees reported .',
 'with the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .',
 "according to the company 's updated strategy for the years 2009-2012 , basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",
 "financing of aspoco

In [None]:
# Delete every ASCII punctuation character from each headline.
_punct_table = str.maketrans('', '', string.punctuation)
X = [text.translate(_punct_table) for text in X]

In [None]:
# Show a small sample instead of dumping the whole corpus into the output.
X[:5]

['according to gran  the company has no plans to move all production to russia  although that is where the company is growing ',
 'technopolis plans to develop in stages an area of no less than 100000 square meters in order to host companies working in computer technologies and telecommunications  the statement said ',
 'the international electronic industry company elcoteq has laid off tens of employees from its tallinn facility  contrary to earlier layoffs the company contracted the ranks of its office workers  the daily postimees reported ',
 'with the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability ',
 'according to the company s updated strategy for the years 20092012  basware targets a longterm net sales growth in the range of 20  40  with an operating profit margin of 10  20  of net sales ',
 'financing of aspocomp s growth aspocomp 

In [None]:
# Remove numbers: delete every ASCII digit from each headline.
_digit_table = str.maketrans('', '', '0123456789')
X = [text.translate(_digit_table) for text in X]

In [None]:
# Show a small sample instead of dumping the whole corpus into the output.
X[:5]

['according to gran  the company has no plans to move all production to russia  although that is where the company is growing ',
 'technopolis plans to develop in stages an area of no less than  square meters in order to host companies working in computer technologies and telecommunications  the statement said ',
 'the international electronic industry company elcoteq has laid off tens of employees from its tallinn facility  contrary to earlier layoffs the company contracted the ranks of its office workers  the daily postimees reported ',
 'with the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability ',
 'according to the company s updated strategy for the years   basware targets a longterm net sales growth in the range of     with an operating profit margin of     of net sales ',
 'financing of aspocomp s growth aspocomp is aggressively pursui

In [None]:
# Collapse runs of whitespace to single spaces and trim both ends.
X = [' '.join(tokens) for tokens in map(str.split, X)]

In [None]:
# Show a small sample instead of dumping the whole corpus into the output.
X[:5]

['according to gran the company has no plans to move all production to russia although that is where the company is growing',
 'technopolis plans to develop in stages an area of no less than square meters in order to host companies working in computer technologies and telecommunications the statement said',
 'the international electronic industry company elcoteq has laid off tens of employees from its tallinn facility contrary to earlier layoffs the company contracted the ranks of its office workers the daily postimees reported',
 'with the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability',
 'according to the company s updated strategy for the years basware targets a longterm net sales growth in the range of with an operating profit margin of of net sales',
 'financing of aspocomp s growth aspocomp is aggressively pursuing its growth strateg

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split. Stratifying on Y keeps the class proportions the same in both
# parts, and the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=100,
)

In [None]:
# Show a small sample of the training texts instead of the whole list.
X_train[:5]

['capital base and capital adequacy measurement is based on approaches under basel ii',
 'net sales are expected to remain on the same level as in',
 'finnish industrial group ruukki group plc omx helsinki rugv said on friday november that its furniture business segment incap furniture has concluded personnel negotiations that were started at the end of september',
 'net sales for the period are expected to fall well below that of last year and the result after nonrecurring items is expected to be in the red',
 'finlandbased international machinery rental company ramirent plc omx helsinki rmrv reported on friday august an operating profit of eur m on net sales of eur m for the period januaryjune',
 'a broad range of connectivity options including g hspa and wifi gives consumers high speed access to the internet',
 'the order includes a new crecent former headbox and reel',
 'boomerang boats had net sales of eur mn and it made an operating profit of eur mn in',
 'cs cabot exports of its

In [None]:
# Training targets; the index holds the original row labels chosen by the split.
Y_train

Unnamed: 0,Sentiments
2840,0
2158,0
2457,0
4634,2
2096,0
...,...
3314,0
2909,0
3006,0
3729,0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# TF-IDF features with English stop words dropped and no per-document
# normalisation (norm=None). The variable keeps the name `countvec`, left over
# from an earlier CountVectorizer(stop_words='english') attempt, because the
# following cells refer to it.
countvec = TfidfVectorizer(norm=None, stop_words='english')

In [None]:
# Learn the vocabulary and IDF weights from the training texts only.
# NOTE: fit() returns the vectorizer itself, so `new_X_train1` is just another
# reference to `countvec`, not a transformed matrix.
new_X_train1 = countvec.fit(X_train)


In [None]:
# Encode the training texts with the fitted vocabulary.
X_train_vectorize = countvec.transform(X_train)


In [None]:
# Encode the test texts with the vocabulary learned from the training split;
# the vectorizer is not re-fitted here.
X_test_vectorized=countvec.transform(X_test)


In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB


In [None]:
# Multinomial naive Bayes classifier with default settings.
mnb = MultinomialNB()
# NOTE(review): `tfidfmnb` is not used by any cell visible in this notebook.
tfidfmnb = MultinomialNB()

In [None]:
# Train the classifier on the vectorized training split.
mnb.fit(X_train_vectorize,Y_train)


In [None]:
# Predicted encoded sentiment (0/1/2) for every test headline.
prediction = mnb.predict(X_test_vectorized)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report

In [None]:
# Overall accuracy on the held-out 20%. For scale: always predicting the
# majority class (neutral, about 59% of the data) would already score ~0.59.
accuracy_score(Y_test,prediction)

0.654639175257732

In [None]:
# Rows are true labels and columns are predicted labels, both ordered
# 0 (neutral), 1 (positive), 2 (negative).
confusion_matrix(Y_test,prediction)

array([[418,  92,  66],
       [ 87, 130,  56],
       [ 18,  16,  87]])

In [None]:
# Per-class precision, recall and F1. The report is a multi-line string, so it
# is printed rather than shown as a raw repr.
print(classification_report(Y_test,prediction))

              precision    recall  f1-score   support

           0       0.80      0.73      0.76       576
           1       0.55      0.48      0.51       273
           2       0.42      0.72      0.53       121

    accuracy                           0.65       970
   macro avg       0.59      0.64      0.60       970
weighted avg       0.68      0.65      0.66       970



In [None]:
# Let's write a function that reports the sentiment category of a new sentence

In [None]:
def _clean_message(message):
  """Clean one message the same way the training texts were cleaned.

  Lower-cases the text, deletes ASCII punctuation and digits, then collapses
  whitespace runs to single spaces.
  """
  removed = str.maketrans('', '', string.punctuation + string.digits)
  return ' '.join(message.lower().translate(removed).split())


def enter_text(message):
  """Print the sentiment category the trained model predicts for `message`.

  The message goes through the same cleaning as the training data before it
  is vectorized, so a token such as "long-term" reaches the model in the form
  it was trained on ("longterm"). The vectorizer and model are run once.

  Parameters
  ----------
  message : str
      Free-text sentence to classify.

  Returns
  -------
  None
      The result is printed, not returned.
  """
  features = countvec.transform([_clean_message(message)])
  label = int(mnb.predict(features)[0])
  # 0 -> neutral, 1 -> positive; any other label is reported as negative.
  category = {0: 'neutral', 1: 'positive'}.get(label, 'negative')
  print(f"This message is {category}")

In [None]:
# Casual, non-financial sentence.
enter_text(" Hello friends , whats the latest update in today match")

This message is neutral


In [None]:
# Wording close to the negative layoffs headline shown earlier at row 2.
enter_text(" The google has laid off hundreds of employees from its facility")

This message is negative


In [None]:
# Sports sentence, outside the financial-news domain of the training data.
enter_text("In T20 summit indian shows leadership skill")

This message is positive


In [None]:
# Very short sentence with no financial content.
enter_text("Its that day light")

This message is neutral


In [None]:
# Possible reasons for the modest accuracy
#1 - stop words can carry meaning, so removing them may not be appropriate
#2 - normalization - TfidfVectorizer applies L2 normalization by default, which scales each feature value down to between 0 and 1
#3 - setting norm=None (no normalization) sometimes improves performance