In [None]:
import pandas as pd

In [5]:
df = pd.read_json("hf://datasets/Daniel-ML/sentiment-analysis-for-financial-news-v2/pd_df_json.json", lines=True)
df.head()

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [6]:
# keeping only important columns
df = df.dropna()
print(df.head)
print(df.info())

<bound method NDFrame.head of      sentiment                                               text
0      neutral  According to Gran , the company has no plans t...
1      neutral  Technopolis plans to develop in stages an area...
2     negative  The international electronic industry company ...
3     positive  With the new production plant the company woul...
4     positive  According to the company 's updated strategy f...
...        ...                                                ...
4841  negative  LONDON MarketWatch -- Share prices ended lower...
4842   neutral  Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843  negative  Operating profit fell to EUR 35.4 mn from EUR ...
4844  negative  Net sales of the Paper segment decreased to EU...
4845  negative  Sales in Finland decreased by 10.5 % in Januar...

[4846 rows x 2 columns]>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  --

In [8]:
# counting words
import spacy
nlp = spacy.load("en_core_web_sm")

def count_words(text):
  doc = nlp(text)
  words = [token for token in doc if token.is_alpha]
  return len(words)

df["words_count"] = df["text"].apply(count_words)
print(df.head())

  sentiment                                               text  words_count
0   neutral  According to Gran , the company has no plans t...           22
1   neutral  Technopolis plans to develop in stages an area...           28
2  negative  The international electronic industry company ...           33
3  positive  With the new production plant the company woul...           32
4  positive  According to the company 's updated strategy f...           30


In [10]:
# Vader scores
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')



vader = SentimentIntensityAnalyzer()

def vader_analysis(text):
  return vader.polarity_scores(text)

df_vader = df['text'].apply(vader_analysis).apply(pd.Series)

df = pd.concat([df, df_vader], axis=1)

print(df.head())

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/abaziz/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


  sentiment                                               text  words_count  \
0   neutral  According to Gran , the company has no plans t...           22   
1   neutral  Technopolis plans to develop in stages an area...           28   
2  negative  The international electronic industry company ...           33   
3  positive  With the new production plant the company woul...           32   
4  positive  According to the company 's updated strategy f...           30   

     neg    neu    pos  compound  
0  0.092  0.837  0.071   -0.1280  
1  0.073  0.927  0.000   -0.2960  
2  0.000  1.000  0.000    0.0000  
3  0.038  0.660  0.302    0.8555  
4  0.000  0.853  0.147    0.6705  


In [12]:
# encoding labels
label_map = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}
df['sentiment_encoded'] = df['sentiment'].map(label_map)
df.head()

Unnamed: 0,sentiment,text,words_count,neg,neu,pos,compound,sentiment_encoded
0,neutral,"According to Gran , the company has no plans t...",22,0.092,0.837,0.071,-0.128,1
1,neutral,Technopolis plans to develop in stages an area...,28,0.073,0.927,0.0,-0.296,1
2,negative,The international electronic industry company ...,33,0.0,1.0,0.0,0.0,0
3,positive,With the new production plant the company woul...,32,0.038,0.66,0.302,0.8555,2
4,positive,According to the company 's updated strategy f...,30,0.0,0.853,0.147,0.6705,2


In [13]:
# undersampling
min_class = df["sentiment_encoded"].value_counts().min()
df_balanced = df.groupby("sentiment_encoded").sample(n=min_class, random_state=42)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [15]:
df_balanced.to_csv('../data/news_postprocess.csv', index=False, sep=";")