In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Import additional libraries**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
import nltk

**Adding the dataset**

In [None]:
# Read the financial sentiment CSV into a DataFrame and preview its first ten rows.
DATA_PATH = '/kaggle/input/financial-sentiment-analysis/data.csv'
df = pd.read_csv(DATA_PATH)
df.head(10)

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

**EXAMPLE (Basic NLTK)**

In [None]:
# Use the sentence stored at index label 100 as the running example and show it.
example = df.loc[100, "Sentence"]
print(example)

**Tokenize**

In [None]:
# Split the example sentence into word-level tokens.
# NOTE(review): presumably relies on NLTK tokenizer data already being available
# in the environment — confirm, otherwise an nltk.download(...) step is needed first.
token= nltk.word_tokenize(example)

**Pos Tag**

In [None]:
# Attach a part-of-speech tag to each token of the example sentence.
tag= nltk.pos_tag(token)

**Chunk**

In [None]:
# Run NLTK's named-entity chunker over the POS-tagged tokens and pretty-print the result.
entities= nltk.chunk.ne_chunk(tag)
entities.pprint()

**VADER (Valence Aware Dictionary and Sentiment Reasoner) SENTIMENT SCORING**

In [None]:
#import the library
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Single analyzer instance, reused by every VADER scoring call further down.
sia= SentimentIntensityAnalyzer()

In [None]:
# Quick sanity check of the analyzer on a short hand-written sentence
sia.polarity_scores('I am so boring')

In [None]:
# Score the example sentence taken from the dataset
sia.polarity_scores(example)

In [None]:
# Expose the row index as an explicit 'ID' column; later cells key the score
# tables on it and merge them back onto df.
df['ID'] = df.index.to_numpy()
df

In [None]:
# Apply VADER polarity scoring to every sentence in the DataFrame.
# `result` maps each row's ID to the score dict returned by the analyzer.
# The ID is bound to `sentence_id` rather than `id` so the built-in id() is not
# shadowed for the remainder of the kernel session.
result = {}
for _idx, row in tqdm(df.iterrows(), total=len(df)):
    sentence_id = row['ID']
    result[sentence_id] = sia.polarity_scores(row['Sentence'])


In [None]:
# Preview only the first five entries: `result` holds one entry per row of df,
# so echoing the whole dict would flood the cell output.
dict(list(result.items())[:5])

In [None]:
# Turn the {ID: scores} dict into a DataFrame (one row per ID after the transpose)
# and attach the original columns from the initial DataFrame.
# The join key is pinned to 'ID': without `on=`, merge joins on every column name
# the two frames happen to share, which would silently change the join if df ever
# gained a column named like one of the score columns.
vader = (
    pd.DataFrame(result).T
    .reset_index()
    .rename(columns={'index': 'ID'})
    .merge(df, how='left', on='ID')
)

In [None]:
# Display the VADER scores joined with the original DataFrame columns
vader

**ROBERTA PRETRAINED MODEL**

In [None]:
#import the libraries
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax


**Load the model**

In [None]:
# Checkpoint name shared by the tokenizer and the sequence-classification model.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

**Trying the RoBERTa model on the example**

In [None]:
# Tokenize the example into PyTorch tensors ('pt') and run it through the model.
encoded_example = tokenizer(example, return_tensors='pt')
output = model(**encoded_example)

# Take the first element of the model output, first row of it, as a NumPy array,
# then normalise the values with softmax.
scores = softmax(output[0][0].detach().numpy())

# Label the scores found at positions 0, 1 and 2.
score_labels = ('roberta neg', 'roberta neu', 'roberta pos')
scores_dict = {label: scores[position] for position, label in enumerate(score_labels)}
scores_dict

**Applying the model to the DataFrame**

In [None]:
#make the function
def polarscores_roberta(example, max_length=512):
    """Score a piece of text with the loaded RoBERTa sentiment model.

    Parameters
    ----------
    example : str
        Text to classify.
    max_length : int, default 512
        Upper bound on the number of tokens passed to the model; longer inputs
        are truncated to this length instead of being fed through whole.
        NOTE(review): 512 is the usual limit for RoBERTa-base checkpoints —
        confirm against the loaded model's configuration.

    Returns
    -------
    dict
        Keys 'roberta neg', 'roberta neu' and 'roberta pos', mapped to the
        softmax-normalised scores found at positions 0, 1 and 2 of the model
        output for this text.
    """
    # Truncate explicitly so an over-long text cannot make the model call fail
    # and abort a loop over the whole DataFrame. max_length is passed explicitly
    # rather than relying on the tokenizer's own configured limit.
    encoded = tokenizer(
        example,
        return_tensors='pt',  # 'pt' = PyTorch tensors
        truncation=True,
        max_length=max_length,
    )
    output = model(**encoded)

    # First element of the model output, first row of it, detached from the
    # autograd graph and converted to NumPy before applying softmax.
    scores = softmax(output[0][0].detach().numpy())

    return {
        'roberta neg' : scores[0],
        'roberta neu' : scores[1],
        'roberta pos' : scores[2]
    }

In [None]:
# Score every sentence with both VADER and RoBERTa; `result` maps row ID -> merged scores.
# (If the model call raises RuntimeError for some row, wrap the loop body in
# try/except RuntimeError and report the offending ID.)
result = {}
for _idx, row in tqdm(df.iterrows(), total=len(df)):
    sentence = row['Sentence']
    row_id = row['ID']

    # Prefix each VADER key with "vader_" so it stays distinguishable from the
    # 'roberta ...' keys once both dicts are merged.
    vader_scores = {
        f"vader_{name}": score
        for name, score in sia.polarity_scores(sentence).items()
    }
    roberta_scores = polarscores_roberta(sentence)

    result[row_id] = {**vader_scores, **roberta_scores}

In [None]:
# Turn the {ID: scores} dict into a DataFrame (one row per ID after the transpose)
# and attach the original columns from the initial DataFrame.
# The join key is pinned to 'ID': without `on=`, merge joins on every column name
# the two frames happen to share, which would silently change the join if df ever
# gained a column named like one of the score columns.
resultfinal = (
    pd.DataFrame(result).T
    .reset_index()
    .rename(columns={'index': 'ID'})
    .merge(df, how='left', on='ID')
)

In [None]:
# Preview the first rows of the combined VADER + RoBERTa table
resultfinal.head()

**Comparing the results**

In [None]:
# List the column names of the combined table (used to choose what to plot next)
resultfinal.columns

In [None]:
# Pairwise plot grid of the six VADER / RoBERTa score columns, coloured by the
# 'Sentiment' column of the combined table.
score_columns = [
    'vader_neg', 'vader_neu', 'vader_pos',
    'roberta neg', 'roberta neu', 'roberta pos',
]
sns.pairplot(
    data=resultfinal,
    vars=score_columns,
    hue='Sentiment',
    palette='tab10',
)
plt.show()

**THE TRANSFORMERS PIPELINE**

In [None]:
from transformers import pipeline

# Build a ready-made sentiment-analysis pipeline; only the task name is given.
# NOTE(review): with no model argument the library presumably falls back to its
# default checkpoint for this task — confirm, and consider pinning a model name
# so results stay reproducible.
sent_pipeline= pipeline("sentiment-analysis")

In [None]:
# Run the pipeline on the example sentence taken from the dataset
sent_pipeline (example)