# Sentiment Analysis Code

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the hand-labelled articles.
# NOTE(review): the 'unicode_escape' encoding is kept as found; confirm it
# matches how the file was written.
articles = pd.read_csv("labeled_data_final.txt", encoding = 'unicode_escape')

# Numeric codes for the sentiment labels. Series.map turns every label that is
# not a key of this dictionary into NaN, and those rows are dropped below.
recode = {
    "positive" : 0, 
    "negative" : 1
}

articles.sentiment = articles.sentiment.map(recode)
# Take an explicit copy: a column is added to labeled_subset further down, and
# writing into the result of dropna() without a copy can trigger pandas'
# SettingWithCopyWarning.
labeled_subset = articles.dropna(subset = ['sentiment']).copy()

# Company table read from disk.
company = pd.read_csv("Company-2022-05-08.txt")

import nltk
from nltk.corpus import stopwords
from string import punctuation
import re
# English stop words from NLTK, plus the possessive suffix "'s".
stops = {*stopwords.words('english'), "'s"}

In [None]:
def clean(sentence, stop_words=None):
    """
    Cleans one paragraph of article text before processing text.
    Intended for use with the apply function on the column of text.

    Steps, in order:
        1. Whitespace-separated tokens that appear in the stop-word set are
           removed (exact, case-sensitive match on the whole token).
        2. Non-ASCII characters are stripped.
        3. Runs of spaces are collapsed to a single space.

    Parameters:
        sentence:   the text to clean (a string).
        stop_words: optional collection of tokens to remove. When omitted,
                    the module-level `stops` set is used.

    Returns the cleaned string.
    """
    if stop_words is None:
        stop_words = stops

    # delete stopwords (split/join also turns every whitespace run into one space)
    temp = " ".join(word for word in sentence.split() if word not in stop_words)
    # remove non-english characters
    temp = temp.encode("ascii", "ignore").decode()
    # A token made only of non-ASCII characters disappears in the step above and
    # leaves its surrounding spaces behind. Collapse runs of any length:
    # replacing exactly two spaces would still turn three spaces into two.
    return re.sub(r" {2,}", " ", temp)


# Store the cleaned version of every article's text next to the raw text.
labeled_subset["clean_text"] = labeled_subset["text"].map(clean)


In [None]:
def seperate_companies(articles_data):
    """
    Creates a row for individual companies within the articles dataset.
    Every row whose "companies" value contains a comma (e.g. "31,287") is
    replaced by one copy of that row per listed company, each copy holding a
    single company id (as a string, surrounding whitespace removed).

    Expected input is a dataframe of the articles with a column titled
    "companies". The input dataframe is not modified.

    Returns a new dataframe with a fresh 0..n-1 index: rows that listed a
    single company come first, in their original order and with their
    "companies" value untouched, followed by the expanded rows.
    """
    # reset the index of articles data in case rows were manipulated
    articles_data = articles_data.reset_index(drop = True)

    def split_ids(value):
        # one id per comma-separated entry
        return [part.strip() for part in str(value).split(",")]

    has_multiple = articles_data["companies"].map(lambda value: "," in str(value))
    if not has_multiple.any():
        # nothing to expand
        return articles_data

    single_rows = articles_data[~has_multiple]

    # Build all expanded rows in one pass instead of appending to and dropping
    # from the dataframe while looping over it: doing that shifts row positions
    # and index labels, so later multi-company rows get mixed up.
    multi_rows = articles_data[has_multiple].copy()
    multi_rows["companies"] = multi_rows["companies"].map(split_ids)
    multi_rows = multi_rows.explode("companies")

    frames = [frame for frame in (single_rows, multi_rows) if not frame.empty]
    return pd.concat(frames, ignore_index = True)

# Expand rows that list several company ids so that each row refers to one company.
companies_seperated = seperate_companies(labeled_subset)

In [None]:
def isolate_to_company(articles_data, company_data):
  
    """
    Expected inputs and expected columns:
        articles_data, a dataframe of articles:
            - "id"
            - "companies" (one company id per row, anything int() accepts)
            - "clean_text"
            
        company_data, a dataframe of companies and their codes:
            - "id"
            - "short_name"
    
    Outputs a dataframe with the same columns as the articles_data input, but with an additional "relevant_sentences" column.
    This column includes only sentences that directly reference the company for the use of targeted sentiment analysis. 
    A sentence here is a run of text without a "." that contains the company's short name and is followed by a ".";
    all such sentences of a row are concatenated into one string ('' when there are none).
    The input dataframe itself is not modified.

    Raises IndexError when a row's company id is not present in company_data["id"].

    NOTE(review): the short name is searched for verbatim. If "clean_text" was produced by clean(), a short name
    that itself contains a stop word would presumably never match - confirm against the company table.
    """
    final_sentences = []
    # reset the index of articles data in case rows were manipulated
    articles_data = articles_data.reset_index(drop = True)
    
    for company_id, text in zip(articles_data["companies"], articles_data["clean_text"]):
        # pull the company short name from the company_data using articles_data
        index = company_data[company_data['id'] == int(company_id)].index[0]
        company_name = company_data.short_name[index]
        
        # isolating the text to sentences where companies appear;
        # the name is escaped so characters such as "(", "+" or "." in it are matched literally
        look_for_string = r"[^.]*?" + re.escape(company_name) + r"[^.]*\."
        relevant_sentences = re.findall(look_for_string, text)
        final_sentences.append("".join(relevant_sentences))
        
    articles_data["relevant_sentences"] = final_sentences
        
    return articles_data

# Add the "relevant_sentences" column: the sentences of each row's cleaned text that mention its company.
df = isolate_to_company(companies_seperated, company)

In [None]:
def relevant_sentences_to_list(relevant_sentences):
    """
    Helper for the apply call inside split_relevant_sentences().
    Splits one string of sentences on "." and returns the pieces as a list
    of strings, leaving off the final piece.
    """
    pieces = relevant_sentences.split(".")
    # whatever follows the last "." is discarded; it is '' when the text ends with "."
    return pieces[:-1]

In [None]:
def split_relevant_sentences(articles_data, company_data):
    """
    Expected inputs and expected columns:
        articles_data, a dataframe of articles:
            - "companies" (one company id per row, anything int() accepts)
            - "relevant_sentences"
            
        company_data, a dataframe of companies and their codes:
            - "id"
            - "short_name"

    Outputs a dataframe with the same columns as the articles_data input, 
    but with an additional "relevant_sentences_list" and "split_sentences" columns.
    - "relevant_sentences_list" is a list of strings, where each element is a sentence from column "relevant sentences".
    - "split_sentences" is list of dictionaries. All keys are left, center, right, where 'left' and 'right' are 
    portions of sentences to the left and right of the targeted company, and 'center' is the company name.
      Only the first occurrence of the company name in a sentence is used as the split point. If the name does
      not occur in a sentence, 'left' is the whole sentence and 'right' is ''.

    The input dataframe itself is not modified.
    Raises IndexError when a row's company id is not present in company_data["id"].
    """
    # reset the index of articles data in case rows were manipulated
    articles_data = articles_data.reset_index(drop = True)

    # build the sentence lists from the dataframe that was passed in (not from a module-level dataframe)
    articles_data["relevant_sentences_list"] = articles_data.relevant_sentences.apply(relevant_sentences_to_list)
    
    final_dictionaries = []

    for company_id, sentences in zip(articles_data["companies"], articles_data["relevant_sentences_list"]):
        # pull the company short name from the company_data using articles_data
        index = company_data[company_data['id'] == int(company_id)].index[0]
        company_name = company_data.short_name[index]
        
        dict_list = [] #list for each article/company pair
        for sentence in sentences:
            # partition splits at the first occurrence only and never fails when the name is missing
            left, _, right = sentence.partition(company_name)
            dict_list.append({'left'   : left,
                              'center' : company_name,
                              'right'  : right})
        
        final_dictionaries.append(dict_list)
     
    
    articles_data["split_sentences"] = final_dictionaries
        
    return articles_data

In [None]:
#pip install NewsSentiment
from NewsSentiment import TargetSentimentClassifier

def get_sentiments(articles_data):
    """
    Expected inputs and expected columns:
        articles_data, a dataframe of articles:
            - "split_sentences" (per row, a list of dictionaries with keys 'left', 'center', 'right')

    Outputs a dataframe with the same columns as the articles_data input, but with an additional 
    "all_sentiments" and "targeted_sentiment" columns.
    - "all_sentiments" is a dictionary, with keys positive, neutral, negative. Values are averaged sentiment 
    scores from all relevant sentences in the article. Uses model from TargetSentimentClassifer from NewsSentiment library.
    - "targeted_sentiment" is string either 'positive', 'neutral', 'negative' 
    determined by max value from column "all sentiments".
    Rows without any sentence get None for every score and None as targeted sentiment.

    The two columns are added to the dataframe that was passed in, and that same dataframe is returned.
    """
    tsc = TargetSentimentClassifier()
    
    final_dictionaries = []
    highest_sentiments = []
    # Walk the column's values directly instead of looking rows up by the labels 0..n-1,
    # so a dataframe whose index is not a default range (e.g. after filtering) is handled too.
    for sentences in articles_data["split_sentences"]:
        n_sentences = len(sentences)
        
        if n_sentences != 0:
            d = {'positive' : 0,
                 'neutral'  : 0,
                 'negative' : 0}

            for parts in sentences:
                #sentiment is list of dictionaries, where sentiment[0] has the highest probability, whether that be pos, neu, neg
                sentiment = tsc.infer_from_text(parts['left'],
                                                parts['center'], 
                                                parts['right'])
                # descending sort on the label text puts 'positive', 'neutral', 'negative' in that order
                ordered_sentiment = sorted(sentiment, key=lambda s: s['class_label'], reverse = True)
                d['positive'] += ordered_sentiment[0]['class_prob']
                d['neutral']  += ordered_sentiment[1]['class_prob']
                d['negative'] += ordered_sentiment[2]['class_prob']

            # average scores
            d['positive'] = d['positive'] / n_sentences
            d['neutral']  = d['neutral']  / n_sentences 
            d['negative'] = d['negative'] / n_sentences 
            final_dictionaries.append(d)
            
            #store highest average sentiment
            highest_sentiments.append(max(d, key = d.get))
        else:
            final_dictionaries.append(dict(positive = None, neutral = None, negative = None))
            highest_sentiments.append(None)
            
        
    articles_data['all_sentiments'] = final_dictionaries
    articles_data['targeted_sentiment'] = highest_sentiments
    
    return articles_data