In [1]:
import os
import re
import csv
import nltk
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk import sent_tokenize, word_tokenize

# Sentiment Analysis

Sentiment analysis is the process of determining whether a piece of writing is positive, negative, or neutral. The algorithm below is designed for use on financial texts and consists of the following steps:

### Cleaning the Text

In [2]:
def clean_text(text):
    """Normalise raw text so that it contains no unnecessary characters.

    Steps, in order:
      1. remove the "the media could not be loaded" placeholder sentence,
      2. replace every character that is not an ASCII letter or a full stop
         with a space,
      3. lowercase the result,
      4. collapse runs of two or more whitespace characters into one space.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned, lowercase string. Leading/trailing single spaces are
        kept, not stripped.

    Raises:
        TypeError: If ``text`` is not a string (raised by ``re.sub``).
    """
    # Matched case-insensitively because lowercasing only happens further
    # down; the trailing full stop is optional and escaped so that it cannot
    # swallow an arbitrary following character.
    text = re.sub(r'the media could not be loaded\.?', '', text, flags=re.IGNORECASE)

    # Keep only ASCII letters and full stops. This also removes every
    # non-ASCII character, so no separate Unicode pass is needed.
    text = re.sub(r'[^a-zA-Z.]', ' ', text)

    text = text.lower()

    # Collapse runs of whitespace left behind by the substitutions above.
    text = re.sub(r'\s\s+', ' ', text)

    return text

### Cleaning using Stop Words List

Stop-word lists are used to clean the text: words that appear in any of the lists are excluded before sentiment analysis is performed.

In [3]:
# Load every stop-word list found in the StopWords directory into one
# lowercase set.
#
# A plain set is used instead of a pandas Series because `word in series`
# tests the Series *index labels* (0, 1, 2, ...), not its values, so a
# membership test against a Series of words never matches a word.

STOPWORDS_DIR = 'StopWords'

stop_word_files = os.listdir(STOPWORDS_DIR)

STOPWORDS = set()
for file in stop_word_files:
    path = os.path.join(STOPWORDS_DIR, file)

    # The handle is deliberately not named `stopwords`: word_counts() reads a
    # module-level `stopwords` (the NLTK corpus reader), and re-running this
    # cell would otherwise rebind that name to a closed file object.
    with open(path) as stopword_file:
        STOPWORDS.update(token.lower() for token in stopword_file.read().split())

In [4]:
def remove_stopwords(clean_text):
    """Tokenise cleaned text and filter it against ``STOPWORDS``.

    The text is split on whitespace; a token is kept when
    ``token in STOPWORDS`` is false.

    Args:
        clean_text: Already-cleaned text as a ``str``.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    return [token for token in clean_text.split() if token not in STOPWORDS]

### Creating a dictionary of Positive and Negative words
The Master Dictionary is used to build a dictionary of positive and negative words. A word is added to the dictionary only if it does not appear in the stop-word lists.

In [5]:
def _load_sentiment_words(filename):
    """Read one word list from the MasterDictionary directory.

    Args:
        filename: Name of a file inside the ``MasterDictionary`` directory.

    Returns:
        The whitespace-separated words of the file for which
        ``word not in STOPWORDS`` holds, in file order.
    """
    # os.path.join keeps the path portable; a hard-coded backslash separator
    # only resolves on Windows.
    path = os.path.join('MasterDictionary', filename)
    with open(path) as word_file:
        return [word for word in word_file.read().split() if word not in STOPWORDS]


# Sentiment lexicon used by positive_score() and negative_score().
MasterDictionary = {
    'Positive': _load_sentiment_words('positive-words.txt'),
    'Negative': _load_sentiment_words('negative-words.txt'),
}

### Extracting Derived variables
We convert the text into a list of tokens using the nltk tokenize module and use these tokens to calculate the 4 variables described below:

In [6]:
def positive_score(words):
    """Positive score: how many entries of ``words`` occur in the positive lexicon.

    Args:
        words: Iterable of word tokens.

    Returns:
        The number of tokens found in ``MasterDictionary['Positive']``
        (repeated tokens are counted each time they occur).
    """
    positive_lexicon = MasterDictionary['Positive']
    return sum(1 for token in words if token in positive_lexicon)

In [7]:
def negative_score(words):
    """Negative score: how many entries of ``words`` occur in the negative lexicon.

    Args:
        words: Iterable of word tokens.

    Returns:
        The number of tokens found in ``MasterDictionary['Negative']``
        (repeated tokens are counted each time they occur), as a
        non-negative integer.
    """
    negative_lexicon = MasterDictionary['Negative']
    return sum(1 for token in words if token in negative_lexicon)

In [8]:
def polarity_score(positive_score, negative_score):
    """Polarity score: (positive - negative) / (positive + negative + 0.000001).

    Args:
        positive_score: Count of positive words.
        negative_score: Count of negative words.

    Returns:
        A float; the small constant in the denominator avoids a division by
        zero when both counts are 0.
    """
    difference = positive_score - negative_score
    total = positive_score + negative_score
    return difference / (total + 0.000001)

In [9]:
def subjectivity_score(positive_score, negative_score, words):
    """Subjectivity score: (positive + negative) / (number of words + 0.000001).

    Args:
        positive_score: Count of positive words.
        negative_score: Count of negative words.
        words: Sized collection of word tokens; only its length is used.

    Returns:
        A float; the small constant in the denominator avoids a division by
        zero when ``words`` is empty.
    """
    sentiment_total = positive_score + negative_score
    word_total = len(words)
    return sentiment_total / (word_total + 0.000001)

### Analysis of Readability
Readability is measured using the Gunning Fog index formula.

In [10]:
def avg_sent_len(text, words):
    """Average sentence length: number of words divided by number of sentences.

    Args:
        text: Text that is split into sentences with ``sent_tokenize``.
        words: Sized collection of word tokens; only its length is used.

    Returns:
        ``len(words)`` divided by the number of sentences found in ``text``.

    Raises:
        ZeroDivisionError: If ``sent_tokenize`` finds no sentence in ``text``.
    """
    sentence_total = len(sent_tokenize(text))
    return len(words) / sentence_total

In [11]:
def percentage_of_comp_words(words):
    """Percentage of complex words among ``words``.

    Args:
        words: Sized collection of word tokens.

    Returns:
        ``complex_words(words)`` as a percentage of ``len(words)``.

    Raises:
        ZeroDivisionError: If ``words`` is empty.
    """
    complex_total = complex_words(words)
    fraction = complex_total / len(words)
    return fraction * 100

def fog_index(avg_sent_length, percentage_of_comp_words):
    """Fog index: 0.4 * (average sentence length + percentage of complex words).

    Args:
        avg_sent_length: Average sentence length.
        percentage_of_comp_words: Percentage of complex words.

    Returns:
        The index as a float.
    """
    combined = avg_sent_length + percentage_of_comp_words
    return 0.4 * combined

### Complex Word Count

In [12]:
def count_syllables(word):
    """Estimate the number of syllables in a word.

    The estimate is the number of maximal runs of consecutive vowels
    (``a e i o u y``), compared case-insensitively.

    Args:
        word: The word as a ``str``; may be empty.

    Returns:
        The number of vowel runs; 0 for an empty string or a word without
        vowels.
    """
    vowels = 'aeiouy'
    count = 0
    previous_was_vowel = False
    # Tracking the previous character (instead of indexing word[0]) makes the
    # empty string a normal case rather than an IndexError.
    for char in word.lower():
        is_vowel = char in vowels
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    return count

In [13]:
def complex_words(words, min_syllables=3):
    """Count the complex words in ``words``.

    A word is complex when ``count_syllables`` estimates at least
    ``min_syllables`` syllables for it. The default of 3 follows the Gunning
    Fog definition (words of three or more syllables), which is the
    readability formula this notebook states it uses.

    Args:
        words: Iterable of word tokens.
        min_syllables: Minimum estimated syllable count for a word to be
            considered complex.

    Returns:
        The number of complex words.
    """
    return sum(1 for word in words if count_syllables(word) >= min_syllables)

In [14]:
def word_counts(words_):
    """Count the entries of ``words_`` that are not NLTK English stop words.

    Args:
        words_: Iterable of word tokens.

    Returns:
        The number of tokens not contained in
        ``nltk.corpus.stopwords.words('english')``.
    """
    # Imported locally so the function does not depend on a later cell having
    # bound the module-level name `stopwords`, nor on that name still
    # referring to the NLTK corpus reader when the function is called.
    from nltk.corpus import stopwords

    stop_words = set(stopwords.words('english'))
    return sum(1 for word in words_ if word not in stop_words)

### Syllable Count Per Word
The number of syllables in each word is estimated by counting the vowels present in the word. Some exceptions are handled as well: for words ending in "es" or "ed", the ending is not counted as a syllable.

In [15]:
def syllable_count_per_word(words):
    """Average estimated syllable count per word.

    For each word the estimate is the number of maximal runs of consecutive
    vowels (``a e i o u y``, case-insensitive), minus one when the word ends
    in "es" or "ed", with a minimum of one syllable per word.

    Args:
        words: Sized collection of word tokens.

    Returns:
        The sum of the per-word estimates divided by ``len(words)``.

    Raises:
        ZeroDivisionError: If ``words`` is empty.
    """
    vowels = 'aeiouy'
    total = 0
    for word in words:
        lowered = word.lower()

        # Count vowel runs for this word only; the counter is reset per word
        # so the adjustments below apply to the word, not the running total.
        count = 0
        previous_was_vowel = False
        for char in lowered:
            is_vowel = char in vowels
            if is_vowel and not previous_was_vowel:
                count += 1
            previous_was_vowel = is_vowel

        # str.endswith needs a tuple to test several suffixes;
        # `'es' or 'ed'` evaluates to just 'es'.
        if lowered.endswith(('es', 'ed')):
            count -= 1

        # Every word contributes at least one syllable.
        total += max(count, 1)

    return total / len(words)

### Personal Pronouns

In [16]:
def count_personal_pronouns(text):
    """Count personal pronouns in ``text``.

    The pronouns counted are: I, we, my, ours, he, she, us, him, they, them.
    Matching is on whole words and case-insensitive, so pronouns are found
    both in lowercased text and at the start of a sentence. The all-caps
    token "US" is not counted, since in mixed-case text it is the country
    abbreviation rather than the pronoun.

    Args:
        text: The text to search, as a ``str``.

    Returns:
        The number of pronoun occurrences.
    """
    pattern = r'\b(?:I|we|my|ours|he|she|us|him|they|them)\b'
    matches = re.findall(pattern, text, flags=re.IGNORECASE)
    return sum(1 for match in matches if match != 'US')

### Average Word Length

In [17]:
def avg_word_length(words):
    """Average number of characters per word.

    Args:
        words: Sized collection of word tokens.

    Returns:
        Total character count of all words divided by ``len(words)``.

    Raises:
        ZeroDivisionError: If ``words`` is empty.
    """
    total_characters = sum(len(token) for token in words)
    return total_characters / len(words)

# Calculating Variables

In [18]:
def extract_variables(text):
    """Compute all sentiment and readability variables for one text.

    The text is cleaned with ``clean_text`` and tokenised with
    ``remove_stopwords``; every metric is then derived from the cleaned text
    and/or the resulting tokens.

    Args:
        text: The raw text.

    Returns:
        A list of twelve values, in this order: positive score, negative
        score, polarity score, subjectivity score, average sentence length,
        percentage of complex words, fog index, complex word count, word
        count, syllables per word, personal pronoun count, average word
        length.
    """
    cleaned = clean_text(text)
    tokens = remove_stopwords(cleaned)

    # Insertion order of this dict is the order of the returned list.
    metrics = {}
    metrics['positive'] = positive_score(tokens)
    metrics['negative'] = negative_score(tokens)
    metrics['polarity'] = polarity_score(metrics['positive'], metrics['negative'])
    metrics['subjectivity'] = subjectivity_score(metrics['positive'], metrics['negative'], tokens)

    metrics['sentence_length'] = avg_sent_len(cleaned, tokens)
    metrics['complex_percentage'] = percentage_of_comp_words(tokens)
    metrics['fog'] = fog_index(metrics['sentence_length'], metrics['complex_percentage'])

    metrics['complex_count'] = complex_words(tokens)
    metrics['word_count'] = word_counts(tokens)
    metrics['syllables_per_word'] = syllable_count_per_word(tokens)
    metrics['pronouns'] = count_personal_pronouns(cleaned)
    metrics['word_length'] = avg_word_length(tokens)

    return list(metrics.values())

In [19]:
# Score every review in the input CSV and write one row of variables per
# review to rev.csv.

# Binds the NLTK stop-word corpus reader to the module-level name `stopwords`
# before any review is processed.
from nltk.corpus import stopwords

# NOTE(review): absolute, machine-specific path — consider a path relative to
# the notebook or a configurable data directory.
data = pd.read_csv(r'C:\Users\gowda\Jupiter Projects\reviews.csv')

header = ['POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']
root_path = 'Articles'

# (position, error) for every review that could not be scored.
skipped = []

# Opened with 'w' rather than 'a': the header is written on every run, so
# appending would leave duplicate header lines and duplicate rows in the file
# each time the cell is re-run. The `with` block closes the file even if an
# error escapes the loop.
with open('rev.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(header)

    for position, review in enumerate(data['desciption']):
        try:
            variables = extract_variables(review)
        except Exception as error:
            # Best effort: a review that cannot be scored (e.g. a missing or
            # empty value) is skipped, but recorded instead of silently lost.
            skipped.append((position, repr(error)))
            continue
        csv_writer.writerow(variables)

if skipped:
    print(f'Skipped {len(skipped)} of {len(data)} reviews; first failures: {skipped[:5]}')