# Sentiment Analysis

In [24]:
import os
import re
import pandas as pd
from nltk.tokenize import word_tokenize


In [25]:
def read_words_from_file(file_path):
    """Load a word list from a text file that holds one entry per line.

    Every line is stripped of surrounding whitespace, and lines that are
    empty after stripping are ignored. The file is decoded as Latin-1.

    Args:
        file_path: Path of the word-list file to read.

    Returns:
        set: The distinct, non-empty, stripped lines of the file.
    """
    unique_words = set()
    with open(file_path, 'r', encoding='latin-1') as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry:
                unique_words.add(entry)
    return unique_words


In [26]:
def find_special_characters(words):
    """Collect every non-alphanumeric character that occurs in the given words.

    Args:
        words: Iterable of strings to inspect.

    Returns:
        set: Each character for which ``str.isalnum()`` is false, taken
        across all words (empty when no such character is present).
    """
    return {
        symbol
        for entry in words
        for symbol in entry
        if not symbol.isalnum()
    }


In [27]:
def clean_text(text):
    """Remove every character except ASCII letters, whitespace, '+', '-' and '*'.

    Digits, punctuation and non-ASCII letters are dropped; nothing is
    substituted in their place, so surrounding whitespace is left as-is.

    Args:
        text: The raw string to clean.

    Returns:
        str: ``text`` with all disallowed characters deleted.
    """
    # '+', '-' and '*' are kept alongside letters and whitespace.
    disallowed = re.compile(r'[^A-Za-z\s\+\-\*]')
    return disallowed.sub('', text)

In [28]:
def calculate_scores(words, positive_lexicon=None, negative_lexicon=None):
    """Compute lexicon-based sentiment scores for a sequence of tokens.

    Matching is exact and case-sensitive: a token counts only if it is
    contained in the lexicon as-is.

    Args:
        words: Sequence of tokens; must support ``len()`` and repeated
            iteration (e.g. a list).
        positive_lexicon: Optional collection of positive words. When
            omitted, the module-level ``positive_words`` is used.
        negative_lexicon: Optional collection of negative words. When
            omitted, the module-level ``negative_words`` is used.

    Returns:
        tuple: ``(positive_score, negative_score, polarity_score,
        subjectivity_score)`` where

        * ``positive_score`` / ``negative_score`` are the numbers of tokens
          found in the respective lexicon,
        * ``polarity_score = (pos - neg) / (pos + neg + 1e-6)``,
        * ``subjectivity_score = (pos + neg) / (len(words) + 1e-6)``.
    """
    # Fall back to the notebook-level lexicons only when none are supplied,
    # so the function can also be used (and tested) without global state.
    if positive_lexicon is None:
        positive_lexicon = positive_words
    if negative_lexicon is None:
        negative_lexicon = negative_words

    # Small constant that keeps both divisions defined when a denominator is 0.
    epsilon = 0.000001

    positive_score = sum(1 for word in words if word in positive_lexicon)
    negative_score = sum(1 for word in words if word in negative_lexicon)

    matched_total = positive_score + negative_score
    polarity_score = (positive_score - negative_score) / (matched_total + epsilon)
    subjectivity_score = matched_total / (len(words) + epsilon)

    return positive_score, negative_score, polarity_score, subjectivity_score

In [29]:
def process_file(file_path):
    """Read one UTF-8 text file and return its sentiment scores.

    The file content is passed through ``clean_text``, tokenized with
    ``word_tokenize`` and scored with ``calculate_scores``.

    Args:
        file_path: Path of the text file to score.

    Returns:
        dict: The four scores keyed by 'Positive Score', 'Negative Score',
        'Polarity Score' and 'Subjectivity Score'.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    tokens = word_tokenize(clean_text(raw_text))
    n_positive, n_negative, polarity, subjectivity = calculate_scores(tokens)

    scores = {}
    scores['Positive Score'] = n_positive
    scores['Negative Score'] = n_negative
    scores['Polarity Score'] = polarity
    scores['Subjectivity Score'] = subjectivity
    return scores

In [30]:
def process_output(output_folder):
    """Score every ``.txt`` file in a folder and collect the results.

    Args:
        output_folder: Directory whose entries are scanned. Only entries
            whose name ends with ".txt" are processed.

    Returns:
        pandas.DataFrame: One row per processed file, holding a 'File'
        column with the file name plus the keys returned by
        ``process_file``. Rows are ordered by file name. The frame is empty
        when the folder contains no ``.txt`` entries.
    """
    results = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and therefore any CSV written from it) reproducible.
    for filename in sorted(os.listdir(output_folder)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(output_folder, filename)
        results.append({'File': filename, **process_file(file_path)})

    return pd.DataFrame(results)

In [31]:

# Load both sentiment lexicons (one word per line) into sets.
positive_words = read_words_from_file('positive-words.txt')
negative_words = read_words_from_file('negative-words.txt')

# Find the non-alphanumeric characters that occur in each lexicon.
special_characters_positive = find_special_characters(positive_words)
special_characters_negative = find_special_characters(negative_words)

# Report them, positive lexicon first.
print("Special characters in positive words:", special_characters_positive)
print("Special characters in negative words:", special_characters_negative)


Special characters in positive words: {'-', '+'}
Special characters in negative words: {'-', '*'}


In [32]:
# Despite its name, `input_file` holds the folder whose .txt files are scored;
# it is passed to process_output as `output_folder`.
input_file = 'output_no_stopwords'

# One row of scores per .txt file found in that folder.
df_results = process_output(input_file)

# Save the scores as CSV without the DataFrame index column.
df_results.to_csv('file1.csv', index=False)

# Last expression of the cell: displays the results table.
df_results

Unnamed: 0,File,Positive Score,Negative Score,Polarity Score,Subjectivity Score
0,blackassign_blackassign0001.txt,31,7,0.631579,0.064189
1,blackassign_blackassign0002.txt,58,30,0.318182,0.113695
2,blackassign_blackassign0003.txt,41,24,0.261538,0.088919
3,blackassign_blackassign0004.txt,42,75,-0.282051,0.165957
4,blackassign_blackassign0005.txt,23,9,0.437500,0.069869
...,...,...,...,...,...
93,blackassign_blackassign0096.txt,31,55,-0.279070,0.130501
94,blackassign_blackassign0097.txt,27,36,-0.142857,0.120921
95,blackassign_blackassign0098.txt,6,1,0.714286,0.034314
96,blackassign_blackassign0099.txt,16,4,0.600000,0.056818
