# Subjectivity Detection

### 1. Gathering Data

In [None]:
import pandas as pd

# Location of the sample dataset, relative to this notebook's directory.
csv_file = '../../data/test_sample.csv'

# Load the dataset; later cells read `df` and add a column to it.
df = pd.read_csv(csv_file)

# Preview the column the classifier consumes. (A bare `df` used to precede
# this line, but only the last expression of a cell is displayed, so it had
# no effect and was removed.)
df["lemmatized"]


### 2. Subjectivity Detection Example

In [None]:
import sys
# NOTE(review): absolute path specific to one machine; the project helpers
# imported below are only importable where this directory exists.
sys.path.append('/home/mca/Opinion-Mining-Project/src')

# Project-local helpers: word-level tokenizer and sentence splitter.
from tokenization_utils import tokenize_sentence 
from preprocess_text import better_sentence_splitter
# from training_model import get_obj_docs_extra,get_subj_docs_extra
# nltk.corpus.subjectivity is an NLTK corpus of sentences extracted from movie
# reviews, each labelled as subjective or objective.
from nltk.corpus import subjectivity
# NaiveBayesClassifier is a simple probabilistic classifier from NLTK; it is
# trained on those labelled sentences so it can classify new sentences as
# subjective or objective.
from nltk.classify import NaiveBayesClassifier
# NOTE(review): `accuracy` is not used in the cells shown here.
from nltk.classify.util import accuracy
# sent_tokenize splits raw multi-sentence text into individual sentences.
# NOTE(review): it is not used in the cells shown here, which call
# better_sentence_splitter instead.
from nltk.tokenize import sent_tokenize
import nltk
# Fetch the subjectivity corpus so subjectivity.sents() below can read it.
nltk.download('subjectivity')
# Labelled training data taken from the NLTK subjectivity corpus. Every entry
# is a (sentence, label) pair: 'subj' marks an opinion, 'obj' marks a fact.
# The classifier learns to tell the two apart from these examples.
subj_docs = [(tokens, 'subj') for tokens in subjectivity.sents(categories='subj')]
obj_docs = [(tokens, 'obj') for tokens in subjectivity.sents(categories='obj')]

# Hand-written subjective examples that supplement the corpus data. Each
# phrase is split on whitespace into its token list and paired with the
# 'subj' label, giving the same (tokens, label) shape as the corpus entries.
additional_subj_docs = [
    (phrase.split(), 'subj')
    for phrase in (
        'brand guy rock keep',
        'thanks brand quick support',
        'awesome job',
        'love this',
        'great service',
        'really liked it',
        'highly recommended',
        'fantastic work',
        'well done',
        'perfect experience',
        'excellent product',
        'very happy with this',
        'not what I expected',
        'disappointed with service',
        'highly suggest to try',
        'would buy again',
        'terrible quality',
        'best purchase ever',
        'not recommend',
        'loved it',
    )
]
# Full subjective training set: corpus sentences first, then the hand-written examples.
subj_docs_extended = [*subj_docs, *additional_subj_docs]
# Feature extractor example: presence of words
def extract_features(words):
    """Build a word-presence feature dict from a tokenized sentence.

    Every token is lower-cased and mapped to ``True``; tokens that differ
    only by case collapse into a single key.

    Example:
        extract_features(['The', 'movie', 'was', 'Great'])
        -> {'the': True, 'movie': True, 'was': True, 'great': True}
    """
    features = {}
    for token in words:
        features[token.lower()] = True
    return features
# Prepare training data: combine every labelled sentence (subjective, including
# the hand-written extras, plus objective) and convert each one into a
# (feature dict, label) pair via extract_features.
train = [(extract_features(doc), label) for doc, label in subj_docs_extended + obj_docs]

# Train the Naive Bayes classifier on the word-presence features.
classifier = NaiveBayesClassifier.train(train)

# Classify every sentence of every row. all_labels[i] is the list of labels
# for row i, in sentence order.
all_labels = []
# Iterate the column directly: only 'lemmatized' is read, so iterrows() would
# build a throwaway Series per row and an index that is never used.
for lemmatized_text in df['lemmatized']:
    print("l",lemmatized_text)
    sentences = better_sentence_splitter(lemmatized_text)
    print("sentences : ",sentences)
    sentence_labels = []
    for sentence in sentences:
        words = tokenize_sentence(sentence)   # word tokenizer
        features = extract_features(words)
        print("after extracting features : ", features)
        label = classifier.classify(features)
        sentence_labels.append(label)
        print(f"{label.upper():>4} → {sentence}")
    all_labels.append(sentence_labels)

# One list of per-sentence labels per row, aligned with df's row order.
df['sentence_subjectivity'] = all_labels
df


### 3. Save to CSV

In [6]:
# Overwrites the same file that section 1 reads. index=False keeps the row
# index out of the file; otherwise each read/write cycle of this notebook
# would add another unnamed index column to the data.
df.to_csv('../../data/test_sample.csv', index=False)
