# About the dataset:

Sourced from [cs.uic.edu](https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html)
- Currently analyzing [Customer Review Datasets (5 products)](http://www.cs.uic.edu/~liub/FBS/CustomerReviewData.zip)
- Contains reviews for 5 products:
	1. digital camera: Canon G3
	2. digital camera: Nikon coolpix 4300
	3. celluar phone:  Nokia 6610
	4. mp3 player:     Creative Labs Nomad Jukebox Zen Xtra 40GB
	5. dvd player:     Apex AD2600 Progressive-scan DVD player

# Preprocessing

## Parsing raw text files

Note:
- One sentence within text file does not have a ## to split on, mistakenly only have one pound sign #. Found on line number: 485

In [39]:
import re
import pandas as pd

raw_container_path = 'raw_data/customer review data/'

file_name_dict = {
    'canon_g3': "Canon G3.txt",
    'nikon_coolpix_4300': "Nikon coolpix 4300.txt",
    'nokia_6610': "Nokia 6610.txt",
    'nomad_jukebox_zen_xtra': "Creative Labs Nomad Jukebox Zen Xtra 40GB.txt",
    'apex_ad2600_dvd_player': "Apex AD2600 Progressive-scan DVD player.txt",
}

# Working on the parsing of the annotations part: Testing out how I can parse this annotations section to extract information
def parse_annotations(annotations_part):
    feature_sentiment_dict = {}
    feature_sentiment_matches =  re.findall(r'(\w+)\[(\+|-)(\d)\]', annotations_part) # need to rewrite this regex
    for match in feature_sentiment_matches:
        feature_name = match[0]
        if match[1] == '+':
            sentiment = int(match[2])
        elif match[1] == '-':
            sentiment = int(match[2]) * -1
        else:
            raise Exception("Invalid sentiment: " + match[1])
        feature_sentiment_dict[feature_name] = sentiment
    return feature_sentiment_dict

print(parse_annotations("video output[-3]"))

def parse_reviews(file_content):
    reviews = re.split(r'\[t\]', file_content) # Split the content by the review title tag [t]
    reviews = reviews[1:] # Skip header by skipping to the first [t] tag
    # display(reviews[0])
    # display(reviews[1])

    data = []
    for review in reviews:
        # 1. Remove leading and trailing whitespace from review
        # 2. Split into a list of individual lines by '\n'
        # 3. Remove leading and trailing whitespace from individual line
        lines = [line.strip() for line in review.strip().split(sep = '\n')]
        
        title = lines[0] # First line of each review is the review title
        sentences = lines[1:] # The rest are sentences
        
        for sentence in sentences:
            # Split annotations and sentence text
            # The annotations are before '##', the sentence text is after
            if '##' in sentence:
                annotations_part, sentence_text = sentence.split(sep = '##')
            elif '#' in sentence:
                annotations_part, sentence_text = sentence.split(sep = '#')
            else:
                raise Exception("Sentence does not contain a pound sign: " + sentence)

            # Append the data
            data.append({
                'title': title,
                'sentence': sentence_text.strip(),
                'annotations': annotations_part
            })
    
    # Create DataFrame
    df = pd.DataFrame(data)
    return df

# Starting with the apex:
with open(raw_container_path + file_name_dict['apex_ad2600_dvd_player'], 'r') as f:
    content = f.read()
    # print(content)
with open(raw_container_path + file_name_dict['apex_ad2600_dvd_player'], 'r') as f:
    print(len(f.readlines()))

df = parse_reviews(content)

{'output': -3}
850


In [40]:
target_sentence = "run[+3], dvd media[+2]#apex ad-2600 runs all the dvd media including dvd + r/rw and dvd-r / rw ( unlike sony or panasonic - one supports only + r/rw and another supports - r/rw ) . "

with open(raw_container_path + file_name_dict['apex_ad2600_dvd_player'], 'r') as f:
    for line_num, line in enumerate(f, start=1):
        # Check if the target sentence is in the current line
        if target_sentence in line:
            print(f"Found target sentence on line {line_num}")
            break
    else:
        print("Target sentence not found in the file.")

Found target sentence on line 485


In [41]:
df.__len__()

740