# Dataset Parser
We need to parse the original dataset br.csv in order to label each review as positive, negative or neutral, and to remove user reviews that are empty, that contain characters that cannot be encoded, that appear more than once, that are not in English, or that lack a user rating score (without a rating we cannot assign the label that our supervised naïve Bayes learning algorithm needs).

In [None]:
import pandas as pd
import numpy as np
import csv
import re
import os
from textblob import TextBlob


#Stage 1: copy br.csv into temp.csv, cleaning the text fields and appending a sentiment label

#Index of the written-review column in a data row of br.csv
REVIEW_COLUMN = 8

#Every character matched here is replaced by a single space when a field is cleaned.
#Note that "$-;" inside the class is a range: it covers $ % & ' ( ) * + , - . / the digits, : and ;
SYMBOLS_TO_STRIP = re.compile(r'[•�,\n\[\].Ã=«*✿!·(©)"‘’¿“”€™?/ÃÏŸâœ@$-;:_#%&^{}+´ñÑ¨1234567890<≠>–]')

#English contractions expanded in the review text, applied in this order before symbols are stripped
CONTRACTIONS = (
    ("'ll", " will"),
    ("'ve", " have"),
    ("'d", " would"),
    ("'m", " am"),
    ("'re", " are"),
    ("n't", " not"),
)


def _clean_field(index, text):
    """Return the cleaned form of the field found at position ``index`` of a data row.

    The review column has its contractions expanded and its symbols replaced by
    spaces. Any other field is only stripped of symbols when it contains a comma;
    otherwise it is returned unchanged.
    """
    if index == REVIEW_COLUMN:
        for contraction, expansion in CONTRACTIONS:
            text = text.replace(contraction, expansion)
        return SYMBOLS_TO_STRIP.sub(' ', text)
    if "," in text:
        return SYMBOLS_TO_STRIP.sub(' ', text)
    return text


def _label_for_rating(rating_text):
    """Map a reviewer rating to a label: "0" for 3, "1" above 3, "-1" below 3.

    Raises:
        ValueError: if ``rating_text`` is not an integer literal.
    """
    rating = int(rating_text)
    if rating > 3:
        return "1"
    if rating < 3:
        return "-1"
    return "0"


def _is_encodable(text, encoding, errors):
    """Return True when ``text`` can be encoded with the output file's codec."""
    try:
        text.encode(encoding, errors)
    #UnicodeEncodeError is a subclass of ValueError
    except ValueError:
        return False
    return True


def parse_reviews(source_path="br.csv", target_path="temp.csv"):
    """Copy the labelled, cleaned rows of ``source_path`` into ``target_path``.

    A row is copied only when its last two fields (reviewer rating and written
    review) are both non-empty. The header row (first field ``bookID``) gains a
    ``label`` column; every data row gains its label as an extra last field.
    Rows that are blank or whose rating is not an integer are skipped instead
    of aborting the run. A field the output encoding cannot represent is left
    empty and reported on stdout.

    Args:
        source_path: UTF-8 CSV file to read (a leading BOM is tolerated).
        target_path: CSV file to create, written with the platform's default
            text encoding.
    """
    #utf-8-sig strips a leading byte-order mark so the "bookID" header is still recognised.
    #newline="" lets the csv module handle line endings inside quoted fields itself.
    with open(source_path, encoding='utf-8-sig', newline="") as source, \
            open(target_path, 'w', newline="") as target:
        #csv.writer quotes fields holding commas, quotes or line breaks so each row can be read back intact
        writer = csv.writer(target)
        for row in csv.reader(source):
            #csv.reader yields an empty list for a blank line; such rows have no rating or review
            if len(row) < 2:
                continue
            rating_text, review_text = row[-2], row[-1]
            #Without a rating we cannot label the row, without a review there is nothing to train on
            if not rating_text or not review_text:
                continue
            #The first line only needs the new column name, "label"
            if row[0] == "bookID":
                writer.writerow(row + ["label"])
                continue
            #Work out the label before writing anything, so a bad rating never leaves a partial row behind
            try:
                label = _label_for_rating(rating_text)
            except ValueError:
                print("Line with bookID", row[0], "was not copied because its rating is not an integer.")
                continue
            cleaned_row = []
            for index, text in enumerate(row):
                text = _clean_field(index, text)
                #If the text cannot be written with the file's encoding, leave that column empty
                if not _is_encodable(text, target.encoding, target.errors):
                    print("Line with bookID", row[0], "was not copied because of illegal characters.")
                    text = ""
                cleaned_row.append(text)
            writer.writerow(cleaned_row + [label])


if __name__ == "__main__":
    parse_reviews("br.csv", "temp.csv")
    print("Parsing Terminated.")
#Stage 2: copy temp.csv into dataset.csv, keeping only unique, non-empty, English reviews

def _detect_english(text):
    """Return True when TextBlob reports ``text`` as English.

    NOTE(review): TextBlob deprecated ``detect_language()`` in 0.16 and removed it
    in 0.18 - confirm the installed version still provides it, or pass another
    ``is_english`` callable to ``build_dataset``.
    """
    return TextBlob(text).detect_language() == "en"


def build_dataset(source_path="temp.csv", target_path="dataset.csv", is_english=None):
    """Filter the labelled rows of ``source_path`` into ``target_path``.

    Each row is expected to end with the review text followed by its label.
    The header row (first field ``bookID``) is copied unconditionally. A data
    row is copied when its review has not been seen before, contains something
    other than whitespace, and is accepted by ``is_english``.

    Args:
        source_path: CSV file produced by the parsing stage, read with the
            platform's default text encoding.
        target_path: CSV file to create.
        is_english: callable taking the review text and returning True for
            English reviews. Defaults to TextBlob's language detection.

    Returns:
        A tuple ``(positive, negative, neutral)`` with the number of copied
        rows labelled "1", "-1" and "0" respectively.
    """
    if is_english is None:
        is_english = _detect_english
    copied_per_label = {"1": 0, "-1": 0, "0": 0}
    #Reviews already processed, so that a repeated review is never added twice
    seen_reviews = set()
    with open(source_path, newline="") as source, open(target_path, "w", newline="") as target:
        #csv.writer quotes fields holding commas, quotes or line breaks so each row can be read back intact
        writer = csv.writer(target)
        for row in csv.reader(source):
            #A blank or truncated line has no review/label pair to inspect
            if len(row) < 2:
                continue
            #The column names are kept as they are; they are not a review to be checked
            if row[0] == "bookID":
                writer.writerow(row)
                continue
            review, label = row[-2], row[-1]
            if review in seen_reviews:
                continue
            #Remember the review even if it is rejected below, so its duplicates are skipped too
            seen_reviews.add(review)
            #Check if the row contains a written review (to train the classifier)
            if not review.split():
                continue
            #Check if that review is in English
            if not is_english(review):
                continue
            print("Wait, we're copying line with bookID", row[0])
            if label in copied_per_label:
                copied_per_label[label] += 1
            writer.writerow(row)
    return copied_per_label["1"], copied_per_label["-1"], copied_per_label["0"]


if __name__ == "__main__":
    positive_reviews_count, negative_reviews_count, neutral_reviews_count = build_dataset("temp.csv", "dataset.csv")
    row_count = positive_reviews_count + negative_reviews_count + neutral_reviews_count
    print("\nDataset Ready.")
    os.remove("temp.csv")
    print("\nStatistics:")
    print("Number of positive reviews in dataset:", positive_reviews_count)
    print("Number of negative reviews in dataset:", negative_reviews_count)
    print("Number of neutral reviews in dataset:", neutral_reviews_count)
    print("Total number of rows in dataset:", row_count) #The first row with the column names doesn't count
