In [None]:
# Mount Google Drive into the Colab VM at /content/gdrive so the project
# folder stored on Drive can be reached from this notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Switch the working directory to the project folder on the mounted Drive;
# the relative `data/...` paths used in the cells below are resolved against it.
%cd /content/gdrive/My Drive/ML-in-colab/Sentiment_analysis_amazon_fine_food_review

/content/gdrive/My Drive/ML-in-colab/Sentiment_analysis_amazon_fine_food_review


In [None]:
import os
# Point the Kaggle CLI at a config directory on the mounted Drive
# (presumably where kaggle.json is stored — confirm the file is present there).
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/ML-in-colab"

In [None]:
# Download the snap/amazon-fine-food-reviews dataset from Kaggle into ./data
# (-d names the dataset, -p sets the download directory).
!kaggle datasets download -d snap/amazon-fine-food-reviews -p data

Downloading amazon-fine-food-reviews.zip to data
 97% 235M/242M [00:01<00:00, 140MB/s]
100% 242M/242M [00:01<00:00, 140MB/s]


In [None]:
# List the data folder to confirm that the archive was downloaded.
!ls data

amazon-fine-food-reviews.zip


In [None]:
# Extract every zip archive found in data/ into data/.
# The backslash keeps the shell from expanding `*`, so the pattern is passed
# to unzip unchanged.
!unzip data/\*.zip -d data

Archive:  data/amazon-fine-food-reviews.zip
  inflating: data/Reviews.csv        
  inflating: data/database.sqlite    
  inflating: data/hashes.txt         


In [None]:
# List the data folder again to see the extracted files.
!ls data

amazon-fine-food-reviews.zip  database.sqlite  hashes.txt  Reviews.csv


In [None]:
# Delete the zip archive(s) from data/ now that their contents are extracted.
!rm data/*.zip

In [None]:
# Confirm that only the extracted files remain in data/.
!ls data

database.sqlite  hashes.txt  Reviews.csv


In [None]:
#importing required libraries
import pandas as pd
import re
from bs4 import BeautifulSoup
import csv

In [None]:
# Read the reviews CSV (path relative to the working directory) into a
# pandas DataFrame.
file = pd.read_csv('data/Reviews.csv')

In [None]:
# Count the reviews by streaming the CSV row by row.
# The handle is named `csv_file` so that it does not rebind `file`, which holds
# the DataFrame loaded above and is used again by the following cells.
# newline='' lets the csv module handle line breaks inside quoted review text.
with open('data/Reviews.csv', newline='') as csv_file:
  reader = csv.reader(csv_file)
  next(reader, None)  # skip the header row so that only reviews are counted
  reviews = sum(1 for _ in reader)
print("Total number of Reviews is: {}".format(reviews))

Total number of Reviews is: 568455


In [None]:
# Preview the first rows of the loaded reviews.
file.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
# Keep only the columns needed for sentiment classification.
# Only the review body is used for now; the Summary column may be added later.
# .copy() makes `data` an independent DataFrame, so the column assignments in
# the following cells modify it directly instead of a slice of `file`
# (this avoids pandas' SettingWithCopyWarning).
data = file[['Text','Score']].copy()
data.head()

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


In [None]:
# Scores range from 1 to 5.
# A score of 3 is treated as neutral (neither positive nor negative), so those
# reviews are dropped. This has to happen BEFORE the mapping below: map() turns
# every value missing from the dict (including 3) into NaN, after which a
# `!= 3` filter would no longer remove anything.
data = data[data['Score'] != 3].copy()
# Scores 1 and 2 are negative reviews (label 0); 4 and 5 are positive (label 1).
data['Score'] = data['Score'].map({1:0, 2:0, 4:1, 5:1})

In [None]:
# Number of rows currently held in `data`.
print("total no of reviews now: {}".format(len(data)))

total no of reviews now: 525814


In [None]:
# Cast the Score labels to an integer dtype and preview the result.
data['Score'] = data['Score'].astype(int)
data.head()

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,1
1,Product arrived labeled as Jumbo Salted Peanut...,0
2,This is a confection that has been around a fe...,1
3,If you are looking for the secret ingredient i...,0
4,Great taffy at a great price. There was a wid...,1


Text Preprocessing

In [None]:
# English stop words, plus 'br' (presumably a leftover of <br /> tags in the
# review text — confirm). Negations such as 'no' and 'not' are not in the list.
stopwords = {
    'br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
    'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
    'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',
    'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll",
    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
    'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
    'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only',
    'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
    'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o',
    're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn',
    "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
    "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't",
    'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
    "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
}

In [None]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The two irregular forms are handled first ("won't" -> "will not",
    "can't" -> "can not"); afterwards the generic suffixes n't, 're, 's, 'd,
    'll, 't, 've and 'm are expanded, in that order.

    The irregular forms are matched case-insensitively. Without that, a
    capitalised "Won't" / "Can't" falls through to the generic ``n't`` rule and
    becomes "Wo not" / "Ca not". The expansion is capitalised when the matched
    text starts with an upper-case letter.

    Every ``'s`` is rewritten to " is", so possessives ("dog's bowl") are
    expanded as well.

    Args:
        phrase: Text to expand.

    Returns:
        ``phrase`` with the contractions listed above replaced.
    """
    def _keep_case(expansion):
        # Build a re.sub callback that capitalises the expansion when the
        # matched contraction begins with an upper-case letter.
        def _replace(match):
            if match.group(0)[0].isupper():
                return expansion.capitalize()
            return expansion
        return _replace

    # specific
    phrase = re.sub(r"won't", _keep_case("will not"), phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"can\'t", _keep_case("can not"), phrase, flags=re.IGNORECASE)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

def text_preprocess(df, text_column_to_preprocess):
  """Clean the text stored in one column of ``df``.

  Each entry is processed in this order: URLs are removed, HTML markup is
  stripped with BeautifulSoup, contractions are expanded with decontracted(),
  tokens containing a digit are dropped, every run of non-letters is replaced
  by a single space, and the words are lower-cased and joined by single spaces.

  Args:
      df: DataFrame holding the text column. The column is overwritten in
          place on this object.
      text_column_to_preprocess: Name of the column to clean.

  Returns:
      The same ``df`` object, with the column replaced by the cleaned text.
  """
  from tqdm import tqdm

  # Patterns are compiled once, outside the loop. Raw strings keep \S and \d
  # as regex escapes; in a normal string literal they are invalid escape
  # sequences, which newer Python versions warn about.
  url_pattern = re.compile(r"http\S+")
  digit_token_pattern = re.compile(r"\S*\d\S*")
  non_letter_pattern = re.compile('[^A-Za-z]+')

  preprocessed_reviews = []

  for sentance in tqdm(df[text_column_to_preprocess].values):
      sentance = url_pattern.sub("", sentance)
      sentance = BeautifulSoup(sentance, 'lxml').get_text()
      sentance = decontracted(sentance)
      sentance = digit_token_pattern.sub("", sentance).strip()
      sentance = non_letter_pattern.sub(' ', sentance)
      # Stop-word list reference: https://gist.github.com/sebleier/554280
      # Stop words are NOT removed for now; the filter is left disabled.
      sentance = ' '.join(e.lower() for e in sentance.split()) # if e.lower() not in stopwords)
      preprocessed_reviews.append(sentance.strip())

  df[text_column_to_preprocess] = preprocessed_reviews
  return df

In [None]:
# Clean the 'Text' column. text_preprocess overwrites the column on `data`
# and returns that same DataFrame.
preprocessed_reviews = text_preprocess(data, 'Text')

100%|██████████| 525814/525814 [03:56<00:00, 2219.08it/s]


In [None]:
# Spot-check one randomly chosen cleaned review.
preprocessed_reviews.sample()

Unnamed: 0,Text,Score
182195,i have been using this product for more that y...,1


In [None]:
# Number of positive reviews (Score == 1).
print("Positive Reviews:",len(preprocessed_reviews[preprocessed_reviews['Score']==1]))

Positive Reviews: 443777


In [None]:
# Number of negative reviews (Score == 0).
print("Negative Reviews:",len(preprocessed_reviews[preprocessed_reviews['Score']==0]))

Negative Reviews: 82037


We can see that the data is highly imbalanced, so we will oversample the negative class by duplicating the existing negative reviews.

In [None]:
# Select the negative reviews (Score == 0); copies of these rows are added
# back to the dataset in the next cell.
negative_review_df = preprocessed_reviews[preprocessed_reviews['Score']==0]

In [None]:
# Oversample the negative class: add four extra copies of the negative reviews,
# so each of them ends up in the dataset five times.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
preprocessed_reviews = pd.concat(
    [preprocessed_reviews] + [negative_review_df] * 4,
    ignore_index=True,
)

In [None]:
# Shuffle the DataFrame rows.
# A fixed random_state makes the row order, and therefore the CSV written
# afterwards, reproducible across runs.
preprocessed_reviews = preprocessed_reviews.sample(frac = 1, random_state = 42)

In [None]:
# Write the preprocessed reviews to disk; index=False leaves the row index
# out of the CSV.
preprocessed_reviews.to_csv('data/preprocessed_reviews.csv', index=False )

Preprocessing Done


In [None]:
# Final number of negative reviews (Score == 0).
len(preprocessed_reviews[preprocessed_reviews['Score']==0])

410185

In [None]:
# Final number of positive reviews (Score == 1).
len(preprocessed_reviews[preprocessed_reviews['Score']==1])

443777