**Importing libraries**

In [1]:
#imports

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from google.colab import drive
import nltk
import re
import string
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Mount Google Drive at /content/drive so files stored on Drive can be accessed by path.
drive.mount('/content/drive')

Mounted at /content/drive


**DataFrame preprocessing**

In [3]:
# Maps a star rating onto a sentiment class: positive (2), neutral (1) or negative (0).
def classify(x):
    """Return the sentiment class label for a single review rating.

    Ratings of 4.0 or 5.0 are positive (2), a rating of 3.0 is neutral (1),
    and every other value is treated as negative (0).
    """
    if x == 3.0:
        return 1
    return 2 if (x == 4.0 or x == 5.0) else 0

In [4]:
def clean_dataframe(df):
  """Return a cleaned copy of the raw review frame, ready for text processing.

  Steps:
    1. add the output variable ``y`` by applying ``classify`` to ``overall``;
    2. drop the columns that are not needed for the analysis;
    3. drop every row whose ``reviewText`` is NaN.

  The input frame is left untouched. The previous version assigned ``y``
  directly on the argument, which silently modified the caller's DataFrame
  and made re-running the cell non-idempotent.

  Args:
    df: DataFrame containing ``overall``, ``reviewText`` and the columns
      listed in ``unused_columns`` below.

  Returns:
    A new DataFrame with the unused columns removed, the ``y`` label column
    added, and rows without review text dropped.
  """
  # Columns that are unnecessary for the analysis.
  unused_columns = ['Unnamed: 0', 'verified', 'asin', 'style', 'reviewerName',
                    'description', 'title', 'rank', 'main_cat']

  # assign() builds a new frame instead of writing into the caller's one.
  labelled = df.assign(y=df['overall'].apply(classify))
  trimmed = labelled.drop(labels=unused_columns, axis=1)

  # Reviews without any text cannot be processed further.
  return trimmed.dropna(axis=0, subset=['reviewText'])

**Text preprocessing**

In [5]:
# Strips every character found in string.punctuation, i.e. '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
def remove_punctuation(text):
    """Return ``text`` with all punctuation characters removed.

    Characters are deleted, not replaced by a space, so words separated only
    by punctuation end up joined together.
    """
    punctuation_marks = string.punctuation
    kept_chars = []
    for char in text:
        if char in punctuation_marks:
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

In [6]:
def text_process(df):
  """Normalise the review text and add a stemmed, stop-word-free column.

  ``reviewText`` is lower-cased and stripped of punctuation to keep the text
  uniform. ``cleaned_reviews`` is then derived from it by replacing every
  non-letter character with a space, dropping English stop words (words like
  "the", "i", "our") and stemming the remaining tokens with the Porter stemmer.

  The work is done on a copy: the previous version overwrote ``reviewText``
  and added ``cleaned_reviews`` directly on the argument, modifying the
  caller's DataFrame.

  Args:
    df: DataFrame with a ``reviewText`` column of strings (no NaN values).

  Returns:
    A new DataFrame with the normalised ``reviewText`` and the additional
    ``cleaned_reviews`` column.
  """
  df = df.copy()

  # Removing all the punctuation and lower-casing the text to maintain uniformity.
  df['reviewText'] = df['reviewText'].apply(lambda x: remove_punctuation(x.lower()))

  stemmer = PorterStemmer()
  # A frozenset gives constant-time membership tests; the NLTK list would be
  # scanned linearly for every single token.
  # NOTE(review): punctuation has already been stripped at this point, so stop
  # words that contain an apostrophe (e.g. "don't") can no longer match their
  # stripped form ("dont") - confirm whether those tokens should be filtered too.
  stop_words = frozenset(stopwords.words("english"))
  non_letters = re.compile("[^a-zA-Z]")

  def _stem_and_filter(text):
    # Text is already lower-case and the Porter stemmer lower-cases its output
    # by default, so no further lower() call is needed on the joined result.
    tokens = non_letters.sub(" ", text).split()
    return " ".join(stemmer.stem(token) for token in tokens if token not in stop_words)

  df['cleaned_reviews'] = df['reviewText'].apply(_stem_and_filter)
  return df

**Main process**


In [7]:
def process_df(df):
  """Run the full preprocessing pipeline on a raw review frame.

  The frame is first passed through ``clean_dataframe`` and the result is
  then handed to ``text_process``; the output of the latter is returned.
  """
  return text_process(clean_dataframe(df))

In [11]:
# Dataset locations under the /content/drive mount point (filesystem paths, despite the URL_ prefix):
# URL_UNCLEAN is the raw input CSV, URL_CLEAN is where the cleaned CSV is written.

URL_UNCLEAN = "/content/drive/MyDrive/Colab Notebooks/Amazon_datset.csv"
URL_CLEAN = "/content/drive/MyDrive/Colab Notebooks/Amazon_datset_cleaned.csv"

In [12]:
# Reads the raw dataset, cleans it and writes the cleaned version to disk.
def main(URL_CLEAN=URL_CLEAN, URL_UNCLEAN=URL_UNCLEAN):
  """Load the raw CSV, preprocess it and save the result as a CSV.

  Args:
    URL_CLEAN: destination path of the cleaned CSV.
    URL_UNCLEAN: path of the raw CSV to read.

  Prints the head of the frame before and after preprocessing; returns None.
  """
  raw_reviews = pd.read_csv(URL_UNCLEAN)
  print("UNCLEANED DATASET HEAD:\n", raw_reviews.head(), "\n")

  cleaned_reviews = process_df(raw_reviews)
  print("CLEANED DATASET HEAD:\n", cleaned_reviews.head(), "\n")

  # Writing the cleaned frame to CSV.
  # NOTE(review): the index is written as well, which shows up as an
  # 'Unnamed: 0' column when the file is read back - confirm whether
  # downstream readers rely on it before switching to index=False.
  cleaned_reviews.to_csv(URL_CLEAN)

In [13]:
# Run the pipeline with the default source and destination paths.
main()

UNCLEANED DATASET HEAD:
    Unnamed: 0  overall  verified        asin  \
0     4359171      5.0      True  B002TLT10I   
1     3938786      5.0      True  B001UQ6E3K   
2     6176764      5.0      True  B005FYNSUA   
3     1546547      5.0      True  B0007ZFLYI   
4     3178737      5.0      True  B0016JMS90   

                                           style      reviewerName  \
0  {'Product Packaging:': ' Standard Packaging'}   Catherine Hynes   
1                        {'Style:': ' 20-60x60'}         rschoepke   
2                        {'Capacity:': ' 16 GB'}              Carl   
3                                            NaN  Scott W. Soyster   
4                                            NaN            Dancer   

                                          reviewText  \
0  Love these things.  The supplied batteries wer...   
1            great value, good range and easy to use   
2    Smallest thumb drive ever! Can't beat the price   
3  I bought this webcam mostly because of