<a href="https://colab.research.google.com/github/avinash-vk/Sentiment-analysis-on-amazon-reviews/blob/main/datacleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing libraries**

In [None]:
#imports

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import nltk
import re
import string
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Dataframe preprocessing**

In [None]:
# Maps a review rating to a sentiment class: positive (2) for 4-5 stars, neutral (1) for 3 stars, and negative (0) for anything else.
def classify(x):
    """Map a review rating to a sentiment class label.

    Returns 2 (positive) when the rating equals 4.0 or 5.0, 1 (neutral) when
    it equals 3.0, and 0 (negative) for every other value.
    """
    if x in (4.0, 5.0):
        return 2
    return 1 if x == 3.0 else 0

In [None]:
def clean_dataframe(df):
  """Return a labelled, trimmed copy of the raw reviews frame.

  The input frame is left untouched; all steps operate on new frames.

  Steps:
    1. Add the output variable ``y`` by applying ``classify`` to ``overall``.
    2. Drop the columns that are not needed for the analysis.
    3. Drop every row whose ``reviewText`` is NaN.

  Args:
    df: DataFrame containing at least ``overall``, ``reviewText`` and all of
      the columns listed in ``unused_columns`` below.

  Returns:
    A new DataFrame with the ``y`` column added, the unused columns removed
    and the rows without review text removed.

  Raises:
    KeyError: if ``overall``, ``reviewText`` or any column to drop is missing.
  """
  # columns that are unnecessary for the analysis
  unused_columns = ['Unnamed: 0', 'verified', 'asin', 'style', 'reviewerName',
                    'description', 'title', 'rank', 'main_cat']

  return (
      # assign() builds a new frame, so the caller's frame does not silently
      # gain a 'y' column as a side effect of cleaning.
      df.assign(y=df['overall'].apply(classify))
        .drop(columns=unused_columns)
        .dropna(axis=0, subset=['reviewText'])
  )

**Text Preprocessing**

In [None]:
# Removes every punctuation character from a reviewText string, i.e. any of '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
# Translation table that deletes every character found in string.punctuation.
# Built once so it is not recreated for every review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    All other characters (letters, digits, whitespace, non-ASCII symbols) are
    kept unchanged and in their original order.

    Args:
        text: the string to strip punctuation from.

    Returns:
        A new string without any ASCII punctuation characters.
    """
    # str.translate does the filtering in a single pass instead of testing
    # each character against the punctuation string in a Python-level loop.
    return text.translate(_PUNCTUATION_TABLE)

In [None]:
def text_process(df):
  """Normalise the review text and derive a stemmed, stop-word-free column.

  The input frame is left untouched; a new frame is returned in which

  * ``reviewText`` is lower-cased with all punctuation removed, and
  * ``cleaned_reviews`` holds the space-joined Porter stems of the words of
    the normalised text that are not English stop words.

  Args:
    df: DataFrame with a ``reviewText`` column of strings (no NaN values).

  Returns:
    A new DataFrame with ``reviewText`` replaced and ``cleaned_reviews`` added.
  """
  stemmer = PorterStemmer()
  # stop words are the words like "the, I, our etc"; kept in a set so the
  # membership test for every token is a hash lookup rather than a list scan
  stop_words = set(stopwords.words("english"))

  def _stem_without_stop_words(text):
    # Replace everything that is not an ASCII letter with a space, split on
    # whitespace, drop stop words and stem what remains. The text is already
    # lower-case at this point, so the tokens are lower-case ASCII words.
    tokens = re.sub("[^a-zA-Z]", " ", text).split()
    return " ".join(stemmer.stem(token) for token in tokens if token not in stop_words)

  # Removing all the punctuations from the words, and changing the words to
  # lower case to maintain uniformity.
  # NOTE(review): apostrophes are stripped before the stop-word check, so a
  # contraction such as "she's" becomes "shes" and presumably no longer
  # matches the stop-word list - confirm whether this is intended.
  normalized = df['reviewText'].apply(lambda x: remove_punctuation(x.lower()))

  # assign() returns a new frame instead of overwriting the caller's columns.
  return df.assign(
      reviewText=normalized,
      cleaned_reviews=normalized.apply(_stem_without_stop_words),
  )

**Main process**


In [None]:
def process_df(df):
  """Run the complete preprocessing pipeline on a raw reviews frame.

  The frame is first passed through ``clean_dataframe`` and the result is then
  passed through ``text_process``; the outcome of that second step is returned.
  """
  return text_process(clean_dataframe(df))

In [None]:
# Dataset locations used as the defaults of main().

# Source: CSV file holding the raw (uncleaned) reviews.
URL_UNCLEAN: str = "/content/drive/MyDrive/datasets fragmented/coldropDS-50k.csv"

# Destination: CSV file the cleaned reviews are written to.
URL_CLEAN: str = "/content/drive/MyDrive/datasets fragmented/coldropDS-50k_cleaned.csv"

In [None]:
# Reads the raw dataset, cleans it, and writes the cleaned version to a separate CSV file.
def main(URL_CLEAN=URL_CLEAN, URL_UNCLEAN=URL_UNCLEAN):
  """Read the raw reviews CSV, clean it and write the result to a new CSV.

  Args:
    URL_CLEAN: path the cleaned dataset is written to.
    URL_UNCLEAN: path of the raw dataset to read.

  The head of the frame is printed before and after processing; nothing is
  returned.
  """
  raw_reviews = pd.read_csv(URL_UNCLEAN)
  print("UNCLEANED DATASET HEAD:\n", raw_reviews.head(), "\n")

  processed_reviews = process_df(raw_reviews)
  print("CLEANED DATASET HEAD:\n", processed_reviews.head(), "\n")

  # Write the cleaned frame to the destination CSV. No options are passed, so
  # pandas' to_csv defaults apply (the index is written as the first column).
  processed_reviews.to_csv(URL_CLEAN)

In [None]:
# Run the pipeline with the default source (URL_UNCLEAN) and destination (URL_CLEAN) paths.
main()

UNCLEANED DATASET HEAD:
    Unnamed: 0  ...              main_cat
0     4359171  ...  Home Audio & Theater
1     3938786  ...        Camera & Photo
2     6176764  ...       All Electronics
3     1546547  ...       All Electronics
4     3178737  ...       All Electronics

[5 rows x 12 columns] 

CLEANED DATASET HEAD:
    overall  ...                                    cleaned_reviews
0      5.0  ...              love thing suppli batteri dead replac
1      5.0  ...                      great valu good rang easi use
2      5.0  ...          smallest thumb drive ever cant beat price
3      5.0  ...  bought webcam mostli price fact logitech prett...
4      5.0  ...  got mom she tech savvi set she problem one tim...

[5 rows x 5 columns] 

