<a href="https://colab.research.google.com/github/YS-JEOUNG/amazon-review-project/blob/main/final_1_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
# 기본
import pandas as pd
import numpy as np

# 파일 오픈
import gzip
import json

# datetime
from datetime import datetime

# Warnings       ###### 왜 넣지?
import warnings
from scipy import stats
warnings.filterwarnings('ignore')

# Data

In [None]:
def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: Location of a gzip file whose lines are each a JSON document.

    Yields:
        The object returned by ``json.loads`` for each line, in file order.
    """
    # The context manager closes the file handle once the generator is
    # exhausted or closed, instead of leaving it open.
    with gzip.open(path, 'rb') as handle:
        for line in handle:
            yield json.loads(line)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, and that
    position becomes the row index of the returned frame.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

In [None]:
# Load the review records (gzip-compressed JSON lines) into a DataFrame.
df = getDF('/content/drive/MyDrive/Colab Notebooks/project/data/Arts_Crafts_and_Sewing.json.gz')

In [None]:
# Load the product metadata records (gzip-compressed JSON lines) into a DataFrame.
metadf = getDF('/content/drive/MyDrive/Colab Notebooks/project/data/meta_Arts_Crafts_and_Sewing.json.gz')

In [None]:
# Left-join the metadata onto the reviews by `asin`, so every review row is kept.
# NOTE(review): if `metadf` holds repeated `asin` values this join would
# duplicate review rows — confirm the metadata is unique per `asin`.
total = pd.merge(df, metadf, how='left', on='asin')

In [None]:
# Show the merged frame's columns and dtypes.
total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2911389 entries, 0 to 2911388
Data columns (total 29 columns):
 #   Column          Dtype  
---  ------          -----  
 0   overall         float64
 1   vote            object 
 2   verified        bool   
 3   reviewTime      object 
 4   reviewerID      object 
 5   asin            object 
 6   style           object 
 7   reviewerName    object 
 8   reviewText      object 
 9   summary         object 
 10  unixReviewTime  int64  
 11  image_x         object 
 12  category        object 
 13  tech1           object 
 14  description     object 
 15  fit             object 
 16  title           object 
 17  also_buy        object 
 18  image_y         object 
 19  tech2           object 
 20  brand           object 
 21  feature         object 
 22  rank            object 
 23  also_view       object 
 24  details         object 
 25  main_cat        object 
 26  similar_item    object 
 27  date            object 
 28  price       

In [None]:
# Drop the columns that are not needed for the rest of the analysis.
total = total.drop(['style', 'image_x', 'category', 'tech1', 'description', 'fit', 'also_buy', 'image_y', 'date',
                    'tech2', 'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat', 'similar_item'], axis=1)

In [None]:
# Count the null values in each column.
total.isnull().sum()

overall                 0
vote              2534349
verified                0
reviewTime              0
reviewerID              0
asin                    0
reviewerName          201
reviewText           2544
summary               965
unixReviewTime          0
title                8668
price                8668
dtype: int64

In [None]:
# The product name matters for this analysis, so drop rows whose `title` is null.
total = total.dropna(subset=['title'])

In [None]:
# Re-check the per-column null counts after dropping rows without a title.
total.isnull().sum()

overall                 0
vote              2527428
verified                0
reviewTime              0
reviewerID              0
asin                    0
reviewerName          201
reviewText           2540
summary               963
unixReviewTime          0
title                   0
price                   0
dtype: int64

In [None]:
# `vote` is the number of helpful votes on a review; treat a missing value as 0.
# NOTE(review): `vote` is an object-dtype column, so filling with the integer 0
# may leave a mix of types in it — confirm and convert to a numeric dtype if so.
total['vote'] = total['vote'].fillna(0)

In [None]:
# Re-check the per-column null counts after filling `vote`.
total.isnull().sum()

overall              0
vote                 0
verified             0
reviewTime           0
reviewerID           0
asin                 0
reviewerName       201
reviewText        2540
summary            963
unixReviewTime       0
title                0
price                0
dtype: int64

In [None]:
# Drop the rows that still have a null reviewer name, review text or summary.
total = total.dropna(subset=['reviewerName', 'reviewText', 'summary'])

In [None]:
# Final check of the per-column null counts.
total.isnull().sum()

overall           0
vote              0
verified          0
reviewTime        0
reviewerID        0
asin              0
reviewerName      0
reviewText        0
summary           0
unixReviewTime    0
title             0
price             0
dtype: int64

In [None]:
# Combine `summary` and `reviewText` into a single `review_text` column.
def _join_review_parts(row):
    """Join the row's values with a space, skipping any whose string form is 'nan'."""
    as_text = [str(value) for value in row]
    return ' '.join(part for part in as_text if part != 'nan')

total['review_text'] = total[['summary', 'reviewText']].apply(_join_review_parts, axis=1)

In [None]:
# `summary` and `reviewText` are now folded into `review_text`, so drop them.
total = total.drop(['summary', 'reviewText'], axis=1)

In [None]:
# Spot-check that `review_text` was built correctly (row with index label 2041).
total['review_text'][2041]

'Five Stars Great quality product, and top notch service from the seller!'

In [None]:
# Add `rating_class`: 'bad' when the star rating (`overall`) is below 3,
# otherwise 'good'.
is_below_three = total['overall'].lt(3)
total['rating_class'] = is_below_three.map({True: 'bad', False: 'good'})
total.head(3)

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,unixReviewTime,title,price,review_text,rating_class
2041,5.0,0,True,"03 13, 2017",A3DKYRF9YGF6A6,6665560953,Amazon Customer,1489363200,You Son of a Bitch! 1987 Embroidered Patch,$6.41,"Five Stars Great quality product, and top notc...",good
2042,5.0,0,True,"02 6, 2017",A1I2TJ3PQ4WHK1,6665560953,Eric Breneman,1486339200,You Son of a Bitch! 1987 Embroidered Patch,$6.41,don't push too many pencils details on this pa...,good
2043,5.0,0,True,"12 23, 2016",A1ZDJ2BYZXZHLQ,6665560953,Kirkja,1482451200,You Son of a Bitch! 1987 Embroidered Patch,$6.41,Great deal if you buy in bulk! Bought these as...,good


In [None]:
# Drop duplicated reviews, keeping the first occurrence of each.
# A review is identified by product (`asin`), reviewer account (`reviewerID`)
# and review timestamp. The reviewer's display name is deliberately not used as
# a key: display names are not unique (e.g. 'Amazon Customer' in the rows shown
# above), so keying on the name would discard distinct reviews written by
# different accounts for the same product at the same timestamp.
total = total.drop_duplicates(['asin', 'reviewerID', 'unixReviewTime'], keep='first')

In [None]:
# Build a datetime `time` column from `reviewTime` (comma stripped, then parsed
# as month/day/year) and drop the original text column.
review_time_text = total['reviewTime'].str.replace(',', '')
total['time'] = pd.to_datetime(review_time_text, format='%m %d %Y')
total = total.drop(columns='reviewTime')
total.head(3)

Unnamed: 0,overall,vote,verified,reviewerID,asin,reviewerName,unixReviewTime,title,price,review_text,rating_class,time
2041,5.0,0,True,A3DKYRF9YGF6A6,6665560953,Amazon Customer,1489363200,You Son of a Bitch! 1987 Embroidered Patch,$6.41,"Five Stars Great quality product, and top notc...",good,2017-03-13
2042,5.0,0,True,A1I2TJ3PQ4WHK1,6665560953,Eric Breneman,1486339200,You Son of a Bitch! 1987 Embroidered Patch,$6.41,don't push too many pencils details on this pa...,good,2017-02-06
2043,5.0,0,True,A1ZDJ2BYZXZHLQ,6665560953,Kirkja,1482451200,You Son of a Bitch! 1987 Embroidered Patch,$6.41,Great deal if you buy in bulk! Bought these as...,good,2016-12-23


In [None]:
# Rename columns by name rather than by position, so a change in column order
# (or an extra column) cannot silently attach the wrong label to a column.
# Columns not listed here (vote, verified, price, review_text, rating_class,
# time) keep their names.
total = total.rename(columns={
    'overall': 'ratings',
    'reviewerID': 'reviewer_id',
    'asin': 'prod_id',
    'reviewerName': 'reviewer_name',
    'unixReviewTime': 'unix_review_time',
    'title': 'prod_name',
})

In [None]:
# Preview the first rows to confirm the new column names.
total.head(3)

Unnamed: 0,ratings,vote,verified,reviewer_id,prod_id,reviewer_name,unix_review_time,prod_name,price,review_text,rating_class,time
2041,5.0,0,True,A3DKYRF9YGF6A6,6665560953,Amazon Customer,1489363200,You Son of a Bitch! 1987 Embroidered Patch,$6.41,"Five Stars Great quality product, and top notc...",good,2017-03-13
2042,5.0,0,True,A1I2TJ3PQ4WHK1,6665560953,Eric Breneman,1486339200,You Son of a Bitch! 1987 Embroidered Patch,$6.41,don't push too many pencils details on this pa...,good,2017-02-06
2043,5.0,0,True,A1ZDJ2BYZXZHLQ,6665560953,Kirkja,1482451200,You Son of a Bitch! 1987 Embroidered Patch,$6.41,Great deal if you buy in bulk! Bought these as...,good,2016-12-23


In [None]:
# Save the cleaned frame to CSV; the row index is not written (index=False).
total.to_csv('/content/drive/MyDrive/Colab Notebooks/project/final_data.csv', sep=',', encoding='utf-8', index=False)

KeyboardInterrupt: ignored

# 텍스트 전처리

In [None]:
# Reload the saved CSV. No `parse_dates` argument is passed here.
# NOTE(review): `time` will therefore presumably come back as text rather than
# datetime — confirm if later steps need real datetimes.
total = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/project/final_data.csv')

## Import

In [None]:
import re
import urllib.request       # 왜 넣는걸까

# 시각화
%matplotlib inline
import matplotlib.pyplot as plt

# nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk import word_tokenize, sent_tokenize, regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer, PorterStemmer

import string, unicodedata
from bs4 import BeautifulSoup

!pip install contractions
from contractions import contractions_dict

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Collecting contractions
  Downloading https://files.pythonhosted.org/packages/0a/04/d5e0bb9f2cef5d15616ebf68087a725c5dbdd71bd422bcfb35d709f98ce7/contractions-0.0.48-py2.py3-none-any.whl
Collecting textsearch>=0.0.21
  Downloading https://files.pythonhosted.org/packages/d3/fe/021d7d76961b5ceb9f8d022c4138461d83beff36c3938dc424586085e559/textsearch-0.0.21-py2.py3-none-any.whl
Collecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/7f/c2/eae730037ae1cbbfaa229d27030d1d5e34a1e41114b21447d1202ae9c220/pyahocorasick-1.4.2.tar.gz (321kB)
[K     |████████████████████████████████| 327kB 8.7MB/s 
[?25hCollecting anyascii
[?25l  Downloading https://f

In [None]:
# Check the balance between positive ('good') and negative ('bad') reviews.
total['rating_class'].value_counts()

good    2407786
bad      315538
Name: rating_class, dtype: int64

## 전처리

In [None]:
# Display the raw review text before preprocessing.
total['review_text']

2041       Five Stars Great quality product, and top notc...
2042       don't push too many pencils details on this pa...
2043       Great deal if you buy in bulk! Bought these as...
2044       It's made extremely well and looks GREAT! I've...
2045       Nice and perfect for an address book Thin, you...
                                 ...                        
2911384    ... took weeks to arrive and ran out of ink pr...
2911385    Great product!! Used these on our wedding canv...
2911386                            Five Stars LOVE LOVE LOVE
2911387    One Star color won't stop running once you act...
2911388    One color I received these earlier than expect...
Name: review_text, Length: 2723324, dtype: object

In [None]:
def strip_html(text):
    """Return the text content of ``text`` with HTML markup removed.

    ``text`` is parsed with BeautifulSoup's ``html.parser`` and the result of
    ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    """Delete every ``[...]`` span, brackets included, from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each run from ``[`` to the next ``]`` removed.
    """
    # Raw string: the backslash-bracket escapes are meant for the regex engine,
    # and are not valid escape sequences in an ordinary string literal.
    return re.sub(r'\[[^]]*\]', '', text)

def denoise_text(text):
    """Strip HTML markup from ``text``, then remove every ``[...]`` span."""
    without_html = strip_html(text)
    return remove_between_square_brackets(without_html)


# Define function to expand contractions

# Cached (pattern, lowercase lookup) for the default ``contractions_dict``;
# built on first use so the alternation is not rebuilt for every review.
_DEFAULT_CONTRACTION_TABLE = None


def _build_contraction_table(mapping):
    """Return ``(compiled pattern or None, lowercase-key lookup)`` for ``mapping``."""
    lookup = {}
    for key, value in mapping.items():
        lookup.setdefault(key.lower(), value)
    keys = [key for key in mapping if key]
    if not keys:
        return None, lookup
    # Escape the keys so regex metacharacters match literally, and try longer
    # keys first so e.g. "can't've" is not consumed as "can't" followed by "'ve".
    keys.sort(key=len, reverse=True)
    alternatives = '|'.join(re.escape(key) for key in keys)
    # IGNORECASE: match regardless of letter case; DOTALL: '.' also matches newlines.
    pattern = re.compile('({})'.format(alternatives), flags=re.IGNORECASE | re.DOTALL)
    return pattern, lookup


def expand_contractions(text, contraction_map=None):
    """Expand contractions in ``text`` and then remove all apostrophes.

    Each case-insensitive occurrence of a key of the contraction mapping is
    replaced by its expansion. The first character of the matched text is
    kept, so the original capitalisation of the first letter survives.

    Args:
        text: The string to process. A non-string value is returned unchanged.
        contraction_map: Optional ``{contraction: expansion}`` mapping.
            Defaults to the module-level ``contractions_dict``.

    Returns:
        The expanded text with every ``'`` removed, or ``text`` itself when it
        is not a string.
    """
    global _DEFAULT_CONTRACTION_TABLE
    if not isinstance(text, str):
        return text

    if contraction_map is None:
        if _DEFAULT_CONTRACTION_TABLE is None:
            _DEFAULT_CONTRACTION_TABLE = _build_contraction_table(contractions_dict)
        pattern, lookup = _DEFAULT_CONTRACTION_TABLE
        exact = contractions_dict
    else:
        pattern, lookup = _build_contraction_table(contraction_map)
        exact = contraction_map

    def expand_match(found):
        matched = found.group(0)
        # Prefer the exact-case entry, then fall back to a case-insensitive one.
        expansion = exact.get(matched) or lookup.get(matched.lower())
        if not expansion:
            # No usable expansion: leave this match alone rather than abandon
            # the expansion of the whole text.
            return matched
        return matched[0] + expansion[1:]

    expanded_text = pattern.sub(expand_match, text) if pattern is not None else text
    return expanded_text.replace("'", "")


# special_characters removal
def remove_special_characters(text, remove_digits=True):
    """Remove every character that is not an ASCII letter or whitespace.

    Args:
        text: The string to clean.
        remove_digits: When True (the default) digits are removed as well;
            when False the digits 0-9 are kept.

    Returns:
        ``text`` with the disallowed characters deleted.
    """
    # The upper-case range must be A-Z: a range ending in lower-case 'z' also
    # admits the characters [ \ ] ^ _ and the backtick, which sit between 'Z'
    # and 'a' in ASCII.
    if remove_digits:
        pattern = r'[^a-zA-Z\s]'
    else:
        pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)


def remove_non_ascii(words):
    """Return the tokens in ``words`` with non-ASCII characters removed.

    Each token is NFKD-normalised first, so an accented letter is reduced to
    its base letter; whatever still cannot be encoded as ASCII is dropped.
    """
    def _to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_to_ascii(token) for token in words]

def to_lowercase(words):
    """Return a new list holding the lower-cased form of each token in ``words``."""
    return [token.lower() for token in words]


def remove_punctuation_and_splchars(words):
    """Strip punctuation and special characters from each token.

    Every token first loses all characters that are neither word characters
    nor whitespace, and is then passed through
    ``remove_special_characters(token, True)``. Tokens that end up empty are
    dropped.

    Args:
        words: Iterable of string tokens.

    Returns:
        A list of the cleaned, non-empty tokens in their original order.
    """
    cleaned = []
    for word in words:
        stripped = re.sub(r'[^\w\s]', '', word)
        if not stripped:
            continue
        stripped = remove_special_characters(stripped, True)
        # Check again: the second pass can empty a token that survived the
        # first one, and an empty token must not be kept.
        if stripped:
            cleaned.append(stripped)
    return cleaned

def replace_numbers(words):
    """Replace each all-digit token in ``words`` with its spelled-out form.

    Tokens for which ``str.isdigit()`` is false are passed through unchanged.
    """
    # NOTE(review): no `import inflect` is visible in this notebook — confirm
    # that it is imported before this function is called.
    p = inflect.engine()
    return [p.number_to_words(token) if token.isdigit() else token for token in words]

# Start from NLTK's English stop-word list, then take the negations 'no' and
# 'not' out of it so that remove_stopwords() leaves them in the text.
stopword_list= stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
# The removals below are commented out and therefore have no effect.
#stopword_list.remove('headphone')
#stopword_list.remove('headphones')
#stopword_list.remove('earbuds')
#stopword_list.remove('bud')
#stopword_list.remove('ear')
#stopword_list.remove('sony')
#stopword_list.remove('product')


def remove_stopwords(words, stop_words=None):
    """Return the tokens in ``words`` that are not stop words.

    Args:
        words: Iterable of string tokens.
        stop_words: Optional iterable of words to remove. Defaults to the
            module-level ``stopword_list``.

    Returns:
        A list of the tokens that are not in the stop-word collection, in
        their original order.
    """
    # Build a set once per call so each membership test is O(1) instead of a
    # scan over the whole stop-word list for every token.
    blocked = frozenset(stopword_list if stop_words is None else stop_words)
    return [word for word in words if word not in blocked]

# Stemming
def stem_words(words):
    """Return the Lancaster stem of each token in ``words``, in order."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

# Lemmatization
def lemmatize_verbs(words):
    """Return the WordNet lemma of each token in ``words``, treating it as a verb."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

In [None]:
def normalize(words):
    """Run the token clean-up steps over ``words`` and return the result.

    The steps run in this order: non-ASCII removal, lower-casing,
    punctuation/special-character removal, stop-word removal.
    """
    steps = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation_and_splchars,
        remove_stopwords,
    )
    for step in steps:
        words = step(words)
    return words

def lemmatize(words):
    """Return the verb lemmas of ``words`` (delegates to ``lemmatize_verbs``)."""
    return lemmatize_verbs(words)

In [None]:
def normalize_and_lemmatize(input):
    """Clean one review text and return it as a space-joined string of lemmas.

    The text is denoised, has its contractions expanded and its special
    characters removed, and is then tokenised, normalised and lemmatised.

    Args:
        input: The raw review text. (The name shadows the builtin; it is kept
            so existing keyword callers keep working.)

    Returns:
        The processed tokens joined by single spaces, or ``''`` when ``input``
        is not a string.
    """
    # A missing value (e.g. NaN after reading the CSV back) is not a string and
    # would make the text helpers raise, aborting the whole column map.
    if not isinstance(input, str):
        return ''
    sample = denoise_text(input)
    sample = expand_contractions(sample)
    sample = remove_special_characters(sample)
    tokens = nltk.word_tokenize(sample)
    return ' '.join(lemmatize(normalize(tokens)))

In [None]:
# Apply the full text clean-up to every review and store it as `clean_text`.
total['clean_text'] = total['review_text'].map(normalize_and_lemmatize)

In [None]:
# Save the frame, now including `clean_text`, to CSV; the row index is not written.
total.to_csv('/content/drive/MyDrive/Colab Notebooks/project/final_data_clean.csv', sep=',', encoding='utf-8', index=False)