# 모듈 및 데이터 로딩

In [None]:
from typing import Any

import matplotlib.pyplot as plt
# %pip install -r ../requirements.txt
import pandas as pd
import seaborn as sns
from pandas import DataFrame

In [None]:
# Load yelp.csv into a DataFrame, using the file's first column as the row index.
data: DataFrame | Any = pd.read_csv('yelp.csv', index_col=0)

# 데이터 탐색 & Data Cleaning & Feature Engineering

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Print the column names, dtypes and non-null counts.
data.info()

In [None]:
# Show summary statistics for the frame's columns.
data.describe()

In [None]:
# Remove the identifier and date columns that the analysis does not need.
# The frame is modified in place.
data.drop(columns=['review_id', 'user_id', 'business_id', 'date'], inplace=True)

In [None]:
# Add the character length of each review text as a new column.
data['text_length'] = data['text'].apply(len)

In [None]:
# Check the distribution of star ratings (row count per distinct value).
data['stars'].value_counts()

In [None]:
# Bar chart of the number of rows per star rating.
sns.countplot(x='stars', data=data)

In [None]:
# Check the distribution of text lengths (histogram with a KDE curve overlaid).
sns.displot(data['text_length'], kde=True)

In [None]:
# Check pairwise correlations between the columns, leaving out the raw text.
# NOTE(review): on pandas >= 2.0, corr() raises if any remaining column is
# non-numeric; if that applies to this dataset, pass numeric_only=True —
# TODO confirm against the actual columns.
data.drop("text", axis=1).corr()

In [None]:
# Draw the same correlation matrix (recomputed here) as a heatmap.
sns.heatmap(data.drop("text", axis=1).corr(), cmap='coolwarm')

# 불필요한 기호 제거

In [None]:
import string

In [None]:
# 펑션으로 만들어보기
def remove_punc(x):
    """Return *x* with every ASCII punctuation character removed.

    The items of *x* are visited in order; any item found in
    ``string.punctuation`` is dropped and the rest are joined back into a
    single string.
    """
    return ''.join(ch for ch in x if ch not in string.punctuation)

In [None]:
# Preview only: the stripped text is displayed, not assigned back to the frame.
data['text'].apply(remove_punc)

In [None]:
# Same punctuation stripping as remove_punc, written as an inline
# comprehension; this time the result replaces the 'text' column.
data['text'] = data['text'].apply(
    lambda review: ''.join(ch for ch in review if ch not in string.punctuation)
)

In [None]:
# Preview the frame after punctuation removal.
data.head()

# 상투적인 단어 제거하기

In [None]:
from nltk.corpus import stopwords

In [None]:
import nltk

nltk.download('stopwords')

In [None]:
# 펑션으로 만들어보기
def stop_w(x, stop_words=None):
    """Split *x* on whitespace, lower-case the tokens and drop stop words.

    Args:
        x: The text to tokenize.
        stop_words: Optional iterable of lower-case words to remove. When
            omitted, NLTK's English stop-word list is used, so existing
            ``stop_w(text)`` calls behave as before.

    Returns:
        A list of the lower-cased tokens that are not stop words, in their
        original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call: asking NLTK for the list on every token
    # and scanning it linearly is what made the original loop slow.
    blocked = frozenset(stop_words)
    lowered = (token.lower() for token in x.split())
    return [token for token in lowered if token not in blocked]

In [None]:
# data['text'].apply(stop_w)

In [None]:
# Lower-case every token and drop English stop words, written as an inline
# comprehension; the 'text' column becomes a list of tokens per review.
# The stop-word list is fetched once and turned into a set: the previous
# version called stopwords.words('english') for every single token, building
# a fresh list and scanning it linearly each time.
english_stopwords = set(stopwords.words('english'))
data['text'] = data['text'].apply(
    lambda x: [w for w in (t.lower() for t in x.split()) if w not in english_stopwords]
)

In [None]:
# Preview the frame after stop-word removal ('text' now holds token lists).
data.head()

# 단어 등장 빈도 확인하기

In [None]:
# Look at the first few entries of the 'text' column.
data["text"].head()

In [None]:
# Inspect the 'text' value of the first row.
data.iloc[0]["text"]

In [None]:
# Merge the per-review token lists into one flat list of words.
# Iterating the column directly keeps the row order and avoids building a
# full row Series through data.iloc[i] for every review.
word_split = [word for tokens in data['text'] for word in tokens]


In [None]:
# Display the flattened word list.
word_split

In [None]:
# Total number of words across all reviews.
len(word_split)

In [None]:
from nltk.probability import FreqDist

In [None]:
# Plot the frequencies of the 50 most common words on a wide figure.
plt.figure(figsize=(20,10))
FreqDist(word_split).plot(50)

# Word Cloud 만들기

In [None]:
# https://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud
# !python -m pip install ../wordcloud-1.8.1-cp311-cp311-win_amd64.whl
from wordcloud import WordCloud

In [None]:
# Build the word cloud from every token of every review.
# str(data['text']) must not be used as the corpus: it is only the Series'
# display repr, which pandas truncates to a few head/tail rows and which also
# contains index labels and the "Name/dtype" footer.
all_text = ' '.join(word for tokens in data['text'] for word in tokens)
wc = WordCloud().generate(all_text)
plt.figure(figsize=(10, 5))
plt.imshow(wc)
plt.axis('off')

In [None]:
# Re-check how many reviews exist per star rating before splitting by rating.
data['stars'].value_counts()

# 1점과 5점 Text에 대해 각각 Word Cloud 만들기

In [None]:
# Select the 'text' column separately for 5-star and 1-star reviews.
good = data.loc[data['stars'] == 5, 'text']
bad = data.loc[data['stars'] == 1, 'text']

In [None]:
# Word cloud for the 5-star reviews, built from all of their tokens.
# str(good) would only give the truncated display repr of the Series (a few
# head/tail rows plus index labels and the dtype footer), not the full text.
good_text = ' '.join(word for tokens in good for word in tokens)
wc = WordCloud().generate(good_text)
plt.figure(figsize=(10, 5))
plt.imshow(wc)
plt.axis('off')

In [None]:
# Word cloud for the 1-star reviews, built from all of their tokens.
# str(bad) would only give the truncated display repr of the Series (a few
# head/tail rows plus index labels and the dtype footer), not the full text.
bad_text = ' '.join(word for tokens in bad for word in tokens)
wc = WordCloud().generate(bad_text)
plt.figure(figsize=(10, 5))
plt.imshow(wc)
plt.axis('off')
# Total number of words across all reviews.
len(word_split)

In [None]:
# Display the flattened word list again.
word_split

In [None]:
%pip install pyperclip

In [None]:
import pyperclip as cb

In [None]:
# Copy the word list's string representation to the system clipboard via pyperclip.
cb.copy(str(word_split))

In [None]:
# Show the exact string that was copied to the clipboard.
str(word_split)

# 참고할 만한 사이트