In [None]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from glob import glob
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

# 머신러닝 파트

- 입력값 : 리뷰길이, 첨부사진 개수, inference 결과값


In [77]:
# load data
category_name = '스포츠의류'

df = pd.read_csv('data/review_label/review_label_{}.csv'.format(category_name))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20195 entries, 0 to 20194
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         20195 non-null  int64  
 1   review             20195 non-null  object 
 2   review_headline    8996 non-null   object 
 3   reviewer           20194 non-null  object 
 4   prod_name          20195 non-null  object 
 5   prod_id            20195 non-null  int64  
 6   prod_link          20195 non-null  object 
 7   prod_star_score    20195 non-null  int64  
 8   prod_reg_date      20195 non-null  object 
 9   attach_count       20195 non-null  int64  
 10  help_count         20195 non-null  int64  
 11  big_category_name  20195 non-null  object 
 12  category_name      20195 non-null  object 
 13  category_id        20195 non-null  int64  
 14  category_link      0 non-null      float64
 15  label              20195 non-null  int64  
dtypes: float64(1), int64(7

### data manipulation

In [78]:
# data manipulation - 제품명, 리뷰 합치기
df['input'] = '제품명 ' + df['prod_name'].fillna('') + ' ' + '리뷰 ' + df['review']

In [79]:
# data manipulation - 리뷰길이 구하기
review_length = []
for i in df['review']:
    length = len(i)
    review_length.append(length)
df['length'] = review_length

In [80]:
# data manipulation - 학습한 모델로 inference하여 확률값 추출

# GPU 사용여부 확인
print(torch.cuda.is_available())

# hyperparameter settings
BATCH_SIZE = 32
MAX_LEN = 256

# load model
category_name = '팬티'
model_path = glob(r'C:\Users\Aytekin\Desktop\review_reliability\{}_model\results\*.pt'.format(category_name))[0]

model_name = "beomi/kcELECTRA-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = torch.load(model_path)

device = torch.device('cuda')
model.to(device)

class TestDataset(Dataset):
    def __init__(self, df):
        self.df_data = df
    def __getitem__(self, index):
        # get the sentence from the dataframe
        sentence = self.df_data.loc[index, 'input']
        encoded_dict = tokenizer(
          text = sentence,
          add_special_tokens = True, 
          max_length = MAX_LEN,
          pad_to_max_length = True,
          truncation=True,           # Pad & truncate all sentences.
          return_tensors="pt")

        padded_token_list = encoded_dict['input_ids'][0]
        token_type_id = encoded_dict['token_type_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]
        sample = (padded_token_list, token_type_id , att_mask)
        return sample
    def __len__(self):
        return len(self.df_data)

data = TestDataset(df)

dataloader = DataLoader(
    data,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0
    )

# inference
print('>> inference start <<')    
preds = [] 
model.eval()
torch.set_grad_enabled(False)
for batch_id, (input_id,token_type_id,attention_mask) in enumerate(tqdm(dataloader)):
    input_id = input_id.long().to(device)
    token_type_id = token_type_id.long().to(device)
    attention_mask = attention_mask.long().to(device)
    outputs = model(input_ids=input_id, token_type_ids=token_type_id, attention_mask=attention_mask)
    out = outputs[0]
    for inp in out:
          preds.append(inp.detach().cpu().numpy())
Preds = np.array(preds)

first = []
second = []
for i in Preds:
    first.append(i[0]) 
    second.append(i[1]) 
df['first_%'] = first
df['second_%'] = second

df['pred'] = np.argmax(Preds, axis=1)

True
>> inference start <<


100%|██████████| 632/632 [03:02<00:00,  3.47it/s]


In [85]:
# 확인
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20195 entries, 0 to 20194
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         20195 non-null  int64  
 1   review             20195 non-null  object 
 2   review_headline    8996 non-null   object 
 3   reviewer           20194 non-null  object 
 4   prod_name          20195 non-null  object 
 5   prod_id            20195 non-null  int64  
 6   prod_link          20195 non-null  object 
 7   prod_star_score    20195 non-null  int64  
 8   prod_reg_date      20195 non-null  object 
 9   attach_count       20195 non-null  int64  
 10  help_count         20195 non-null  int64  
 11  big_category_name  20195 non-null  object 
 12  category_name      20195 non-null  object 
 13  category_id        20195 non-null  int64  
 14  category_link      0 non-null      float64
 15  label              20195 non-null  int64  
 16  input              201

Unnamed: 0.1,Unnamed: 0,review,review_headline,reviewer,prod_name,prod_id,prod_link,prod_star_score,prod_reg_date,attach_count,...,big_category_name,category_name,category_id,category_link,label,input,length,first_%,second_%,pred
0,0,캐럿 여성용 후드 플리스 자켓 L사이즈블랙 컬러 후기에요 ~평소에 캐럿 옷을 굉장...,따뜻한데 돌돌이질 필수~,트위스트킹,"캐럿 여성용 후드 플리스 자켓, L, 블랙",1592283662,https://www.coupang.com/vp/products/1592283662,4,2023.01.05,8,...,쿠팡 홈여성패션의류,후드집업/집업류,498724,,1,"제품명 캐럿 여성용 후드 플리스 자켓, L, 블랙 리뷰 캐럿 여성용 후드 플리스 자...",677,-0.671256,0.694225,1
1,1,스펙 : 160cm / 53키로주문한 제품 : S 블랙제조 : 2021.10 인도네...,따뜻해요. 추운날씨에 딱이에요. 가성비 굿.,전세계평화,"캐럿 여성용 후드 플리스 자켓, S, 블랙",1592283662,https://www.coupang.com/vp/products/1592283662,5,2022.10.14,10,...,쿠팡 홈여성패션의류,후드집업/집업류,498724,,1,"제품명 캐럿 여성용 후드 플리스 자켓, S, 블랙 리뷰 스펙 : 160cm / 53...",771,-0.926086,0.965431,1
2,2,완전 뚱뚱해 보여요아직 한번도 못입었음흐물거려서 입을수가 없음!!!집안에서만 입을수...,"#핏이 뚱핏,빗바랜 검정색(초라해보임)따뜻하긴함 ~~~",손*여,"캐럿 여성용 후드 플리스 자켓, L, 블랙",1592283662,https://www.coupang.com/vp/products/1592283662,4,2021.02.02,8,...,쿠팡 홈여성패션의류,후드집업/집업류,498724,,1,"제품명 캐럿 여성용 후드 플리스 자켓, L, 블랙 리뷰 완전 뚱뚱해 보여요아직 한번...",1036,-0.913672,0.953251,1
3,3,간단하게 위에 걸칠 외투가 필요해서 구매했어요.후리스는 너무추운겨울을 제외하고는 입...,,해피니스87,"캐럿 여성용 후드 플리스 자켓, M, 아이보리",1592283662,https://www.coupang.com/vp/products/1592283662,3,2021.01.15,10,...,쿠팡 홈여성패션의류,후드집업/집업류,498724,,1,"제품명 캐럿 여성용 후드 플리스 자켓, M, 아이보리 리뷰 간단하게 위에 걸칠 외투...",470,-0.907972,0.965909,1
4,4,보들보들 보드라운 느낌이 넘 좋아요!진짜 가성비 우수 원단이네용ㅎㅎㅎ집에서 보일러 ...,,아리엘s2,"캐럿 여성용 후드 플리스 자켓, S, 블랙",1592283662,https://www.coupang.com/vp/products/1592283662,5,2020.12.30,5,...,쿠팡 홈여성패션의류,후드집업/집업류,498724,,1,"제품명 캐럿 여성용 후드 플리스 자켓, S, 블랙 리뷰 보들보들 보드라운 느낌이 넘...",440,-1.064534,1.102391,1


### data split

In [92]:
# 데이터분리
data = df[['length']]
target = df['label']

x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size = 0.2,
    shuffle = True,
    random_state = 42
)

In [69]:
print('num total review :',df.shape[0])
print('len(x_train) :',len(x_train))
print('len(y_train) :',len(y_train))
print('len(x_test) :',len(x_test))
print('len(y_test) :',len(y_test))
print(x_train.head(3))
print(y_train.head(3))
print('-=-'*30,'\n')

num total review : 20195
len(x_train) : 12924
len(y_train) : 12924
len(x_val) : 3232
len(y_val) : 3232
len(x_test) : 4039
len(y_test) : 4039
       attach_count  length
6207              0      33
11575             0      10
7555              1     232
6207     1
11575    0
7555     1
Name: label, dtype: int64
-=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=--=- 



### modeling
- RogisticRegression
- KNN
- DecisionTree
- RandomForest
- SVM
- naive bayse

#### Pipeline만들기 -> 학습

In [133]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

# ['review_headline', 'prod_star_score', 'attach_count', 'input', 'length', 'first_%', 'second_%', 'pred']

# 데이터분리
data = df[['length','attach_count','first_%','second_%','prod_star_score']]
target = df['label']

x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size = 0.2,
    shuffle = True,
    random_state = 42
)
model_pipeline = []
model_pipeline.append(LogisticRegression(solver='liblinear', random_state=42))
model_pipeline.append(SVC(random_state=42))
model_pipeline.append(SVC(kernel='sigmoid', random_state=42))
model_pipeline.append(SVC(kernel='linear', random_state=42))
model_pipeline.append(SVC(kernel='poly', random_state=42))
model_pipeline.append(KNeighborsClassifier())
model_pipeline.append(DecisionTreeClassifier(random_state=42))
model_pipeline.append(RandomForestClassifier(random_state=42))
model_pipeline.append(GaussianNB())
model_pipeline.append(SGDClassifier(loss='perceptron', penalty='l2', alpha=1e-4, random_state=42, max_iter=100))

model_list = ['Logistic Regression','SVM_rbf','SVM_sigmoid','SVM_linear','SVM_poly','KNN','Dicision Tree','Random Forest','Naive Bayes','SGDClassifier_perceptron','KcELECTRA']
acc_list = []
auc_list = []
cm_list = []

for model in model_pipeline:
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    acc_list.append(metrics.accuracy_score(y_test,y_pred))
    fpr, tpr ,_thresholds = metrics.roc_curve(y_test, y_pred)
    auc_list.append(round(metrics.auc(fpr, tpr),2))
    cm_list.append(confusion_matrix(y_test, y_pred))

acc_Kc = metrics.accuracy_score(y_test,df['pred'])
fpr, tpr ,_thresholds = metrics.roc_curve(y_test, df['pred'])
auc_Kc = round(metrics.auc(fpr, tpr),2)
acc_list.append(acc_Kc)
auc_list.append(auc_Kc)

### 결과

In [134]:
# accuracy and AUC with ['attach_count', 'length', 'first_%', 'second_%']
# random_state = 42
print('input columns :', x_train.columns)
result_df = pd.DataFrame({'Model':model_list, "Accuracy":acc_list, 'AUC':auc_list})
result_df

input columns : Index(['length', 'attach_count', 'first_%', 'second_%', 'prod_star_score'], dtype='object')


Unnamed: 0,Model,Accuracy,AUC
0,Logistic Regression,0.700916,0.7
1,SVM_rbf,0.699678,0.7
2,SVM_sigmoid,0.538995,0.54
3,SVM_linear,0.694479,0.7
4,SVM_poly,0.599158,0.58
5,KNN,0.68928,0.69
6,Dicision Tree,0.648923,0.65
7,Random Forest,0.695222,0.7
8,Naive Bayes,0.70364,0.7
9,SGDClassifier_perceptron,0.681357,0.69


In [117]:
# accuracy and AUC with ['attach_count', 'length', 'first_%', 'second_%']
print('input columns :', x_train.columns)
result_df = pd.DataFrame({'Model':model_list, "Accuracy":acc_list, 'AUC':auc_list})
result_df

input columns : Index(['length', 'attach_count', 'first_%', 'second_%', 'prod_star_score'], dtype='object')


Unnamed: 0,Model,Accuracy,AUC
0,Logistic Regression,0.700916,0.7
1,SVM,0.699678,0.7
2,KNN,0.68928,0.69
3,Dicision Tree,0.650904,0.65
4,Random Forest,0.694726,0.69
5,Naive Bayes,0.70364,0.7
6,KcELECTRA,0.696955,0.7


In [114]:
# accuracy and AUC with ['attach_count', 'length', 'first_%', 'second_%']
print('input columns :', x_train.columns)
result_df = pd.DataFrame({'Model':model_list, "Accuracy":acc_list, 'AUC':auc_list})
result_df

input columns : Index(['length', 'attach_count', 'first_%', 'second_%'], dtype='object')


Unnamed: 0,Model,Accuracy,AUC
0,Logistic Regression,0.700173,0.7
1,SVM,0.699431,0.7
2,KNN,0.693736,0.69
3,Dicision Tree,0.644219,0.64
4,Random Forest,0.69646,0.7
5,Naive Bayes,0.70562,0.71
6,KcELECTRA,0.696955,0.7


In [100]:
# accuracy and AUC with ['attach_count', 'length']
print('input columns :', x_train.columns)
result_df = pd.DataFrame({'Model':model_list, "Accuracy":acc_list, 'AUC':auc_list})
result_df

input columns : Index(['attach_count', 'length'], dtype='object')


Unnamed: 0,Model,Accuracy,AUC
0,Logistic Regression,0.652142,0.64
1,SVM,0.650904,0.65
2,KNN,0.606338,0.61
3,Dicision Tree,0.635801,0.63
4,Random Forest,0.642486,0.64
5,Naive Bayes,0.60411,0.59


In [102]:
# accuracy and AUC with ['attach_count']
print('input columns :', x_train.columns)
result_df = pd.DataFrame({'Model':model_list, "Accuracy":acc_list, 'AUC':auc_list})
result_df

input columns : Index(['attach_count'], dtype='object')


Unnamed: 0,Model,Accuracy,AUC
0,Logistic Regression,0.578113,0.56
1,SVM,0.578113,0.56
2,KNN,0.572914,0.56
3,Dicision Tree,0.578113,0.56
4,Random Forest,0.578113,0.56
5,Naive Bayes,0.563753,0.55


In [104]:
# accuracy and AUC with ['length']
print('input columns :', x_train.columns)
result_df = pd.DataFrame({'Model':model_list, "Accuracy":acc_list, 'AUC':auc_list})
result_df

input columns : Index(['length'], dtype='object')


Unnamed: 0,Model,Accuracy,AUC
0,Logistic Regression,0.644714,0.64
1,SVM,0.649418,0.65
2,KNN,0.600644,0.6
3,Dicision Tree,0.643476,0.64
4,Random Forest,0.645209,0.64
5,Naive Bayes,0.604853,0.59
