In [None]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import word_tokenize
import optuna
from optuna.trial import TrialState
from optuna.visualization._plotly_imports import _imports
import random
import os
import tensorflow as tf
import csv
import re
from gensim.models.fasttext import FastText
import pickle
from glove import Corpus, Glove
from nltk.tokenize import word_tokenize
from tqdm import tqdm_notebook

In [None]:
# fasttext
# FastText pipeline: seed every RNG, load and split the data, train the
# embeddings on the training tokens, and persist the trained model.

# gensim training is only reproducible for a given seed when workers == 1.
random_state = 42

random.seed(random_state)
np.random.seed(random_state)
# NOTE(review): assigning PYTHONHASHSEED at runtime does not change the hash
# seed of the interpreter that is already running; export it before starting
# the kernel if hash-stable runs are required.
os.environ["PYTHONHASHSEED"] = str(random_state)
tf.random.set_seed(random_state)


data = pd.read_csv('csv경로', encoding='cp949')
# print(data[:30])
print('데이터 개수 : ',len(data))
x = data['cvrs']
y = data['label']

# classes = list(set(y))

# 80/20 split, shuffled and stratified on the label column.
x_train, x_test, y_train, y_test = train_test_split(np.array(x), np.array(y), test_size =0.2, shuffle=True, random_state=random_state, stratify=y)
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

tokenized_train= [word_tokenize(text) for text in x_train]
tokenized_test= [word_tokenize(text) for text in x_test]
print(tokenized_test[:10])



# The embeddings are trained on the training tokens only.
vector_size = 200
ft_model = FastText(tokenized_train,
                    vector_size=vector_size,
                    window=8,
                    min_count=8,
                    sample=0.001,
                    sg=1,
                    epochs=16,
                    workers=1,  # single worker keeps the seeded run repeatable
                    seed = random_state)  # seed for the model's random number generator

filepath = '파일루트'
filename = filepath + '파일이름.pkl'
# Use a context manager so the file handle is flushed and closed even if
# pickling fails (the previous bare open() leaked the handle).
with open(filename, 'wb') as model_file:
    pickle.dump(ft_model, model_file)


def document_vectorizer(corpus, model, num_features):
    """Turn tokenised documents into averaged word-embedding features.

    Args:
        corpus: iterable of token lists, one per document.
        model: embedding model exposing ``wv.key_to_index`` and ``wv[word]``.
        num_features: length of the zero vector each document starts from
            (expected to match the model's vector size).

    Returns:
        ``np.array`` built from one float64 vector per document. A document
        without any in-vocabulary token keeps its all-zero vector.
    """
    # Set of words the model has a vector for.
    known_words = set(model.wv.key_to_index)

    def _mean_embedding(tokens):
        # Running sum of the vectors of every in-vocabulary token.
        total = np.zeros((num_features,), dtype="float64")
        hits = 0.
        for token in tokens:
            if token not in known_words:
                continue
            hits += 1.
            total = total + model.wv[token]
        # Divide only when something was found, so no 0/0 is ever computed.
        return total / hits if hits else total

    return np.array([_mean_embedding(tokens) for tokens in corpus])


# Build averaged word-vector document features from the trained FastText model.
X_TRAIN = document_vectorizer(corpus=tokenized_train, model=ft_model, num_features=vector_size)
X_TEST = document_vectorizer(corpus=tokenized_test, model=ft_model, num_features=vector_size)

# The label previously said "Word2Vec" although these are FastText features.
print(f'FastText model:> Train features shape: \t {X_TRAIN.shape}\n \
                         Test features shape: \t {X_TEST.shape}')

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier


# Linear classifier trained with SGD on the document vectors.
# NOTE(review): per the scikit-learn docs, validation_fraction is only used
# when early_stopping=True, which is not set here - confirm the intent.
# model = SGDClassifier(alpha=0.00001, loss='log', penalty='l2', n_jobs=-1, verbose=0, random_state=random_state, n_iter_no_change=8, validation_fraction=0.2)
model = SGDClassifier(alpha=0.0001, loss='modified_huber', penalty='l2', n_jobs=-1, verbose=0,random_state=random_state,  n_iter_no_change=8, validation_fraction=0.2)


# Wall-clock duration of the fit only.
start = datetime.now()
model.fit(X_TRAIN, y_train)
end = datetime.now()


print('시간 : ', end-start)

# svm_w2v_cv_scores = cross_val_score(model, X_TRAIN, y_train, cv=20)
# svm_w2v_cv_mean_score = np.mean(svm_w2v_cv_scores)
# print('CV Accuracy (5-fold):', svm_w2v_cv_scores)
# print('Mean CV Accuracy:', svm_w2v_cv_mean_score)

test_accuracy = model.score(X_TEST, y_test)
train_accuracy = model.score(X_TRAIN, y_train)
print('Train Accuracy:', train_accuracy)
print('Test Accuracy:', test_accuracy)

y_pred = model.predict(X_TEST)

# classification_report expects (y_true, y_pred); the swapped order used
# before reported precision as recall and vice versa for every class.
test_report = classification_report(y_test, y_pred, zero_division=0)
print('test_report : \n',test_report)

In [None]:
# glove
# GloVe pipeline: load and split the data, fit a Corpus on the training
# tokens, train the embeddings, then save the model and load it back.

random_state = 64

data = pd.read_csv('csv경로', encoding='cp949')
# print(data[:30])
print('데이터 개수 : ',len(data))
x = data['cvrs']
y = data['label']


# 80/20 split, shuffled and stratified on the label column.
x_train, x_test, y_train, y_test = train_test_split(np.array(x), np.array(y), test_size =0.2, shuffle=True, random_state=random_state, stratify=y)
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

tokenized_train= [word_tokenize(text) for text in x_train]
tokenized_test= [word_tokenize(text) for text in x_test]

# print(tokenized_test[:10])
# print(len(tokenized_test))


# Build the corpus object from the training tokens only (window of 5).
corpus = Corpus()
corpus.fit(tokenized_train, window=5)

# Learning rate 0.1, 200-dimensional output vectors.
# NOTE(review): random_state is used for the split above but is not passed to
# Glove(...) - confirm whether GloVe training is meant to be seeded.
glove = Glove(no_components=200, learning_rate=0.1)
# 4 threads, 64 training epochs, progress output disabled.
glove.fit(corpus.matrix, epochs=64, no_threads=4, verbose=False)
# Attach the corpus dictionary (index information for the matrix); the
# original author notes it is needed for similarity lookups.
glove.add_dictionary(corpus.dictionary)

glove.save('경로.model')

# Load the model back from the file that was just written; later code uses
# glove_model rather than glove.
glove_model = glove.load('경로.model')

def document_vectorizer(corpus, model, num_features):
    """Turn tokenised documents into averaged GloVe word-vector features.

    Args:
        corpus: iterable of token lists, one per document.
        model: fitted GloVe model exposing ``dictionary`` (word -> row index)
            and ``word_vectors`` (indexable by that row index).
        num_features: length of the zero vector each document starts from
            (expected to match the model's vector size).

    Returns:
        ``np.array`` built from one float64 vector per document. A document
        without any in-vocabulary token keeps its all-zero vector.
    """
    # Build the vocabulary from the model that was passed in. The previous
    # code read the notebook global `glove_model`, so the `model` argument was
    # ignored here and the function failed when that global did not exist.
    vocabulary = set(model.dictionary.keys())

    def average_word_vectors(words, model, vocabulary, num_features):
        # Start from a zero vector of the requested length.
        feature_vector = np.zeros((num_features,), dtype="float64")

        nwords = 0.
        for word in words:
            if word in vocabulary:
                nwords = nwords + 1.
                # Accumulate the vector of every in-vocabulary word.
                feature_vector = np.add(feature_vector, model.word_vectors[model.dictionary[word]])

        if nwords:
            # Average over the number of in-vocabulary words only.
            feature_vector = np.divide(feature_vector, nwords)
        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

# Build averaged word-vector document features from the trained GloVe model.
X_TRAIN = document_vectorizer(corpus=tokenized_train, model=glove_model, num_features=200)
X_TEST = document_vectorizer(corpus=tokenized_test, model=glove_model, num_features=200)


# The label previously said "Word2Vec" although these are GloVe features.
print(f'GloVe model:> Train features shape: \t {X_TRAIN.shape}\n \
                         Test features shape: \t {X_TEST.shape}')


import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier


# scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and 1.3
# removed the old spelling, so pick whichever the installed version accepts.
_sklearn_major_minor = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
logistic_loss = 'log_loss' if _sklearn_major_minor >= (1, 1) else 'log'

# NOTE(review): per the scikit-learn docs, validation_fraction is only used
# when early_stopping=True, which is not set here - confirm the intent.
model = SGDClassifier(alpha=0.00001, loss=logistic_loss, penalty='l2', n_jobs=-1, verbose=0, random_state=random_state, n_iter_no_change=8, validation_fraction=0.2)
# model = SVC(class_weight='balanced',  random_state=random_state)

# Wall-clock duration of the fit only.
start = datetime.now()
model.fit(X_TRAIN, y_train)
end = datetime.now()

print('시간 : ', end-start)

# svm_w2v_cv_scores = cross_val_score(model, X_TRAIN, y_train, cv=20)
# svm_w2v_cv_mean_score = np.mean(svm_w2v_cv_scores)
# print('CV Accuracy (5-fold):', svm_w2v_cv_scores)
# print('Mean CV Accuracy:', svm_w2v_cv_mean_score)

test_accuracy = model.score(X_TEST, y_test)
train_accuracy = model.score(X_TRAIN, y_train)
print('Train Accuracy:', train_accuracy)
print('Test Accuracy:', test_accuracy)

y_pred = model.predict(X_TEST)

# classification_report expects (y_true, y_pred); the swapped order used
# before reported precision as recall and vice versa for every class.
test_report = classification_report(y_test, y_pred, zero_division=0)
print('test_report : \n',test_report)

In [None]:
# w2v
# Word2Vec pipeline: seed the RNGs, load and split the data, and train
# skip-gram embeddings on the training tokens.

# Single source of truth for every seed used in this cell.
random_state = 64

random.seed(random_state)
np.random.seed(random_state)
# NOTE(review): assigning PYTHONHASHSEED at runtime does not change the hash
# seed of the interpreter that is already running; export it before starting
# the kernel if hash-stable runs are required.
os.environ["PYTHONHASHSEED"] = str(random_state)

data = pd.read_csv('csv경로', encoding='cp949')
# print(data[:30])
print('데이터 개수 : ',len(data))
x = data['cvrs']
y = data['label']


# 80/20 split, shuffled and stratified on the label column.
x_train, x_test, y_train, y_test = train_test_split(np.array(x), np.array(y), test_size =0.2, shuffle=True, random_state=random_state, stratify=y)
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

tokenized_train= [word_tokenize(text) for text in x_train]
tokenized_test= [word_tokenize(text) for text in x_test]

# Alternative: load pretrained vectors with KeyedVectors.load_word2vec_format(<model path>)

vector_size = 200
w2v_model = Word2Vec(sentences=tokenized_train,
                vector_size=vector_size,
                window=2,
                min_count=128, # lower values keep rarer words: slower training and a larger model
                # One worker thread: with several workers the seed below does
                # not make training repeatable (thread scheduling changes the
                # update order). The FastText cell uses workers=1 for the same
                # reason. This trades training speed for reproducibility.
                workers=1,
                sg=1, # skip-gram = 1, CBOW = 0
                seed=random_state,
                negative = 5, # number of negative samples drawn per positive example
                epochs=64)

def document_vectorizer(corpus, model, num_features):
    """Average the word vectors of each tokenised document.

    Args:
        corpus: iterable of token lists, one per document.
        model: embedding model exposing ``wv.key_to_index`` and ``wv[word]``.
        num_features: length of the zero vector each document starts from
            (expected to match the model's vector size).

    Returns:
        ``np.array`` built from one float64 vector per document. A document
        without any in-vocabulary token keeps its all-zero vector.
    """
    in_vocab = set(model.wv.key_to_index)

    rows = []
    for sentence in corpus:
        # Keep only the tokens the model has a vector for.
        matched = [token for token in sentence if token in in_vocab]

        acc = np.zeros((num_features,), dtype="float64")
        for token in matched:
            acc = np.add(acc, model.wv[token])

        # Average over the matched tokens; skip the division when none matched.
        if matched:
            acc = np.divide(acc, float(len(matched)))
        rows.append(acc)

    return np.array(rows)


# Build averaged word-vector document features from the trained Word2Vec model.
X_TRAIN = document_vectorizer(corpus=tokenized_train, model=w2v_model, num_features=vector_size)
X_TEST = document_vectorizer(corpus=tokenized_test, model=w2v_model, num_features=vector_size)

print(f'Word2Vec model:> Train features shape: \t {X_TRAIN.shape}\n \
                         Test features shape: \t {X_TEST.shape}')

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier


# Linear classifier trained with SGD (hinge loss) on the document vectors.
# NOTE(review): per the scikit-learn docs, validation_fraction is only used
# when early_stopping=True, which is not set here - confirm the intent.
# model = SGDClassifier(alpha=0.00001, loss='log', penalty='l2', n_jobs=-1, verbose=0, random_state=random_state, n_iter_no_change=8, validation_fraction=0.2)
model = SGDClassifier(alpha=0.0001, loss='hinge', penalty='l2', n_jobs=-1, verbose=0, random_state=random_state, n_iter_no_change=8, validation_fraction=0.2)

# Wall-clock duration of the fit only.
start = datetime.now()
model.fit(X_TRAIN, y_train)
end = datetime.now()

print('시간 : ', end-start)

# svm_w2v_cv_scores = cross_val_score(model, X_TRAIN, y_train, cv=20)
# svm_w2v_cv_mean_score = np.mean(svm_w2v_cv_scores)
# print('CV Accuracy (5-fold):', svm_w2v_cv_scores)
# print('Mean CV Accuracy:', svm_w2v_cv_mean_score)

test_accuracy = model.score(X_TEST, y_test)
train_accuracy = model.score(X_TRAIN, y_train)
print('Train Accuracy:', train_accuracy)
print('Test Accuracy:', test_accuracy)

y_pred = model.predict(X_TEST)

# classification_report expects (y_true, y_pred); the swapped order used
# before reported precision as recall and vice versa for every class.
test_report = classification_report(y_test, y_pred, zero_division=0)
print('test_report : \n',test_report)
