<a href="https://colab.research.google.com/github/bill7845/brunch_networking_aws/blob/main/Notebook/modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
colab. konlpy setting
"""
# Refresh the apt package index before installing system packages.
!apt-get update
# Install g++ and OpenJDK 8 (presumably prerequisites for konlpy / Mecab — confirm).
!apt-get install g++ openjdk-8-jdk 
# Install the konlpy Python package (provides the Okt and Mecab taggers imported below).
!pip3 install konlpy

# Clone the repository that contains the Mecab-ko install script run in a later cell.
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git

In [None]:
# Move into the cloned repository so the install script in the next cell can be run by its bare name.
cd Mecab-ko-for-Google-Colab/

In [None]:
# Run the Mecab-ko install script from the cloned repository (expects the working directory set by the previous cell).
! bash install_mecab-ko_on_colab190912.sh

In [None]:
# boto3 is used by s3_upload(); awscli provides the `aws configure` command run in a later cell.
# NOTE(review): versions are not pinned, so a re-run may install different releases.
!pip install boto3
!pip install awscli

In [None]:
## library import
import pandas as pd
import numpy as np
import json
import os
import pickle
import urllib.request
import re
from ast import literal_eval
from google.oauth2 import service_account
from google.cloud import storage

import boto3
import os
import io

from konlpy.tag import Okt
from konlpy.tag import Mecab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score

In [None]:
"""
aws auth setting
"""
# Launch the AWS CLI's `aws configure` command to set up AWS credentials for this session.
# s3_upload() creates its boto3 client without explicit credentials, so it presumably
# relies on what is configured here — confirm.
!aws configure

In [None]:
"""
bigquery Authentication setting.
"""

# Path to the service-account JSON key file used for BigQuery access.
# NOTE(review): hard-coded, user-specific absolute path to a credential file —
# consider reading it from configuration / an environment variable instead.
bq_credential_path = '/content/drive/MyDrive/Colab code/brunch_project/code/gcp_access/brunch-networking-07958d4e3d41.json'
# Credentials object built from the key file; default `credentials` argument of get_data_from_bq().
credentials = service_account.Credentials.from_service_account_file(bq_credential_path)
# GCP project id; default `project_id` argument of get_data_from_bq().
project_id = 'brunch-networking-303012'

def get_data_from_bq(credentials=credentials, project_id=project_id, class_num=None):
  """Load rows of the ``ori_brunch_data`` BigQuery table into a DataFrame.

  Args:
    credentials: Credentials object passed to ``pd.read_gbq``. Defaults to the
      module-level ``credentials``.
    project_id: GCP project id passed to ``pd.read_gbq``. Defaults to the
      module-level ``project_id``.
    class_num: ``None`` (default) selects every row; otherwise only rows whose
      ``class_num`` column equals this integer are selected.

  Returns:
    pandas.DataFrame with the selected rows, excluding rows whose ``text``
    column is null.

  Raises:
    TypeError, ValueError: if ``class_num`` is given but cannot be converted
      to an integer.
  """
  # The previous default was the builtin type `int`, which was formatted
  # verbatim into the WHERE clause; `None` now means "no filter".
  query = "SELECT * FROM `brunch-networking-303012.brunch_networking.ori_brunch_data`"

  if class_num is not None:
    # int() guarantees that only a number is interpolated into the SQL text.
    query += " where class_num = {class_num}".format(class_num=int(class_num))

  df = pd.read_gbq(query=query, project_id=project_id, credentials=credentials, dialect='standard')

  # Drop rows with empty text.
  return df[df['text'].notnull()]

In [None]:
def s3_upload(bucket_name, bucket_key, target_file_path):
  """Upload one local file to an S3 bucket.

  Args:
    bucket_name: Name of the destination bucket.
    bucket_key: Object key to store the file under.
    target_file_path: Path of the local file to upload.

  Returns:
    None.
  """
  upload_args = {
      'Filename': target_file_path,
      'Bucket': bucket_name,
      'Key': bucket_key,
  }
  # A new client is created per call; no credentials are passed explicitly.
  client = boto3.client('s3')
  client.upload_file(**upload_args)

In [None]:
def split_train_test(df):
  """Split ``df`` into train and test parts, stratified on ``class_num``.

  Args:
    df: DataFrame with at least the columns ``text`` and ``class_num``.

  Returns:
    Tuple ``(x_train, x_test, y_train, y_test)`` where the x parts are
    single-column (``text``) DataFrames and the y parts hold ``class_num``.
    The test part is 20% of the rows and the split uses ``random_state=0``.
  """
  labels = df['class_num']
  parts = train_test_split(
      df[['text']],
      labels,
      test_size=0.2,
      random_state=0,
      stratify=labels,
  )
  features_train, features_test, labels_train, labels_test = parts
  return features_train, features_test, labels_train, labels_test


def custom_tagging(df):
  """Reduce every document in ``df['text']`` to its space-joined nouns.

  Each text is POS-tagged with Mecab; only tokens tagged ``NNG`` or ``NNP``
  whose surface form is longer than one character are kept.

  Args:
    df: DataFrame with a ``text`` column of strings.

  Returns:
    List of strings, one per row of ``df`` in row order; each string is the
    kept tokens of that row joined by single spaces.
  """
  tagger = Mecab()
  noun_tags = ("NNG", "NNP")

  def _kept_nouns(sentence):
    # Yield the surface form of every noun of length >= 2, in order.
    for morph in tagger.pos(sentence):
      if morph[1] in noun_tags and len(morph[0]) > 1:
        yield morph[0]

  return [" ".join(_kept_nouns(sentence)) for sentence in df['text'].values.tolist()]

## top_n error
## Error rate of samples whose true class is not among the top-n predicted classes (n=2 by default).
def top_n_error(model, y_test_proba, y_test, top_n=2):
  """Compute and print the top-n error rate of a probabilistic classifier.

  A sample counts as an error when its true label is not one of the ``top_n``
  classes with the highest predicted probability.

  Args:
    model: Fitted classifier exposing ``classes_``; column ``j`` of
      ``y_test_proba`` is taken to belong to ``model.classes_[j]``.
    y_test_proba: 2-D array-like of per-class scores, one row per sample.
    y_test: True labels, one per row of ``y_test_proba``. May be a pandas
      Series (read positionally, whatever its index), a NumPy array or a list.
    top_n: Number of highest-scoring classes that count as a hit.

  Returns:
    float: number of misses divided by the number of samples.

  Raises:
    ValueError: if ``y_test`` and ``y_test_proba`` have different lengths.
    ZeroDivisionError: if ``y_test`` is empty.
  """
  # Column indices of the top_n largest scores in each row.
  top_n_pred = np.argsort(y_test_proba, axis=1)[:, -top_n:]
  # Translate indices to labels once; doing this inside a per-sample loop
  # would rebuild the whole array for every sample.
  top_n_labels = np.asarray(model.classes_)[top_n_pred]

  y_true = np.asarray(y_test)
  if top_n_labels.shape[0] != y_true.shape[0]:
    raise ValueError(
        "y_test has {} labels but y_test_proba has {} rows".format(
            y_true.shape[0], top_n_labels.shape[0]))

  # A row is a hit when any of its top-n labels equals the true label.
  is_hit = (top_n_labels == y_true[:, None]).any(axis=1)
  miss_count = int((~is_hit).sum())

  error_rate = miss_count / len(y_true)

  print("Top_"+str(top_n)+" Error : ",error_rate)

  return error_rate

In [None]:
# all_df = get_data_from_bq(class_num=None) # load all_df from bigquery
all_df = pd.read_csv("/content/drive/MyDrive/Colab code/brunch_project/data/basement/all_df.csv") # load all_df.csv from google drive

x_train, x_test, y_train, y_test = split_train_test(all_df) # split all_df. train/test

train_data = custom_tagging(x_train) # mecab pos tagging.
test_data = custom_tagging(x_test) # mecab pos tagging.

tfidf_vect = TfidfVectorizer(max_df=0.9)

# The vectorizer is fitted on the training documents only; the test documents
# are later transformed with the same fitted vectorizer.
tfidf_train_vect = tfidf_vect.fit(train_data) 
tfidf_train_matrix = tfidf_vect.transform(train_data)
path_train = '/content/drive/My Drive/Colab code/brunch_project/data/vect/train/'
# `with` closes (and therefore flushes) each file before it is uploaded below.
with open(os.path.join(path_train,'tfidf_train_vect.pkl'),'wb') as f: # save google drive
  pickle.dump(tfidf_train_vect, f, protocol=4)
with open(os.path.join(path_train,'tfidf_train_matrix.pkl'),'wb') as f: # save google drive
  pickle.dump(tfidf_train_matrix, f, protocol=4)

# upload s3
s3_upload(bucket_name='util-brunch-networking',
          bucket_key='train/' + 'tfidf_train_vect.pkl',
          target_file_path= path_train + 'tfidf_train_vect.pkl')

# This line used to be commented out, so the dump below (and the model cell)
# raised NameError on a fresh kernel.
tfidf_test_matrix = tfidf_vect.transform(test_data)
path_test = '/content/drive/My Drive/Colab code/brunch_project/data/vect/test/'
with open(os.path.join(path_test,'tfidf_test_matrix.pkl'),'wb') as f: # save google drive
  pickle.dump(tfidf_test_matrix, f, protocol=4)

# upload s3
s3_upload(bucket_name='util-brunch-networking',
          bucket_key='test/' + 'tfidf_test_matrix.pkl',
          target_file_path= path_test + 'tfidf_test_matrix.pkl')

In [None]:
# ## logistic Regression
# max_iter is raised above the library default because the saved output of
# this cell shows lbfgs stopping with "TOTAL NO. of ITERATIONS REACHED LIMIT";
# raise it further if the convergence warning still appears.
lg_model = LogisticRegression(C=1,multi_class='multinomial',solver='lbfgs',max_iter=1000)
lg_model.fit(tfidf_train_matrix, y_train)

model_path = '/content/drive/My Drive/Colab code/brunch_project/data/model_weight/'
# `with` closes (and therefore flushes) the file before it is uploaded below.
with open(os.path.join(model_path,'classifier_lg.pkl'),'wb') as f: # save google drive
  pickle.dump(lg_model, f, protocol=4)

# load model weight. 
# with open('/content/drive/My Drive/Colab code/brunch_project/data/model_weight/classifier_lg.pkl', 'rb') as f:
#   lg_model = pickle.load(f)

pred_logistic = lg_model.predict(tfidf_test_matrix) # prediction
pred_logistic_proba = lg_model.predict_proba(tfidf_test_matrix) # get prediction probability

## check classification result
print("classification report", classification_report(y_test,pred_logistic))
print("accuracy : ",accuracy_score(y_test,pred_logistic)) 
print("f1_score : ",f1_score(y_test,pred_logistic, average='macro'))
# Share of test samples whose true class is not among the two most probable predictions.
top_2_error = top_n_error(lg_model ,pred_logistic_proba, y_test, top_n=2) 

# upload s3
s3_upload(bucket_name='util-brunch-networking',
          bucket_key='model_weight/' + 'classifier_lg.pkl',
          target_file_path= model_path + 'classifier_lg.pkl')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


classification report               precision    recall  f1-score   support

           0       0.75      0.72      0.73      3373
           1       0.74      0.71      0.73       726
           2       0.81      0.73      0.77      1652
           3       0.64      0.42      0.50      1044
           4       0.75      0.62      0.68      1409
           5       0.69      0.47      0.56       672
           6       0.74      0.69      0.71       792
           7       0.80      0.69      0.74      1254
           8       0.66      0.54      0.59      4121
           9       0.53      0.70      0.60      9513
          10       0.74      0.71      0.72      1690
          11       0.73      0.66      0.69      1151
          12       0.79      0.78      0.78      1576
          13       0.80      0.82      0.81      3018
          14       0.64      0.56      0.60      2695
          15       0.74      0.68      0.71      1249
          16       0.75      0.69      0.72      3598
     