<a href="https://colab.research.google.com/github/mot1122/NLP_100_knock/blob/main/nlp_100_chap6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

50. データの入手・整形

In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip

In [None]:
!unzip NewsAggregatorDataset.zip

In [None]:
# Check the number of lines in the raw corpus file
!wc -l ./newsCorpora.csv

In [None]:
# Peek at the first 10 lines of the raw corpus file
!head -10 ./newsCorpora.csv

In [None]:
# Rewrite every double quote as a single quote and save the result as
# newsCorpora_re.csv (the original file is left untouched).
# NOTE(review): presumably this keeps pandas' default quote handling from
# treating stray double quotes as field quoting when the file is read - confirm.
!sed -e 's/"/'\''/g' ./newsCorpora.csv > ./newsCorpora_re.csv

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the quote-normalised corpus: tab-separated, no header row, so the
# column names are supplied explicitly.
df=pd.read_csv(
    './newsCorpora_re.csv',
    sep='\t',
    header=None,
    names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'],
)

In [None]:
# Preview the first two rows of the loaded frame
df.head(2)

In [None]:
# Let pandas show up to 100 columns when displaying a DataFrame
pd.set_option('display.max_columns',100)

In [None]:
# Keep only articles from the five selected publishers, and only the
# headline and category columns.
df=df.loc[
    df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']),
    ['TITLE', 'CATEGORY'],
]

In [None]:
# First split: 80% train / 20% held out, stratified on the category label.
train,valid_test=train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=123,
    stratify=df['CATEGORY'],
)
# Second split: halve the held-out part into validation and test sets,
# again stratified on the category label.
valid,test=train_test_split(
    valid_test,
    test_size=0.5,
    shuffle=True,
    random_state=123,
    stratify=valid_test['CATEGORY'],
)

In [None]:
# Write each split as a tab-separated file without the index column.
for _frame,_path in ((train,'./train.txt'),(valid,'./valid.txt'),(test,'./test.txt')):
  _frame.to_csv(_path,sep='\t',index=False)

In [None]:
# Print the number of examples per category for every split, each followed
# by a blank line.
for _header,_frame in (("[train]",train),('[valid]',valid),('[test]',test)):
  print(_header)
  print(_frame['CATEGORY'].value_counts())
  print()

51. 特徴量抽出

In [None]:
import string
import re

def preprocessing(text):
  """Normalize a headline before vectorization.

  Every character in ``string.punctuation`` is replaced by a space, the text
  is lower-cased, and each run of digits is collapsed to a single ``0``.

  Args:
    text: the raw string to normalize.

  Returns:
    The normalized string.
  """
  table=str.maketrans(string.punctuation,' '*len(string.punctuation))
  text=text.translate(table)
  text=text.lower()
  # Raw string: '\d' inside a normal literal is an invalid escape sequence,
  # which newer Python versions warn about.
  text=re.sub(r'\d+','0',text)
  return text

In [None]:
# Stack the three splits in train, valid, test order with a fresh 0..n-1
# index, then normalize every headline.
df=pd.concat([train,valid,test],axis=0).reset_index(drop=True)
df['TITLE']=df['TITLE'].map(preprocessing)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# df holds the preprocessed rows in train, valid, test order, so the splits
# can be recovered by position.
train_valid=df[:len(train)+len(valid)]
test=df[len(train)+len(valid):]

# Uni- and bi-gram TF-IDF; terms appearing in fewer than 10 documents are dropped.
vec_tfidf=TfidfVectorizer(min_df=10,ngram_range=(1,2))

# Fit the vocabulary on train+valid only; the test set is transformed with it.
X_train_valid=vec_tfidf.fit_transform(train_valid['TITLE'])
X_test=vec_tfidf.transform(test['TITLE'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older versions.
if hasattr(vec_tfidf,'get_feature_names_out'):
  feature_names=vec_tfidf.get_feature_names_out()
else:
  feature_names=vec_tfidf.get_feature_names()

X_train_valid=pd.DataFrame(X_train_valid.toarray(),columns=feature_names)
X_test=pd.DataFrame(X_test.toarray(),columns=feature_names)

In [None]:
# Split the train+valid feature matrix back into its two parts by position.
X_train=X_train_valid[:len(train)]
X_valid=X_train_valid[len(train):]

# Save all three feature matrices as tab-separated files.
# (X_test was previously written with sep='t', i.e. the letter "t" as the
# delimiter, which made the file inconsistent with the other two.)
X_train.to_csv('./X_train.txt',sep='\t',index=False)
X_valid.to_csv('./X_valid.txt',sep='\t',index=False)
X_test.to_csv('./X_test.txt',sep='\t',index=False)

52. 学習

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier on the training features with a
# fixed seed and a generous iteration cap.
lg=LogisticRegression(max_iter=10000,random_state=123)
lg.fit(X=X_train,y=train['CATEGORY'])

53. 予測

In [None]:
import numpy as np

def score_lg(lg, X):
  """Return the model's confidence and predicted label for every row of X.

  Args:
    lg: a fitted classifier exposing ``predict_proba`` and ``predict``.
    X: the feature matrix to score.

  Returns:
    A two-element list: the per-row maximum of ``predict_proba(X)`` and the
    result of ``predict(X)``.
  """
  class_probabilities=lg.predict_proba(X)
  highest_probability=np.max(class_probabilities,axis=1)
  predicted_labels=lg.predict(X)
  return [highest_probability,predicted_labels]

In [None]:
# Score the training and test features with the fitted model; each result is
# the [max probability, predicted label] pair returned by score_lg.
train_pred,test_pred=(score_lg(lg,_features) for _features in (X_train,X_test))

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the predicted labels (element 1 of each prediction pair)
# against the true categories.
train_accuracy = accuracy_score(y_true=train['CATEGORY'],y_pred=train_pred[1])
test_accuracy = accuracy_score(y_true=test['CATEGORY'],y_pred=test_pred[1])

In [None]:
# Show the [max class probability, predicted label] arrays for the training split
print(train_pred)

54. 正解率の計測

In [None]:
# Report both accuracies, one per line, rounded to three decimals.
print(
    f'train_accuracy : {train_accuracy:.3f}',
    f'test_accuracy : {test_accuracy:.3f}',
    sep='\n',
)

55. 混同行列の作成

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix on the training split (rows: true label, columns: predicted).
# The label order is fixed to the classifier's classes so the heatmap ticks
# can be annotated with the actual category codes.
train_cm=confusion_matrix(train['CATEGORY'],train_pred[1],labels=lg.classes_)
print(train_cm)
# fmt='d' prints the integer counts as-is; seaborn's default format ('.2g')
# would render large counts in scientific notation.
sns.heatmap(train_cm,annot=True,fmt='d',cmap='Blues',
            xticklabels=lg.classes_,yticklabels=lg.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Confusion matrix on the test split (rows: true label, columns: predicted),
# with the label order fixed to the classifier's classes for the tick labels.
test_cm=confusion_matrix(test['CATEGORY'],test_pred[1],labels=lg.classes_)
# fmt='d' prints the integer counts as-is; seaborn's default format ('.2g')
# would render large counts in scientific notation.
sns.heatmap(test_cm,annot=True,fmt='d',cmap='Blues',
            xticklabels=lg.classes_,yticklabels=lg.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

56. 適合率，再現率，F1スコアの計測

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def culculate_scores(y_true,y_pred):
  """Build a precision / recall / F1 table for the four news categories.

  Args:
    y_true: the true category labels.
    y_pred: the predicted category labels.

  Returns:
    A DataFrame indexed by 'b', 'e', 't', 'm', 'micro', 'macro' with one
    column per metric: per-class scores followed by the micro and macro
    averages.
  """
  class_labels=['b','e','t','m']
  metric_by_column={'適合率':precision_score,'再現率':recall_score,'F1スコア':f1_score}

  columns={}
  for column_name,metric in metric_by_column.items():
    # Per-class values in the fixed label order, then the two averages.
    per_class=metric(y_true,y_pred,average=None,labels=class_labels)
    micro_avg=metric(y_true,y_pred,average='micro')
    macro_avg=metric(y_true,y_pred,average='macro')
    columns[column_name]=np.append(np.append(per_class,micro_avg),macro_avg)

  return pd.DataFrame(columns,index=class_labels+['micro','macro'])

  

In [None]:
# Print the precision / recall / F1 table for the test-split predictions
print(culculate_scores(test['CATEGORY'],test_pred[1]))

57. 特徴量の重みの確認

In [None]:
# For each class, list the 10 features with the largest and the 10 with the
# smallest coefficients of the fitted model.
features=X_train.columns.values
index=list(range(1,11))
for c,coef in zip(lg.classes_,lg.coef_):
  print(f'CATEGORY : {c}')
  # Feature positions ordered from smallest to largest coefficient.
  _ascending=np.argsort(coef)
  best10=pd.DataFrame(features[_ascending[::-1][:10]],columns=['重要度上位'],index=index).T
  worst10=pd.DataFrame(features[_ascending[:10]],columns=['重要度下位'],index=index).T
  display(pd.concat([best10,worst10],axis=0))
  print('\n')



58. 正則化パラメータの変更

In [None]:
# Preview the first rows of the training feature matrix
X_train.head()

In [None]:
from tqdm import tqdm
# Sweep the inverse regularization strength C over 10^-3 ... 10^3 and record
# [C, train accuracy, valid accuracy, test accuracy] for every setting.
result=[]
for C in tqdm(np.logspace(-3,3,7,base=10)):
  lg=LogisticRegression(C=C,max_iter=10000,random_state=123)
  lg.fit(X_train,train['CATEGORY'])

  # score_lg returns [max probability, predicted label]; element 1 is scored.
  train_pred,valid_pred,test_pred=[score_lg(lg,_X) for _X in (X_train,X_valid,X_test)]

  train_accuracy=accuracy_score(train['CATEGORY'],train_pred[1])
  valid_accuracy=accuracy_score(valid['CATEGORY'],valid_pred[1])
  test_accuracy=accuracy_score(test['CATEGORY'],test_pred[1])

  result.append([C,train_accuracy,valid_accuracy,test_accuracy])

In [None]:
# Show one raw record ([C, train, valid, test accuracy]) ...
print(result[1])
# ... and the whole sweep transposed (one row per series) for inspection.
# `result` itself is deliberately NOT overwritten here: the plotting cell
# below performs its own transpose, so transposing in this cell as well left
# the data in the wrong orientation for the plot and made this cell give a
# different result each time it was re-run.
print(np.array(result).T)

[0.01, 0.7606701609883939, 0.7589820359281437, 0.7537425149700598]
[[1.00000000e-03 1.00000000e-02 1.00000000e-01 1.00000000e+00
  1.00000000e+01 1.00000000e+02 1.00000000e+03]
 [4.73137402e-01 7.60670161e-01 8.04473980e-01 9.26525646e-01
  9.78098091e-01 9.97004867e-01 9.98408836e-01]
 [4.65568862e-01 7.58982036e-01 7.91916168e-01 8.77994012e-01
  8.87724551e-01 8.70508982e-01 8.54041916e-01]
 [4.82784431e-01 7.53742515e-01 7.97904192e-01 8.85479042e-01
  8.91467066e-01 8.71257485e-01 8.63772455e-01]]


In [None]:
# The plot calls below treat row 0 as the x values (C) and rows 1-3 as the
# train / valid / test accuracy series.
# NOTE(review): verify `result` has not already been transposed by an earlier
# cell before this transpose runs, otherwise the rows are not the series.
result=np.array(result).T
for _series,_label in zip(result[1:4],('train','valid','test')):
  plt.plot(result[0],_series,label=_label)
plt.ylim(0,1.1)
plt.xlim(10**(-6),10**5)
plt.xscale('log')
plt.xlabel('C')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

59. ハイパーパラメータの探索

In [None]:
# Install Optuna, which the hyperparameter search in the cells below uses
!pip install optuna

In [None]:
def objective_lg(trial):
  """Optuna objective: validation accuracy of an elastic-net logistic regression.

  Samples ``l1_ratio`` and ``C`` from the trial, fits the model on the
  notebook-level ``X_train`` / ``train`` and scores it on ``X_valid`` /
  ``valid``.

  Args:
    trial: the Optuna trial used to sample the hyperparameters.

  Returns:
    The accuracy on the validation split.
  """
  # suggest_float replaces the deprecated suggest_uniform.
  l1_ratio=trial.suggest_float('l1_ratio',0,1)
  # C spans eight orders of magnitude, so it is sampled on a log scale;
  # uniform sampling over [1e-4, 1e4] almost never proposes a value below 1.
  C=trial.suggest_float('C',1e-4,1e4,log=True)

  lg=LogisticRegression(random_state=123,max_iter=10000,penalty='elasticnet',solver='saga',l1_ratio=l1_ratio,C=C)
  lg.fit(X_train,train['CATEGORY'])
  valid_pred=score_lg(lg,X_valid)
  valid_accuracy=accuracy_score(valid['CATEGORY'],valid_pred[1])
  return valid_accuracy

In [None]:
# No import of optuna is visible anywhere else in this notebook, so bring it
# into scope here; otherwise the cell fails on a fresh kernel.
import optuna

# Maximize validation accuracy for up to one hour of wall-clock time.
study=optuna.create_study(direction='maximize')
study.optimize(objective_lg,timeout=3600)
trial=study.best_trial
print('best trial:')
print(f' value:{trial.value}')
print(' params:')
# Look the values up by name: unpacking trial.params.items() binds
# (name, value) tuples, which print as tuples and cannot be passed on to
# LogisticRegression.
l1_ratio=trial.params['l1_ratio']
C=trial.params['C']
print(f'   l1_ratio:{l1_ratio}')
print(f'   C:{C}')

[32m[I 2021-11-15 12:34:49,341][0m A new study created in memory with name: no-name-ddec93e8-bcab-42be-9462-83742d5cd00a[0m
[32m[I 2021-11-15 13:19:35,601][0m Trial 0 finished with value: 0.8532934131736527 and parameters: {'l1_ratio': 0.8475355126958377, 'C': 2908.7565090650137}. Best is trial 0 with value: 0.8532934131736527.[0m
[32m[I 2021-11-15 13:53:08,133][0m Trial 1 finished with value: 0.8547904191616766 and parameters: {'l1_ratio': 0.48867813785740133, 'C': 3066.844889610886}. Best is trial 1 with value: 0.8547904191616766.[0m


best trial:
 value:0.8547904191616766
 params:
   l1_ratio:('l1_ratio', 0.48867813785740133)
   C:('C', 3066.844889610886)


In [None]:
# Refit with the best hyperparameters found by the study and report accuracy
# on all three splits. The values are read from study.best_params rather than
# the notebook-level names `l1_ratio` / `C`: those can hold stale or
# non-numeric values (`C` is also the loop variable of the earlier sweep),
# which is what produced the TypeError recorded for this cell.
best_params=study.best_params
lg=LogisticRegression(random_state=123,max_iter=10000,penalty='elasticnet',solver='saga',
                      l1_ratio=best_params['l1_ratio'],C=best_params['C'])
lg.fit(X_train,train['CATEGORY'])
train_pred=score_lg(lg,X_train)
valid_pred=score_lg(lg,X_valid)
test_pred=score_lg(lg,X_test)

# score_lg returns [max probability, predicted label]; element 1 is scored.
train_accuracy=accuracy_score(train['CATEGORY'],train_pred[1])
valid_accuracy=accuracy_score(valid['CATEGORY'],valid_pred[1])
test_accuracy=accuracy_score(test['CATEGORY'],test_pred[1])

print(f'train_accuracy : {train_accuracy}')
print(f'valid_accuracy : {valid_accuracy}')
print(f'test_accuracy : {test_accuracy}')


TypeError: ignored

In [None]:
# Install XGBoost and import it under the conventional `xgb` alias
!pip install xgboost
import xgboost as xgb

In [None]:
# Booster parameters for a 4-class softmax classifier. 'num_round' and
# 'early_stopping_rounds' are not booster parameters; they are kept here for
# convenience and popped out before training.
params={
    'objective':'multi:softmax',
    'num_class':4,
    'eval_metric':'mlogloss',
    'colsample_bytree':1.0,
    'colsample_bylevel':0.5,
    'min_child_weight':1,
    # Was misspelled 'sabsample', which is not an XGBoost parameter, so the
    # intended 0.9 row subsampling was never applied.
    'subsample':0.9,
    'eta':0.1,
    'max_depth':5,
    'gamma':0.0,
    'alpha':0.0,
    'lambda':1.0,
    'num_round':1000,
    'early_stopping_rounds':50,
    'verbosity':0
}

# XGBoost needs integer class ids, so map the category codes to 0..3.
category_dict={'b':0,'e':1,'t':2,'m':3}
y_train=train['CATEGORY'].map(category_dict)
y_valid=valid['CATEGORY'].map(category_dict)
y_test=test['CATEGORY'].map(category_dict)
dtrain=xgb.DMatrix(X_train,label=y_train)
dvalid=xgb.DMatrix(X_valid,label=y_valid)
dtest=xgb.DMatrix(X_test,label=y_test)

num_round=params.pop('num_round')
early_stopping_rounds=params.pop('early_stopping_rounds')
# (DMatrix, name) pairs evaluated each round; early stopping monitors the
# last entry, i.e. the validation set.
watchlist=[(dtrain,'train'),(dvalid,'eval')]
# `dtrain` was previously misspelled `drain`, which raised a NameError.
model=xgb.train(params,dtrain,num_round,evals=watchlist,early_stopping_rounds=early_stopping_rounds)

In [None]:
# Predict with the trees up to (and including) the best early-stopping round.
# `ntree_limit` / `best_ntree_limit` were removed in XGBoost 2.0;
# `iteration_range` with `best_iteration` is the supported replacement.
best_range=(0,model.best_iteration+1)
# `dtrain` was previously misspelled `dtain`, which raised a NameError.
train_pred=model.predict(dtrain,iteration_range=best_range)
valid_pred=model.predict(dvalid,iteration_range=best_range)
test_pred=model.predict(dtest,iteration_range=best_range)

# With the 'multi:softmax' objective predict() returns class ids, which are
# compared directly with the integer-encoded labels.
train_accuracy=accuracy_score(y_train,train_pred)
valid_accuracy=accuracy_score(y_valid,valid_pred)
test_accuracy=accuracy_score(y_test,test_pred)

print(f'train_accuracy : {train_accuracy}')
print(f'valid_accuracy : {valid_accuracy}')
print(f'test_accuracy : {test_accuracy}')