In [1]:
import pandas as pd
import numpy as np
import math
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
random_seed = 2020

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
random_seed = 2020

## Load dataset

- 데이터셋 불러오기

In [8]:
movie_review = pd.read_csv('/content/drive/MyDrive/데이터마이닝 과제/week12_ml/week12_ml/dataset/imdb_dataset.txt', delimiter='\t', compression='zip').reset_index(drop=True).dropna()
movie_review.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [9]:
movie_review.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


- 긍정 리뷰와 부정 리뷰 비율 확인(다를 경우, 클래스 불균형 문제 발생)

In [11]:
pd.DataFrame(movie_review['sentiment'].value_counts(normalize=True))

Unnamed: 0,sentiment
negative,0.5
positive,0.5


# Split dataset

In [96]:
X = movie_review['review'].values.tolist()
y = movie_review['sentiment'].values.tolist()

##y값 0, 1로 변경

In [97]:
for i in range(len(y)):
  if y[i] == 'negative':
    y[i] = 0
  else:
    y[i] = 1

In [98]:
print(X[0], y[0])

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

In [99]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=random_seed)

# Data preprocessing

In [None]:
! pip install contractions
import re
import contractions

In [100]:
def text_clean(text):
    
    text = contractions.fix(text) #he's -> he is
    text = re.sub('<br />'," ",text)  
    text = re.sub('https?:/\/\S+', ' ', text) # remove urls
    text = re.sub('[0-9]+', ' ', text) # remove numbers
    text = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\…》]+', ' ', text) # remove all symbols and punctuation except for . , ! and ?
    text = re.sub("'"," ", text)
    text = re.sub('\s+', ' ', text) # 중복 띄어쓰기 삭제

    text = text.lower() # 소문자
    return text.strip()

In [101]:
text_clean(X[1])

'a wonderful little production the filming technique is very unassuming very old time bbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great master s of comedy and his life the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwell s murals decorating every surface are terribly well done'

#Tokenization

In [104]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer=TreebankWordTokenizer()

def token(text): 
  return tokenizer.tokenize(text)

# Word Vectorizer

In [105]:
from sklearn.feature_extraction.text import CountVectorizer

# Counter Vectorization으로 피처 벡터화 수행
cnt_vect = CountVectorizer(lowercase=False, preprocessor = text_clean,  tokenizer = token, min_df=5, max_features=1600)
cnt_vect.fit(X_train)

X_train_cnt_vect = cnt_vect.transform(X_train)

#  학습 데이터로 fit()된 Vectorizer를 이용해 테스트 데이터를 피처 벡터화 변환 수행
X_test_cnt_vect = cnt_vect.transform(X_test)



In [106]:
from sklearn.feature_extraction.text import TfidfVectorizer

# X_train
tfidf_vect = TfidfVectorizer(lowercase=False, preprocessor = text_clean,  tokenizer = token, min_df=5, max_features=1600)

tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)

# X_test
X_test_tfidf_vect = tfidf_vect.transform(X_test)



# Model Selection

In [107]:
def model_selection(X_train, y_train, kfold=3):
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import LinearSVC
    from sklearn.neighbors import KNeighborsClassifier
    from xgboost import XGBClassifier
    from sklearn.model_selection import cross_val_score
    from tqdm.notebook import tqdm

    models = [
        LogisticRegression(),
        XGBClassifier(random_state=random_seed),
        LinearSVC(),
        KNeighborsClassifier(),
        RandomForestClassifier(),
    ]

    progress_bar = tqdm(total = len(models) * kfold)
    
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(model, X_train, y_train, 
                        scoring='accuracy', cv=kfold, n_jobs=-1)
    
        for fold_idx, accuracy in enumerate(accuracies):
            progress_bar.update()
            entries.append((model_name, fold_idx, accuracy))
    
    progress_bar.close()
    
    cv_result = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'], index=range(kfold * len(models)))
    return cv_result

In [108]:
model_selection_result = model_selection(X_train_tfidf_vect, y_train)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))






In [112]:
model_selection_result

Unnamed: 0,model_name,fold_idx,accuracy
0,LogisticRegression,0,0.869975
1,LogisticRegression,1,0.871261
2,LogisticRegression,2,0.870393
3,XGBClassifier,0,0.807491
4,XGBClassifier,1,0.802948
5,XGBClassifier,2,0.809018
6,LinearSVC,0,0.870404
7,LinearSVC,1,0.869632
8,LinearSVC,2,0.868678
9,KNeighborsClassifier,0,0.70378


In [110]:
pd.pivot_table(model_selection_result, values='accuracy', index=['model_name'], aggfunc=np.mean, fill_value=0)

Unnamed: 0_level_0,accuracy
model_name,Unnamed: 1_level_1
KNeighborsClassifier,0.698514
LinearSVC,0.869571
LogisticRegression,0.870543
RandomForestClassifier,0.831086
XGBClassifier,0.806486


In [111]:
model_selection_result2 = model_selection(X_train_cnt_vect, y_train)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))






In [113]:
model_selection_result2

Unnamed: 0,model_name,fold_idx,accuracy
0,LogisticRegression,0,0.869032
1,LogisticRegression,1,0.863375
2,LogisticRegression,2,0.86645
3,XGBClassifier,0,0.805348
4,XGBClassifier,1,0.80312
5,XGBClassifier,2,0.808332
6,LinearSVC,0,0.861147
7,LinearSVC,1,0.83569
8,LinearSVC,2,0.808246
9,KNeighborsClassifier,0,0.625611


In [114]:
pd.pivot_table(model_selection_result2, values='accuracy', index=['model_name'], aggfunc=np.mean, fill_value=0)

Unnamed: 0_level_0,accuracy
model_name,Unnamed: 1_level_1
KNeighborsClassifier,0.624972
LinearSVC,0.835028
LogisticRegression,0.866286
RandomForestClassifier,0.831114
XGBClassifier,0.8056


**Logistic Regression의 성능이 가장 좋았고 이 때 TF-IDF를 썼을 경우 가장 좋은 
성능을 보였다.**

# Evaluation

In [36]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [37]:
def evaluation_report(y_test, pred, is_return=True):
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
        acc = accuracy_score(y_test, pred) 
        pre=precision_score(y_test, pred)
        recall = recall_score(y_test, pred)
        f1 = f1_score(y_test, pred)
        auc = roc_auc_score(y_test, pred)
        for name, value in zip(['accuracy_score', 'precision_score', 'recall_score', 'f1_score', 'roc_auc_score'], [acc, pre, recall, f1, auc]):
            print('{name} = {value:.2f}'.format(name=name, value=value), end='\t')

#Logistic Regression

In [119]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter = 200, n_jobs=-1)
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)

In [120]:
evaluation_report(y_test, pred)

accuracy_score = 0.89	precision_score = 0.88	recall_score = 0.89	f1_score = 0.89	roc_auc_score = 0.89	

In [121]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.89      0.88      0.89      7474
           1       0.88      0.89      0.89      7526

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000



#Hyperparameter Tuning

In [127]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(max_iter=200)
params = {'penalty': ['l2', 'l1'],
       'C': [0.001, 0.01, 0.1, 1, 5, 10]}

grid_lr_clf = GridSearchCV(lr_clf, 
                           param_grid=params, verbose=1, scoring='accuracy', n_jobs=-1, cv = 3)

grid_lr_clf.fit(X_train_tfidf_vect, y_train) 

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   16.3s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=200, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 5, 10],
                         'penalty': ['l2', 'l1']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=1)

In [128]:
grid_lr_clf.best_estimator_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [129]:
pred = grid_lr_clf.predict(X_test_tfidf_vect)
evaluation_report(y_test, pred)

accuracy_score = 0.89	precision_score = 0.88	recall_score = 0.89	f1_score = 0.89	roc_auc_score = 0.89	

In [131]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.89      0.88      0.89      7474
           1       0.88      0.89      0.89      7526

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000

