## 4.1.4 Logistic Regression Example with TF-IDF

### TF-IDF Feature Example

In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
# --- Notebook configuration -------------------------------------------------
# Directories (relative to the notebook) for input data and generated output.
DATA_IN_PATH = "./data_in/"
DATA_OUT_PATH = "./data_out/"

# File name of the pre-cleaned training reviews inside DATA_IN_PATH.
TRAIN_CLEAN_DATA = "train_clean.csv"

# Seed and hold-out fraction used for the train/eval split.
RANDOM_SEED = 42
TEST_SPLIT = 0.2

In [4]:
# Read the cleaned training reviews and preview the first rows.
train_csv_path = DATA_IN_PATH + TRAIN_CLEAN_DATA
train_data = pd.read_csv(train_csv_path)
train_data.head()

Unnamed: 0,review,sentiment
0,stuff going moment mj started listening music ...,1
1,classic war worlds timothy hines entertaining ...,1
2,film starts manager nicholas bell giving welco...,0
3,must assumed praised film greatest filmed oper...,0
4,superbly trashy wondrously unpretentious explo...,1


In [5]:
# Pull the text and label columns out of the frame as plain Python lists.
reviews = train_data['review'].tolist()
sentiments = train_data['sentiment'].tolist()

In [6]:
# TfidfVectorizer options used below:
#   min_df       : float in [0.0, 1.0] or int (default=1). The vocabulary can be
#                  restricted by how often a token occurs: tokens whose frequency
#                  exceeds max_df or falls below min_df are ignored.
#   sublinear_tf : replaces tf with 1 + log(tf) so that term frequency does not
#                  grow without bound.
#   ngram_range  : (min_n, max_n) tuple; the n-gram size determines the size of
#                  the tokens used to build the vocabulary.
#   analyzer     : "char" builds the n-grams from characters rather than words.
#   max_features : caps the vocabulary at 5000 entries.
vectorizer = TfidfVectorizer(
    min_df=0.0,
    analyzer="char",
    sublinear_tf=True,
    ngram_range=(1, 3),
    max_features=5000,
)

# NOTE(review): the vectorizer is fitted on every review in `reviews`; any
# train/eval split made afterwards will share vocabulary/IDF statistics.
X = vectorizer.fit_transform(reviews)
y = np.array(sentiments)

In [6]:
# Display the TF-IDF matrix summary (shape, dtype, number of stored elements).
X

<25000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 17862871 stored elements in Compressed Sparse Row format>

In [7]:
# Vocabulary terms, in the column order of X.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` when the installed version provides it and
# fall back to the legacy method otherwise. Both branches yield a plain list.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()

In [7]:
# Hold out TEST_SPLIT of the rows for evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)

In [9]:
# Print the training matrix in sparse form: one "(row, column)  value" line per
# stored entry.
print(X_train)

  (0, 2455)	0.06978011467494992
  (0, 3290)	0.11831336053546065
  (0, 3447)	0.11180387486782767
  (0, 3446)	0.1037162053791878
  (0, 4804)	0.06143571244583702
  (0, 3431)	0.0603609514255221
  (0, 4340)	0.06681896363777874
  (0, 820)	0.042851785542498216
  (0, 3272)	0.06674513801040942
  (0, 1302)	0.05841662267017666
  (0, 4487)	0.053230369826226004
  (0, 4317)	0.06504555308895356
  (0, 2300)	0.05139493032860254
  (0, 4709)	0.04874449992816423
  (0, 562)	0.05791472561909676
  (0, 1040)	0.03872461261618889
  (0, 4559)	0.0450725268219528
  (0, 865)	0.0428928615562273
  (0, 493)	0.03895721264061005
  (0, 722)	0.05271124205527762
  (0, 705)	0.04783935547756013
  (0, 2675)	0.079523315787901
  (0, 1131)	0.080183326599645
  (0, 3233)	0.03605932115372919
  (0, 2286)	0.03752971048688696
  :	:
  (19999, 4766)	0.03416403225368242
  (19999, 4562)	0.03467417608746243
  (19999, 3402)	0.05564971657621524
  (19999, 614)	0.03067382979080541
  (19999, 2277)	0.035212326418526056
  (19999, 4949)	0.04506654

In [10]:
# Train a logistic-regression classifier with the library's default
# hyperparameters on the training split. `fit(...)` is left as the cell's last
# expression so the notebook displays the fitted estimator's settings.
lgs = LogisticRegression() 
lgs.fit(X_train, y_train) 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [10]:
# Predicted labels for the held-out evaluation rows.
# NOTE(review): `predicted` is not referenced again in the visible cells.
predicted = lgs.predict(X_eval)

In [11]:
# Score the fitted model on the held-out evaluation split and report it.
eval_accuracy = lgs.score(X_eval, y_eval)
print("Accuracy: %f" % eval_accuracy)

Accuracy: 0.859600


In [12]:
# File name of the pre-cleaned test reviews inside DATA_IN_PATH.
TEST_CLEAN_DATA = "test_clean.csv"

# Read the cleaned test reviews.
test_csv_path = DATA_IN_PATH + TEST_CLEAN_DATA
test_data = pd.read_csv(test_csv_path)

In [13]:
# Vectorize the test reviews with the already-fitted vectorizer (transform only,
# no refitting) so the columns match the features the model was trained on.
testDataVecs = vectorizer.transform(test_data['review'])

In [14]:
# Predict a sentiment label for every test review and print the label array.
test_predicted = lgs.predict(testDataVecs)
print(test_predicted)

[1 0 1 ... 0 1 0]


In [15]:
# Make sure the output directory exists. `exist_ok=True` replaces the separate
# existence check, which could race with another process creating the directory.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Pair each test id with its predicted sentiment and write the submission file.
answer_dataset = pd.DataFrame({'id': test_data['id'], 'sentiment': test_predicted})
answer_dataset.to_csv(
    os.path.join(DATA_OUT_PATH, 'lgs_tfidf_answer.csv'),
    index=False,   # do not write the DataFrame index as a column
    quoting=3,     # 3 == csv.QUOTE_NONE: write fields without adding quotes
)