## 4.1.3 Logistic Regression Example 1

### TF-IDF Feature Example

In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [3]:
# --- Notebook configuration -------------------------------------------------
# TODO1: point this at your local data_in directory.
DEFAULT_PATH = "~/.kaggle/competitions/word2vec-nlp-tutorial/"
# Directory that receives the prediction file written at the end of the notebook.
DATA_OUT_PATH = "./data_out/"

# Seed passed to train_test_split so the split is repeatable.
RANDOM_SEED = 42
# Fraction of the labeled samples held out for evaluation.
TEST_SPLIT = 0.2

In [4]:
# Load the labeled training reviews: tab-separated, column names on the first row.
# quoting=3 corresponds to csv.QUOTE_NONE, so quote characters stay literal text.
train_path = DEFAULT_PATH + "labeledTrainData.tsv"
train = pd.read_csv(train_path, sep="\t", header=0, quoting=3)

In [5]:
# Pull the review texts and their sentiment labels out as plain Python lists.
reviews = train['review'].tolist()
sentiments = train['sentiment'].tolist()

In [6]:
# Character-level TF-IDF over 1- to 3-character n-grams with sublinear TF
# scaling; the vocabulary is capped at 5000 features.
tfidf_options = dict(
    min_df=0.0,
    analyzer="char",
    sublinear_tf=True,
    ngram_range=(1, 3),
    max_features=5000,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit the vectorizer on the reviews and encode them in one pass; labels become
# a NumPy array.
X = vectorizer.fit_transform(reviews)
y = np.array(sentiments)

In [7]:
# Names of the n-gram features learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when it exists and fall back on older releases.
# The result is converted to a list so `features` has the same type either way.
if hasattr(vectorizer, "get_feature_names_out"):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

In [8]:
# Sanity check: dimensions of the TF-IDF matrix as (rows, columns).
n_rows, n_cols = X.shape
print((n_rows, n_cols))

(25000, 5000)


In [9]:
# Hold out TEST_SPLIT of the samples for evaluation; the fixed seed keeps the
# split repeatable across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Logistic-regression classifier with balanced class weighting, trained on the
# training portion. The fit call is the cell's last expression so the fitted
# estimator is displayed.
lgs = LogisticRegression(class_weight="balanced")
lgs.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [11]:
# Hard class predictions for the held-out samples (used by the metrics below).
predicted = lgs.predict(X=X_test)

In [12]:
# ROC curve scored with column 1 of predict_proba (the second class in
# lgs.classes_); the returned thresholds are not needed and are discarded.
positive_proba = lgs.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, positive_proba)

In [13]:
# Area under the ROC curve computed from the (fpr, tpr) points.
auc = metrics.auc(x=fpr, y=tpr)

In [14]:
# Evaluation summary on the held-out split: each entry pairs a print template
# with the metric value it reports.
metric_report = [
    ("Accuracy: %f", lgs.score(X_test, y_test)),
    ("Precision: %f", metrics.precision_score(y_test, predicted)),
    ("Recall: %f", metrics.recall_score(y_test, predicted)),
    ("F1-Score: %f", metrics.f1_score(y_test, predicted)),
    ("AUC: %f", auc),
]
for template, value in metric_report:
    print(template % value)

Accuracy: 0.869800
Precision: 0.863707
Recall: 0.880508
F1-Score: 0.872027
AUC: 0.944510


In [15]:
# Load the unlabeled test reviews with the same parsing options as the training file.
test_path = DEFAULT_PATH + "testData.tsv"
test = pd.read_csv(test_path, sep="\t", header=0, quoting=3)

In [16]:
# Encode the test reviews with the already-fitted vectorizer (transform only,
# no re-fitting).
test_reviews = test['review']
testDataVecs = vectorizer.transform(test_reviews)

In [17]:
# Predicted sentiment label for every test review.
test_predicted = lgs.predict(X=testDataVecs)

In [18]:
# Make sure the output directory exists. exist_ok=True replaces the separate
# exists() check, so there is no gap between checking and creating.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Pair each test id with its predicted sentiment and write the result as CSV.
# quoting=3 (csv.QUOTE_NONE) writes the values without adding quote characters.
answer_dataset = pd.DataFrame({'id': test['id'], 'sentiment': test_predicted})
# os.path.join keeps the path valid even if DATA_OUT_PATH has no trailing slash.
answer_path = os.path.join(DATA_OUT_PATH, 'lgs_tfidf_answer.csv')
answer_dataset.to_csv(answer_path, index=False, quoting=3)