### Dataset

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('../dataset/news_group/train.csv')
train.head()

Unnamed: 0,id,text,target
0,0,"\nThey were, and even if Washington might cons...",10
1,1,"We run ""SpaceNews & Views"" on our STAREACH BBS...",14
2,2,\n\n\nNot to worry. The Masons have been demo...,19
3,3,"Only Brendan McKay, or maybe ARF, would come t...",17
4,4,Help: I am running some sample problems from O...,5


In [3]:
train.shape

(9233, 3)

### Data Split

In [4]:
X = train.text # train에서 문서 추출
y = train.target # train에서 라벨 추출

In [5]:
X.head()

0    \nThey were, and even if Washington might cons...
1    We run "SpaceNews & Views" on our STAREACH BBS...
2    \n\n\nNot to worry.  The Masons have been demo...
3    Only Brendan McKay, or maybe ARF, would come t...
4    Help: I am running some sample problems from O...
Name: text, dtype: object

In [6]:
y.head()

0    10
1    14
2    19
3    17
4     5
Name: target, dtype: int64

### Vectorizer

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

In [8]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X) # countvectorizer 학습
X = vectorizer.transform(X) # transform

In [9]:
vectorizer.inverse_transform(X[0]) # 역변환하여 첫 번째 문장의 단어들 확인.

[array(['were', 'washington', 'utter', 'trade', 'they', 'that', 'rework',
        'patty', 'only', 'minute', 'might', 'in', 'if', 'here', 'has',
        'goals', 'even', 'druce', 'consider', 'complete', 'bust', 'been',
        'and'], dtype='<U81')]

### Modeling

In [10]:
# 학습
model = Pipeline([('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                 ])

model.fit(X, y)

Pipeline(steps=[('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [11]:
# 추론
from sklearn.metrics import accuracy_score

y_pred = model.predict(X[0])
print('예측 라벨 :', y_pred)
print('실제 라벨 :', train.target[0])

예측 라벨 : [10]
실제 라벨 : 10


### Submission

In [12]:
test = pd.read_csv('../dataset/news_group/test.csv')
test.head()

Unnamed: 0,id,text
0,0,\nThe VL-IDE Adapter can be much faster then t...
1,1,\n\nYeah. In a fire that reportedly burned ho...
2,2,":Judge: ""I grant you immunity from whatever ma..."
3,3,"I, too, put a corbin seat on my Hawk. I got t..."
4,4,\n\nDo I ever!!!!!! After 2 years of having h...


In [13]:
test_X = test.text # 문서 데이터 생성
test_X_vect = vectorizer.transform(test_X) # 문서 데이터 transform

In [14]:
pred = model.predict(test_X_vect)
print(pred)

[ 3 16 11 ...  4  1 12]


In [15]:
submission = pd.read_csv('../dataset/news_group/sample_submission.csv')
submission.head()

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [16]:
submission['target'] = pred
submission.head()

Unnamed: 0,id,target
0,0,3
1,1,16
2,2,11
3,3,8
4,4,13


In [17]:
submission.to_csv('7.TfidVectorizer_svm.csv', index=False)