In [4]:
import pickle as pkl
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import Normalizer
from sklearn.metrics import precision_recall_fscore_support,precision_score,recall_score,f1_score
from sklearn.model_selection import StratifiedKFold
import os

In [7]:
# Raw data directory and the file name of the training CSV.
path = 'E:/TextClassification/data_raw'
name = 'train_set.csv'

# Read the train and test splits from the same directory (train first);
# row 0 of each file is used as the header.
train_data, test_data = (
    pd.read_csv(os.path.join(path, csv_file), header=0)
    for csv_file in (name, 'test_set.csv')
)

In [8]:
# TF-IDF features over 1- and 2-grams. Terms that occur in fewer than 3
# documents or in more than 90% of the documents are dropped.
# The on/off options are passed as real booleans: recent scikit-learn releases
# validate parameter types and reject integer stand-ins such as `use_idf=1`.
vector = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
# Fit the vocabulary and IDF weights on the training text only, then apply the
# same fitted transform to the test text.
train_tfidf = vector.fit_transform(train_data['word_seg'])
test_tfidf = vector.transform(test_data['word_seg'])

In [10]:
# Shift the class ids down by one and cast to int; `common` adds the one back
# when the final class is written out.
label = train_data["class"].sub(1).astype(int)

# LinearSVC

In [15]:
from sklearn.svm import LinearSVC

# Stratified 10-fold splitter; `cross_validation` further down reuses it.
k_fold = StratifiedKFold(n_splits=10)

# One column of test-set predictions per fold model.
results_average = pd.DataFrame()

for i, (train_index, dev_index) in enumerate(k_fold.split(train_tfidf, label)):
    # split() yields positional row numbers, so the labels are selected with
    # .iloc instead of label-based [] lookup (the two only coincide while
    # `label` carries a default 0..n-1 index).
    fold_train = train_tfidf[train_index]
    fold_trlabel = label.iloc[train_index]
    fold_dev = train_tfidf[dev_index]
    fold_devlabel = label.iloc[dev_index]

    # A fresh classifier is trained for every fold.
    clf_0 = LinearSVC()
    clf_0.fit(fold_train, fold_trlabel)
    print(i)

    # Per-class precision / recall / F-score / support on the held-out fold.
    metric_dev = clf_0.predict(fold_dev)
    precision, recall, f_score, true_sum = precision_recall_fscore_support(fold_devlabel, metric_dev)
    print(precision, recall, f_score, true_sum)

    # Keep this fold model's predictions for the test set.
    results_average[i] = clf_0.predict(test_tfidf)
    

0
[0.67467249 0.78983051 0.89223638 0.86666667 0.79148936 0.93883792
 0.74204947 0.71428571 0.92875648 0.73195876 0.67039106 0.71278826
 0.71524664 0.7602649  0.90703851 0.79844961 0.74817518 0.85444744
 0.61594203] [0.57434944 0.80068729 0.92548077 0.84856397 0.78481013 0.89114659
 0.69078947 0.77363897 0.93359375 0.71428571 0.67039106 0.63789869
 0.80657396 0.85163205 0.90824468 0.63975155 0.66129032 0.89674682
 0.61482821] [0.62048193 0.79522184 0.90855457 0.85751979 0.78813559 0.91437081
 0.71550256 0.74277854 0.93116883 0.72301426 0.67039106 0.67326733
 0.75816993 0.80335899 0.9076412  0.71034483 0.70205479 0.87508627
 0.61538462] [538 291 832 383 237 689 304 698 768 497 358 533 791 674 752 322 310 707
 553]
1
[0.67324561 0.80144404 0.89638554 0.87894737 0.88262911 0.94610778
 0.73285199 0.68882979 0.9284802  0.708      0.70108696 0.71862348
 0.72008782 0.76998597 0.91983696 0.7238806  0.72239748 0.8604336
 0.61121157] [0.57063197 0.76551724 0.89423077 0.87206266 0.79324895 0.9172

In [18]:
# Preview up to the first 100 rows of the per-fold test predictions.
results_average.iloc[:100]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,4,4,4,4,4,4,4,4,4,4
1,3,3,3,3,3,3,3,3,3,3
2,12,12,12,12,12,12,12,12,12,12
3,3,3,3,3,3,3,3,3,3,3
4,4,4,4,4,4,4,4,4,4,4
5,4,4,4,4,4,4,4,4,4,4
6,14,14,14,14,14,14,14,14,14,14
7,18,18,18,18,18,18,18,18,18,18
8,2,2,2,2,2,2,2,2,2,2
9,11,11,11,11,11,11,11,11,11,11


In [42]:
from collections import Counter
# cnt = Counter(results_average.iloc[21])
# print(cnt.most_common(1)[0][0])
def common(li):
    """Return the most frequent value in ``li`` plus one, as a Python int.

    When several values share the highest count, the one encountered first
    in ``li`` is used.
    """
    votes = Counter(li)
    winner, _count = votes.most_common(1)[0]
    return int(winner + 1)

# Majority vote per test row over the first ten columns (the ten fold models);
# `common` also adds one to the winning value.
results_average['all_for_one'] = (
    results_average
    .iloc[:, :10]
    .apply(common, axis=1)
)
results_average.iloc[:100]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,all_for_one
0,4,4,4,4,4,4,4,4,4,4,5
1,3,3,3,3,3,3,3,3,3,3,4
2,12,12,12,12,12,12,12,12,12,12,13
3,3,3,3,3,3,3,3,3,3,3,4
4,4,4,4,4,4,4,4,4,4,4,5
5,4,4,4,4,4,4,4,4,4,4,5
6,14,14,14,14,14,14,14,14,14,14,15
7,18,18,18,18,18,18,18,18,18,18,19
8,2,2,2,2,2,2,2,2,2,2,3
9,11,11,11,11,11,11,11,11,11,11,12


In [43]:
# Write only the voted column, under the header `class`, with the row index
# exported as the `id` column.
results_average.to_csv(
    'cv_svm.csv',
    columns=['all_for_one'],
    header=['class'],
    index=True,
    index_label='id',
)

In [44]:
def cross_validation(clf):
    """Run the shared K-fold loop with ``clf`` and collect test-set predictions.

    For each split produced by the module-level ``k_fold`` over
    ``train_tfidf`` / ``label``, ``clf`` is refit on the training part, its
    per-class precision, recall, F-score and support on the held-out part are
    printed, and its predictions for ``test_tfidf`` are stored.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same object is
        refit on every fold.

    Returns
    -------
    pandas.DataFrame
        One column per fold (named by the fold number) holding that fold
        model's predictions for ``test_tfidf``.
    """
    results_average = pd.DataFrame()
    for i, (train_index, dev_index) in enumerate(k_fold.split(train_tfidf, label)):
        # split() yields positional row numbers, hence .iloc for the labels.
        fold_train = train_tfidf[train_index]
        fold_trlabel = label.iloc[train_index]
        fold_dev = train_tfidf[dev_index]
        fold_devlabel = label.iloc[dev_index]

        clf.fit(fold_train, fold_trlabel)
        print(i)

        # Evaluate the estimator that was passed in. The earlier version
        # called the global `clf_0` here, so every classifier reported (and
        # returned) the LinearSVC results instead of its own.
        metric_dev = clf.predict(fold_dev)
        precision, recall, f_score, true_sum = precision_recall_fscore_support(fold_devlabel, metric_dev)
        print('precision:', precision)
        print('recall:', recall)
        print('f_score:', f_score)
        print('true_sum:', true_sum)

        # Test-set predictions of this fold's model.
        results_average[i] = clf.predict(test_tfidf)
    return results_average

# 梯度提升决策树

In [None]:
# Gradient boosting with 300 estimators, run through the shared
# cross-validation helper; show the first five rows of its test predictions.
GBC = GradientBoostingClassifier(n_estimators=300)
gbc_results = cross_validation(GBC)
gbc_results.head(5)

In [None]:
# Majority vote per test row over the first ten columns (the ten fold models);
# `common` also adds one to the winning value.
gbc_results['all_for_one'] = (
    gbc_results
    .iloc[:, :10]
    .apply(common, axis=1)
)
gbc_results.iloc[:100]

In [None]:
# Write only the voted column, under the header `class`, with the row index
# exported as the `id` column.
gbc_results.to_csv(
    'cv_gbc.csv',
    columns=['all_for_one'],
    header=['class'],
    index=True,
    index_label='id',
)

# 逻辑回归

# SVC

# 随机森林