In [1]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import numpy
import pandas as pd
import re
import os
import jieba
import jieba.posseg as pseg
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

In [2]:
# Read the train and test splits with identical parser settings
# (comma-separated, UTF-8) so both frames are parsed the same way.
df_train, df_test = (
    pd.read_csv(csv_path, sep=',', encoding='utf-8')
    for csv_path in ('./kaggle_train_dataset.csv', './kaggle_test_dataset.csv')
)

In [3]:
# Replace missing bodies and titles with the "NoName" placeholder so the
# string concatenation below never produces NaN.
df_train[["content", "title"]] = df_train[["content", "title"]].fillna("NoName")
df_test[["content", "title"]] = df_test[["content", "title"]].fillna("NoName")

# "text" is the title immediately followed by the content (no separator).
df_train["text"] = df_train["title"] + df_train["content"]
df_test["text"] = df_test["title"] + df_test["content"]

In [4]:
# Point jieba at the dictionary file before any segmentation is performed.
jieba.set_dictionary("./dict.txt.big")

# Segment every document with cut_all=False and re-join the tokens with
# single spaces, overwriting the "text" column of each frame.
df_train["text"] = [
    " ".join(jieba.cut(document, cut_all=False))
    for document in df_train["text"].astype(str)
]
df_test["text"] = [
    " ".join(jieba.cut(document, cut_all=False))
    for document in df_test["text"].astype(str)
]
df_train.head(1)

Building prefix dict from C:\Users\charl\NLP\dict.txt.big ...
Loading model from cache C:\Users\charl\AppData\Local\Temp\jieba.u84a06abc7f9e19dbe3099582e43bc074.cache
Loading model cost 1.431 seconds.
Prefix dict has been built successfully.


Unnamed: 0,label,title,content,text
0,informative,本報特約----宏觀縱覽/溫家寶：當前重要的是促進投資合理增長,7月9日上午和10日上午，大陸國務院總理溫家寶先後主持召開兩次經濟形勢座談會，聽取專家和企業...,本報 特約 ---- 宏觀 縱覽 / 溫家寶 ： 當前 重要 的 是 促進 投資 合理 增長...


In [5]:
# Load the stopword file (one token per line).
#
# The file has no header row: with pandas' default header inference the first
# stopword ("是") was consumed as the column name and never loaded as data.
# Reading with header=None and naming the column "是" keeps the existing
# stopwords["是"] access working while treating every line as a stopword.
#   quoting=3 (csv.QUOTE_NONE): quote characters are read as literal tokens
#     instead of opening a quoted field.
#   keep_default_na=False / dtype=str: tokens such as "NA" or "null" stay strings.
stopwords = pd.read_table(
    "./stopwords.txt",
    header=None,
    names=["是"],
    dtype=str,
    quoting=3,
    keep_default_na=False,
)
lst_stopword = list(stopwords["是"])

In [6]:
def utils_preprocess_text(text, lst_stopwords=None):
    """Keep only CJK ideographs from a whitespace-segmented string and drop stopwords.

    Parameters
    ----------
    text : object
        Input to clean; it is converted with ``str()`` and stripped first.
    lst_stopwords : iterable of str, optional
        Tokens to remove from the result. When ``None`` (the default) no
        stopword filtering is applied.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    # Every character outside U+4E00..U+9FA5 becomes a space, so digits,
    # Latin letters and punctuation act as token separators.
    text = re.sub(r'[^\u4e00-\u9fa5]', " ", str(text).strip())
    tokens = text.split()

    if lst_stopwords is not None:
        # Filter against the argument that was passed in, not a notebook-level
        # global; a set gives constant-time membership checks.
        if isinstance(lst_stopwords, (set, frozenset)):
            stopword_set = lst_stopwords
        else:
            stopword_set = set(lst_stopwords)
        tokens = [word for word in tokens if word not in stopword_set]

    return " ".join(tokens)

In [7]:
# Clean the segmented text of both splits with the same stopword list;
# the keyword argument is forwarded to utils_preprocess_text by Series.apply.
df_train["text_clean"] = df_train["text"].apply(
    utils_preprocess_text, lst_stopwords=lst_stopword
)
df_test["text_clean"] = df_test["text"].apply(
    utils_preprocess_text, lst_stopwords=lst_stopword
)
df_train.head(1)

Unnamed: 0,label,title,content,text,text_clean
0,informative,本報特約----宏觀縱覽/溫家寶：當前重要的是促進投資合理增長,7月9日上午和10日上午，大陸國務院總理溫家寶先後主持召開兩次經濟形勢座談會，聽取專家和企業...,本報 特約 ---- 宏觀 縱覽 / 溫家寶 ： 當前 重要 的 是 促進 投資 合理 增長...,特約 宏觀 縱覽 溫家寶 重要 促進 投資 合理 增長 上午 上午 大陸 國務院 總理 溫家...


In [8]:
# Pull the cleaned text and the labels out of the frames as arrays:
# training documents and their labels, then the unlabeled test documents.
train_features, train_label = df_train.text_clean.values, df_train.label.values
TEST_features = df_test.text_clean.values

In [None]:
x_train = train_features
y_train = train_label
x_TEST = TEST_features

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer

# ExtraTrees draws random splits; a fixed seed makes the fitted model and the
# resulting submission reproducible across kernel restarts.
RANDOM_STATE = 42

classifier = ExtraTreesClassifier(
    n_estimators=200,
    class_weight='balanced_subsample',
    random_state=RANDOM_STATE,
)
# max_df=0.95 ignores terms present in more than 95% of the training documents;
# max_features=12000 caps the vocabulary at the 12000 most frequent terms.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of two
# or more characters, so single-character words never reach the model - confirm
# this is intended.
count_vectorizer = CountVectorizer(max_df=0.95, max_features=12000)

# Bag-of-words term counts. These are raw counts, not TF-IDF weights; the
# "_tfidf" suffix is kept only so the variable names stay unchanged.
x_train_tfidf = count_vectorizer.fit_transform(x_train)
x_TEST_tfidf = count_vectorizer.transform(x_TEST)
classifier.fit(x_train_tfidf, y_train)

# Make predictions on the test set.
TEST_predict_result = classifier.predict(x_TEST_tfidf)

# Map label strings to the numeric codes required by the Kaggle submission format.
str_encode_num = {'informative':0, 'happy':1, 'angry':2, 'depressing':3, 'odd':4, 'boring':5, 'warm':6,'worried':7}

# Encode every prediction before opening the file, so an unexpected label
# raises KeyError without truncating or partially writing the submission.
encoded_predictions = [str_encode_num[label] for label in TEST_predict_result]

# Write the submission: a header row, then one "<row index>,<label code>" line per test row.
with open("./kaggle_submission.csv", "w", encoding="utf-8") as f:
    f.write('Id,Label\n')
    f.writelines(f'{idx},{code}\n' for idx, code in enumerate(encoded_predictions))