### 3_Nature_language_Process_Text_Analysis_Classification

In [None]:
#  Copyright private in 2018 
#  Modify Date: 
#       2018 - 9 - 19
#  Purpose : 
#       Text analysis by fasttext/word2vec/Deep learning/LSTM
# ----------
import numpy as np
import matplotlib.pyplot as plt
import fasttext 
import pandas as pd 
import sklearn 

In [None]:
# hierarchical softmax - 类别数较多时，通过构建哈夫曼编码树来加速 softmax layer 的计算，与之前 word2vec 中的 trick 相同
# N-gram - 只使用 unigram 的话会丢掉 word order 信息

In [2]:
# 当文本量大时，可以通过fasttext来学习
# data Exampe: 
#__label__2,.......content ......
#__label__3,......content......
#__label__4,......content......

# Data Category 
# 1. car 2.sports 3.entertainment 4. technology 5. military

## (一). 通过Facebook 工业界fasttext 模型根据输入的新闻内容预测该新闻所属的种类

## 有监督学习 - 新闻分类/或者用于用户情感的褒贬分析

### 1. 生成文本数据

In [None]:
import jieba
import random

# Numeric fastText label assigned to each news category.
cate_dic = {'technology':1, 'car':2, 'entertainment':3, 'military':4, 'sports':5}

# Load one CSV per category and drop rows with missing values.
# The paths are raw strings: in a normal literal the "\t" in
# "C:\Python_\technologynews.csv" is a TAB character and the other backslash
# sequences are invalid escapes.
# df_technology
df_technology = pd.read_csv(r"C:\Python_\technologynews.csv", encoding='utf-8').dropna()

# df_car
df_car = pd.read_csv(r"C:\Python_\carnews.csv", encoding='utf-8').dropna()

# df_entertainment
df_entertainment = pd.read_csv(r"C:\Python_\entertainmentnews.csv", encoding='utf-8').dropna()

# df_sports (was misspelled `df_sprots`, which made the slice below raise NameError)
df_sports = pd.read_csv(r"C:\Python_\sportsnews.csv", encoding='utf-8').dropna()

# df_military
# NOTE(review): the file name contains a space ("military news.csv") unlike
# the other files -- confirm this matches the file on disk.
df_military = pd.read_csv(r"C:\Python_\military news.csv", encoding='utf-8').dropna()


# Take a fixed sample of documents from each category:
# .values -> ndarray, .tolist() -> Python list, [1000:21000] -> rows 1000..20999.
technology = df_technology.content.values.tolist()[1000:21000]
car        = df_car.content.values.tolist()[1000:21000]
entertainment = df_entertainment.content.values.tolist()[1000:21000]
military   = df_military.content.values.tolist()[1000:21000]
sports     = df_sports.content.values.tolist()[1000:21000]


### 2. Remove Stopwords

In [None]:
# Load the stop-word list: one word per line, no header.
# quoting=3 is csv.QUOTE_NONE, so quote characters are kept as literal text.
_stopword_frame = pd.read_csv(
    "data/stopwards.txt",
    index_col=False,
    quoting=3,
    sep="\t",
    names=['stopward'],
    encoding='utf-8',
)
# Keep the words in a set: the preprocessing functions test membership once
# per token, which is a linear scan on an ndarray but O(1) on a set.
stopwords = set(_stopword_frame['stopward'].values)


### 3. Data Text Preparation 

In [None]:
def preprocess_text(content_lines, sentences, category):
    """Tokenise documents and append fastText training lines to ``sentences``.

    Each document is segmented with ``jieba.lcut``; tokens of length <= 1 and
    tokens found in the module-level ``stopwords`` collection are dropped.
    One line of the form ``__label__<category> tok1 tok2 ...`` is appended to
    ``sentences`` per document.

    Args:
        content_lines: iterable of raw document strings.
        sentences: list that is extended in place (it is NOT rebound, so the
            caller sees the new lines).
        category: label value; converted with ``str()`` (the callers pass the
            numeric ids from ``cate_dic``).

    Returns:
        None. Documents that cannot be segmented are reported and skipped.
    """
    for line in content_lines:
        try:
            tokens = [
                token for token in jieba.lcut(line)
                if len(token) > 1 and token not in stopwords
            ]
        except Exception:
            # Best-effort: a malformed row (e.g. a non-string value) must not
            # abort the whole corpus, but it should be visible.
            print("skipping unparsable line: {!r}".format(line))
            continue
        # fastText splits on whitespace, so the label has to be its own token;
        # the previous "__label__N, " format glued the comma onto the label.
        sentences.append("__label__" + str(category) + " " + " ".join(tokens))
        

## Build the supervised training data
sentences = []

# (documents, category name) pairs, processed in this order.
labelled_corpora = [
    (technology, 'technology'),
    (car, 'car'),
    (entertainment, 'entertainment'),
    (military, 'military'),
    (sports, 'sports'),
]
for docs, category_name in labelled_corpora:
    preprocess_text(docs, sentences, cate_dic[category_name])

# Shuffle so that samples of one category are not clustered together.
random.shuffle(sentences)

### 4.写入文本操作

In [None]:
import io

print("writing data to fasttext format")
# io.open writes UTF-8 text on both Python 2 and 3, and the context manager
# flushes and closes the file before fastText reads it (the original handle
# was never closed).
with io.open('training_datasets.txt', 'w', encoding='utf-8') as openout:
    for sentence in sentences:
        openout.write(sentence + u"\n")  # one training example per line
print("Done....")

### 5.调用fasttext 训练生成模型

In [None]:
# Train the supervised model on the file written above; positional arguments
# are (input file, output model name).
# NOTE(review): `fasttext.supervised` is the fastText 0.8.x Python API --
# confirm the installed version still provides it.
classifier = fasttext.supervised(
    'training_datasets.txt',
    'classifier.model',
    label_prefix='__label__',
)

In [None]:
# Evaluate the classifier.
# NOTE(review): this scores the model on the file it was trained on, so the
# numbers are optimistic -- a held-out file would give a fairer estimate.
testresult = classifier.test('training_datasets.txt')

# Precision and recall (print() form is valid on Python 2 and 3).
print('testresult precision {}'.format(testresult.precision))
print('testresult recall {}'.format(testresult.recall))

print('Number of examples:  {}'.format(testresult.nexamples))


### 6.实际预测过程

In [None]:
# Map numeric label -> category name. The original dict was keyed by name,
# so looking it up with the predicted integer id raised KeyError.
label_to_cata_test = {1: 'technology', 2: 'car', 3: 'entertainment', 4: 'military', 5: 'sports'}

# Sample to classify (already segmented, tokens separated by spaces).
texts  = ['马来西亚 反贪 委员会 称 马来西亚 前总理 纳吉布 被逮捕 因其牵涉']
labels = classifier.predict(texts)

# Print the raw prediction and the category name of the top label.
print('labels is : {}'.format(labels))
# Tolerate a leftover prefix or trailing comma in the label text (models
# trained on "__label__N, ..." lines report the label as "N,").
top_label = labels[0][0].replace('__label__', '').strip(' ,')
print(label_to_cata_test[int(top_label)])


# Also report the probability attached to the predicted category.
labels = classifier.predict_proba(texts)
print(labels)

In [None]:
### Top-k predictions
# k = 5: return the five most likely labels per text.
# The fastText 0.8.x wrapper names this keyword `k` (lower case); `K` is
# rejected as an unexpected keyword argument.
category = classifier.predict(texts, k = 5)
print(category)

In [None]:
# Print the top five categories together with their probabilities.
# The fastText 0.8.x wrapper names this keyword `k` (lower case).
category = classifier.predict_proba(texts, k = 5)
print(category)

## (二). 通过fasttext做无监督文本学习

In [6]:
# Data Preparation 
def preprocess_text_unsupervised(content_lines, sentences, category=None):
    """Tokenise documents and append unlabeled, space-joined text to ``sentences``.

    Each document is segmented with ``jieba.lcut``; tokens of length <= 1 and
    tokens found in the module-level ``stopwords`` collection are dropped.

    Args:
        content_lines: iterable of raw document strings.
        sentences: list that is extended in place (it is NOT rebound, so the
            caller sees the new lines).
        category: unused; kept (now optional) so existing three-argument
            calls keep working.

    Returns:
        None. Documents that cannot be segmented are reported and skipped.
    """
    for line in content_lines:
        try:
            tokens = [
                token for token in jieba.lcut(line)
                if len(token) > 1 and token not in stopwords
            ]
        except Exception:
            # Best-effort: report the bad row and carry on with the rest.
            print("skipping unparsable line: {!r}".format(line))
            continue
        sentences.append(" ".join(tokens))
        

import io

## Build the unsupervised training data
sentences = []

# Unsupervised training needs plain text, so use the unlabeled preprocessor
# (the labelled `preprocess_text` was called here by mistake). The third
# argument is ignored by that function.
for docs in (technology, car, entertainment, military, sports):
    preprocess_text_unsupervised(docs, sentences, None)


# Write one document per line.
print("writing data to  fasttext unsupervised learning format ...")
# The context manager closes the file so fastText sees the complete data.
with io.open('unsupervised_trainingdatasets.txt', 'w', encoding='utf-8') as writeout:
    for sentence in sentences:
        writeout.write(sentence + u"\n")

print("write Done ...")



In [None]:
# Train word vectors with fastText on the unlabeled corpus.
# NOTE(review): both calls write to the same output name 'model', so the
# CBOW run overwrites the skip-gram files -- confirm that is intended.

# Skip-gram model
model = fasttext.skipgram('unsupervised_trainingdatasets.txt','model')
# Vocabulary learned by the model
print(model.words)

# CBOW model - continuous bag of words
model  = fasttext.cbow('unsupervised_trainingdatasets.txt','model')
print(model.words)  # list of words from dictionary

## (三). Gensim vs Fasttext

In [None]:
# Data Preparation 
def preprocess_text_unsupervised(content_lines, sentences, category=None):
    """Tokenise documents and append unlabeled, space-joined text to ``sentences``.

    Each document is segmented with ``jieba.lcut``; tokens of length <= 1 and
    tokens found in the module-level ``stopwords`` collection are dropped.

    Args:
        content_lines: iterable of raw document strings.
        sentences: list that is extended in place (it is NOT rebound, so the
            caller sees the new lines).
        category: unused; kept (now optional) so existing three-argument
            calls keep working.

    Returns:
        None. Documents that cannot be segmented are reported and skipped.
    """
    for line in content_lines:
        try:
            tokens = [
                token for token in jieba.lcut(line)
                if len(token) > 1 and token not in stopwords
            ]
        except Exception:
            # Best-effort: report the bad row and carry on with the rest.
            print("skipping unparsable line: {!r}".format(line))
            continue
        sentences.append(" ".join(tokens))
        

## Build the unsupervised training data
sentences = []

# No labels are needed here, so call the unlabeled preprocessor. The original
# called the three-argument `preprocess_text` with two arguments (TypeError).
# The third argument is ignored by `preprocess_text_unsupervised`.
for docs in (technology, car, entertainment, military, sports):
    preprocess_text_unsupervised(docs, sentences, None)

In [None]:
## Model fitting
# Word2Vec was used without ever being imported.
from gensim.models import Word2Vec

# gensim expects each sentence as a list of tokens; passing the space-joined
# strings directly would make it treat every character as a "word".
tokenized_sentences = [sentence.split() for sentence in sentences]

# NOTE(review): `size` is the gensim < 4.0 keyword (renamed `vector_size`
# in 4.0) -- confirm against the installed version.
model = Word2Vec(tokenized_sentences, size = 100, window = 5, min_count = 5, workers = 4)
# The path must be a string; the bare name raised NameError.
model.save('gensim_word2vec.model')
model.wv[u'信息']

## (四).文本分类by Deep Learning  

In [None]:
# Note: 
# 并不是将全部数据一次性加载到内存，而是一个 batch 一个 batch 地学习并更新权重来训练模型
# 

### 1.CNN 做文本分类

In [7]:
# used on short text processing 
# LSTM can be used to long text processing

# CNN 中的filter 窗口大小跟词向量的文本大小有关，每个词的窗口可能为 词向量的个数* 每个词向量的维数
# 窗口filter + pooling 池化 + fullconnection 全链接


### 1.1 Data preprocessing 

In [None]:
# Load the data and the stop-word list.
import pandas as pd

# Load one CSV per category and drop rows with missing values.
# The paths are raw strings: in a normal literal the "\t" in
# "C:\Python_\technologynews.csv" is a TAB character and the other backslash
# sequences are invalid escapes.
# df_technology
df_technology = pd.read_csv(r"C:\Python_\technologynews.csv", encoding='utf-8').dropna()

# df_car
df_car = pd.read_csv(r"C:\Python_\carnews.csv", encoding='utf-8').dropna()

# df_entertainment
df_entertainment = pd.read_csv(r"C:\Python_\entertainmentnews.csv", encoding='utf-8').dropna()

# df_sports (was misspelled `df_sprots`, which made the slice below raise NameError)
df_sports = pd.read_csv(r"C:\Python_\sportsnews.csv", encoding='utf-8').dropna()

# df_military
# NOTE(review): the file name contains a space ("military news.csv") unlike
# the other files -- confirm this matches the file on disk.
df_military = pd.read_csv(r"C:\Python_\military news.csv", encoding='utf-8').dropna()


# Take a fixed sample of documents from each category:
# .values -> ndarray, .tolist() -> Python list, [1000:21000] -> rows 1000..20999.
technology = df_technology.content.values.tolist()[1000:21000]
car        = df_car.content.values.tolist()[1000:21000]
entertainment = df_entertainment.content.values.tolist()[1000:21000]
military   = df_military.content.values.tolist()[1000:21000]
sports     = df_sports.content.values.tolist()[1000:21000]


### Stop words
# The column was declared as 'stopwards' but read back as 'stopwords'
# (KeyError); declare and read the same name.
_stopword_frame = pd.read_csv("data/stopwards.txt", index_col = False, quoting = 3, sep = "\t", names = ['stopword'], encoding = 'utf-8')
# A set gives O(1) membership tests in the per-token filter.
stopwords = set(_stopword_frame['stopword'].values)

In [None]:
### Construct Data 
# Data Preparation 
def preprocess_text_cnn(content_lines, sentences, category):
    """Tokenise documents and append ``(text, category)`` pairs to ``sentences``.

    Each document is segmented with ``jieba.lcut``; tokens of length <= 1 and
    tokens found in the module-level ``stopwords`` collection are dropped, and
    the remaining tokens are joined with single spaces.

    Args:
        content_lines: iterable of raw document strings.
        sentences: list that is extended in place (it is NOT rebound, so the
            caller sees the new pairs).
        category: label stored unchanged as the second element of each pair.

    Returns:
        None. Documents that cannot be segmented are reported and skipped.
    """
    for line in content_lines:
        try:
            tokens = [
                token for token in jieba.lcut(line)
                if len(token) > 1 and token not in stopwords
            ]
        except Exception:
            # Best-effort: report the bad row and carry on with the rest.
            print("skipping unparsable line: {!r}".format(line))
            continue
        # list.append takes exactly one argument, so the pair must be a tuple.
        sentences.append((" ".join(tokens), category))
        

## Build the training data
sentences = []

# (documents, category name) pairs, processed in this order; the name itself
# is stored as the label.
named_corpora = [
    (technology, 'technology'),
    (car, 'car'),
    (entertainment, 'entertainment'),
    (military, 'military'),
    (sports, 'sports'),
]
for docs, category_name in named_corpora:
    preprocess_text_cnn(docs, sentences, category_name)

In [None]:
### Split into training and test sets
from sklearn.model_selection import train_test_split

# Unzip the (text, label) pairs: texts go to x, labels to y.
x, y = zip(*sentences)

# Fixed random_state so the split is reproducible.
split_parts = train_test_split(x, y, random_state=200)
train_data, test_data, train_target, test_target = split_parts

### 1.2 构建神经网络过程 - 中文文本分类 on CPU

In [None]:
# for python2 need to import lib
# from __future__ import absolute_import
# from __future__ import division
# from __future__ import print_function 

import argparse
import sys

import numpy as np  # was `import np`, which is not a module
import pandas as pd
import tensorflow as tf
from sklearn import metrics  # was misspelled `sklrearn`

# Shorthand for the TF1 contrib estimator/preprocessing helpers used below.
learn  = tf.contrib.learn


In [11]:
# Hyper-parameters for the CNN text classifier.

FLAGS = None

# Vocabulary size; starts at 0 and is overwritten once the vocabulary is built.
n_words = 0

# --- text / embedding ---
MAX_DOCUMENT_LENGTH = 100   # maximum document length
MIN_WORD_FREQUENCY = 2      # minimum word frequency
EMBEDDING_SIZE = 20         # embedding dimension

# --- convolution ---
N_FILTERS = 10              # number of filters
WINDOWS_SIZE = 20           # window size
FILTER_SHAPE1 = [WINDOWS_SIZE, EMBEDDING_SIZE]   # filter shape, first conv layer
FILTER_SHAPE2 = [WINDOWS_SIZE, N_FILTERS]        # filter shape, second conv layer

# --- pooling ---
POOLING_WINDOW = 4
POOLING_STRIDE = 2

In [None]:
# define CNN Model 卷积神经网络
def cnn_model(features, target):
    """Two-layer CNN ``model_fn`` for short-text classification.

    Word ids are embedded, passed through two convolution layers (with max
    pooling after the first) and a linear output layer; training uses softmax
    cross-entropy with the Adam optimizer.

    Args:
        features: word-id tensor produced by the vocabulary processor
            (presumably [batch_size, sequence_length] -- confirm with caller).
        target: integer class ids; one-hot encoded to 15 classes here.

    Returns:
        Tuple ``(predictions, loss, train_op)`` where ``predictions`` is a dict
        with keys ``'class'`` (argmax of the logits) and ``'prob'`` (softmax
        of the logits).
    """
    # Width of the one-hot target and of the output layer.
    n_classes = 15

    # One-hot encode the labels (on value 1, off value 0).
    target = tf.one_hot(target, n_classes, 1, 0)

    # Map each word id to an EMBEDDING_SIZE-dimensional vector; the vocabulary
    # size is read from the module-level `n_words`.
    word_vectors = tf.contrib.layers.embed_sequence(
        features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE, scope='words')

    # Append a trailing axis at index 3 so the 2-D convolution gets a
    # channel dimension.
    word_vectors = tf.expand_dims(word_vectors, 3)

    with tf.variable_scope('CNN_Layer1'):
        # First 2-D convolution.
        conv1 = tf.contrib.layers.convolution2d(
            word_vectors, N_FILTERS, FILTER_SHAPE1, padding='VALID')
        # ReLU non-linearity.
        conv1 = tf.nn.relu(conv1)

        # Max pooling along axis 1.
        pool1 = tf.nn.max_pool(
            conv1,
            ksize=[1, POOLING_WINDOW, 1, 1],
            strides=[1, POOLING_STRIDE, 1, 1],
            padding='SAME')

        # Swap the last two axes so the shape fits the second convolution.
        pool1 = tf.transpose(pool1, [0, 1, 3, 2])

    # Fixed typo: `tf.variabel_scope` -> `tf.variable_scope`.
    with tf.variable_scope('CNN_Layer2'):
        # Second convolution.
        conv2 = tf.contrib.layers.convolution2d(
            pool1, N_FILTERS, FILTER_SHAPE2, padding='VALID')

        # Max over axis 1, then drop the remaining size-1 axis.
        pool2 = tf.squeeze(tf.reduce_max(conv2, 1), squeeze_dims=[1])

    # Fully connected output layer without activation -> logits.
    # Fixed typo: `tf.contrib.layer` -> `tf.contrib.layers`.
    logits = tf.contrib.layers.fully_connected(pool2, n_classes, activation_fn=None)
    # target: ground truth, logits: predictions.
    loss = tf.losses.softmax_cross_entropy(target, logits)

    # Training step.
    # Fixed typos: `framwork.get_gloabl_step` -> `framework.get_global_step`.
    train_op = tf.contrib.layers.optimize_loss(
        loss,
        tf.contrib.framework.get_global_step(),
        optimizer='Adam',
        learning_rate=0.01)

    return ({
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits)
    }, loss, train_op)

        
        

### 1.3 Tensorflow.preprocessing 里的VocabularyProcessor

In [None]:
# Small demo of VocabularyProcessor on four toy sentences.
temp  = ['I am good', 'You are  here ','I am glad', 'it is great']

# Documents are limited to 10 word ids; min_frequency is set to 1.
vocab_processor = learn.preprocessing.VocabularyProcessor(
    10,
    min_frequency=1,
)
list(vocab_processor.fit_transform(temp))

# I am good -> [1,2,0,0,......] length = 10

In [None]:
# Build the vocabulary and convert the documents to fixed-length id sequences.
# (A module-level `global` statement has no effect, so it was dropped.)
vocab_processor  = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH,min_frequency = 1)
x_train = np.array(list(vocab_processor.fit_transform(train_data)))
# Only transform the test set: the vocabulary must come from the training
# data alone; the original re-fitted the processor on the test data.
x_test = np.array(list(vocab_processor.transform(test_data)))

# Vocabulary size, read by the model functions for the embedding layer.
n_words = len(vocab_processor.vocabulary_)
# The original `print('...', %n_words)` was a syntax error.
print('Total words: %d' % n_words)


In [None]:
#-------------
# Map category names to numeric ids.
cate_dic = {'technology':1, 'car':2, 'entertainment':3, 'military':4, 'sports':5}

# Build the id series directly from the name lists. The name lists are left
# untouched so this cell can be re-run (mapping already-mapped ids would
# raise KeyError). Fixes: `pandas` -> `pd` (the imported alias) and the
# undefined `train_test` -> `test_target`.
y_train = pd.Series([cate_dic[label] for label in train_target])
y_test  = pd.Series([cate_dic[label] for label in test_target])


In [None]:
# Build the model: wrap the CNN model_fn in a scikit-learn style estimator.
cnn_estimator = learn.Estimator(model_fn=cnn_model)
classifier = learn.SKCompat(cnn_estimator)

# Train for 1000 steps, then predict the classes of the test set.
classifier.fit(x_train, y_train, steps=1000)
cnn_predictions = classifier.predict(x_test)
y_predicted = cnn_predictions['class']

# Accuracy on the test set.
score = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy:{0:f}'.format(score))


### 1.4 LSTM/GRU - RNN 循环神经网络

In [None]:
# 使用RNN完成文本分类
# for python2 need to import lib
# from __future__ import absolute_import
# from __future__ import division
# from __future__ import print_function 

import argparse
import sys

import numpy as np  # was `import np`, which is not a module
import pandas as pd
import tensorflow as tf
from sklearn import metrics  # was misspelled `sklrearn`
from tensorflow.contrib.layers.python.layers import encoders

# Was `tf.contrib,learn` (comma), which builds a tuple instead of referencing
# the contrib.learn module.
learn  = tf.contrib.learn

FLAGS = None


In [None]:
# Bag-of-words model: settings and vocabulary.
# The constant names were misspelled (MAX_DOCUMENT_LENGTN, MIN_WORD_FREQUENCE),
# so the values set here were never used. NOTE: these assignments override the
# CNN settings of the same name defined earlier in the notebook.
MAX_DOCUMENT_LENGTH = 15
MIN_WORD_FREQUENCY  = 1
EMBEDDING_SIZE = 50

# Build the vocabulary and convert the documents to fixed-length id sequences.
# (A module-level `global` statement has no effect, so it was dropped.)
vocab_processor  = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH,min_frequency = MIN_WORD_FREQUENCY)
x_train = np.array(list(vocab_processor.fit_transform(train_data)))
# Only transform the test set: the vocabulary must come from the training
# data alone; the original re-fitted the processor on the test data.
x_test = np.array(list(vocab_processor.transform(test_data)))

# Vocabulary size, read by the model function for the embedding layer.
n_words = len(vocab_processor.vocabulary_)
# The original `print('...', %n_words)` was a syntax error.
print('Total words: %d' % n_words)

def bag_of_words_model(feature,target):
    """Bag-of-words ``model_fn``: encoded text -> linear layer -> softmax.

    Args:
        feature: word-id tensor produced by the vocabulary processor
            (presumably [batch_size, sequence_length] -- confirm with caller).
        target: integer class ids; one-hot encoded to 15 classes here.

    Returns:
        Tuple ``(predictions, loss, train_op)`` where ``predictions`` is a dict
        with keys ``'class'`` (argmax of the logits) and ``'prob'`` (softmax
        of the logits).
    """
    # Width of the one-hot target and of the output layer.
    n_classes = 15

    # One-hot encode the labels (on value 1, off value 0).
    target = tf.one_hot(target, n_classes, 1, 0)
    # Encode the word ids with the bag-of-words encoder. The original read an
    # undefined name `features`; the parameter is `feature`.
    features = encoders.bow_encoder(feature, vocab_size=n_words, embed_dim=EMBEDDING_SIZE)

    # Fully connected output layer without activation -> logits.
    # The original fed the undefined `pool2` (copied from the CNN model) and
    # misspelled `tf.contrib.layers`.
    logits = tf.contrib.layers.fully_connected(features, n_classes, activation_fn=None)
    # target: ground truth, logits: predictions.
    loss = tf.losses.softmax_cross_entropy(target, logits)

    # Training step.
    # Fixed typos: `framwork.get_gloabl_step` -> `framework.get_global_step`.
    train_op = tf.contrib.layers.optimize_loss(
        loss,
        tf.contrib.framework.get_global_step(),
        optimizer='Adam',
        learning_rate=0.01)

    return ({
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits)
    }, loss, train_op)

#
# Wrap the bag-of-words model_fn in a scikit-learn style estimator.
model_fn = bag_of_words_model
bow_estimator = learn.Estimator(model_fn=model_fn)
classifier = learn.SKCompat(bow_estimator)

# Train for 1000 steps, then predict the classes of the test set.
classifier.fit(x_train, y_train, steps=1000)
bow_predictions = classifier.predict(x_test)
y_predicted = bow_predictions['class']

# Accuracy on the test set.
score = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy:{0:f}'.format(score))

In [None]:
# 