# 作业

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

## 1.在fake_or_real_news数据集上构建一个简单的文本分类器， 实现对虚假新闻的分类， 可以使用其他模型和评测指标对比分类效果；

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

### 数据预处理

In [3]:
df = pd.read_csv('fake_or_real_news.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
Unnamed: 0    6335 non-null int64
title         6335 non-null object
text          6335 non-null object
label         6335 non-null object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


#### 创建label列

In [5]:
df['FakeOrReal'] = df['label'].map(lambda x: 1 if 'REAL'==x else 0)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 5 columns):
Unnamed: 0    6335 non-null int64
title         6335 non-null object
text          6335 non-null object
label         6335 non-null object
FakeOrReal    6335 non-null int64
dtypes: int64(2), object(3)
memory usage: 247.6+ KB


Unnamed: 0.1,Unnamed: 0,title,text,label,FakeOrReal
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,1


#### 定义标签，训练集，测试集

In [6]:
# 标签
y = df['FakeOrReal']

# 创建训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(df['text'],
                                                    y,
                                                    test_size=0.3,
                                                    random_state=53)

#### 使用CountVectorizer

In [7]:
# 初始化CountVectorizer对象
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train.values)
count_test = count_vectorizer.transform(X_test.values)

#### 使用TfidfVectorizer

In [8]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# 初始化TfidfVectorizer对象
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)
tfidf_test = tfidf_vectorizer.transform(X_test.values)

#### 创建CountVectorizer词向量数据集

In [9]:
# Create the CountVectorizer DataFrame: count_df
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())
# Print the head of count_df
count_df.head()


Unnamed: 0,00,000,0000,00000031,000035,00006,0001,0001pt,000billion,000ft,...,حلب,عربي,عن,لم,ما,محاولات,من,هذا,والمرضى,ยงade
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 创建TfidfVectorizer词向量数据集

In [10]:
# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())
# Print the head of tfidf_df
tfidf_df.head()

Unnamed: 0,00,000,0000,00000031,000035,00006,0001,0001pt,000billion,000ft,...,حلب,عربي,عن,لم,ما,محاولات,من,هذا,والمرضى,ยงade
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.041696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.031448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.014377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 训练模型

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

#### 基于CountVectorizer词向量数据集的模型训练和模型评测

In [24]:
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train)
pred = nb_classifier.predict(count_test)
# metrics.accuracy_score(y_test, pred)
# metrics.roc_auc_score(y_test, pred)
# metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(metrics.classification_report(y_true=y_test, y_pred=pred,labels=[0, 1]))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

              precision    recall  f1-score   support

           0       0.92      0.87      0.90       913
           1       0.89      0.93      0.91       988

    accuracy                           0.90      1901
   macro avg       0.90      0.90      0.90      1901
weighted avg       0.90      0.90      0.90      1901



#### 基于TfidfVectorizer词向量数据集的模型训练和模型评测

In [25]:
nb_classifier = MultinomialNB()

nb_classifier.fit(tfidf_train, y_train)
pred = nb_classifier.predict(tfidf_test)

# metrics.accuracy_score(y_test, pred)
# metrics.roc_auc_score(y_test, pred)
# metrics.confusion_matrix(y_test, pred, labels=[0, 1])
print(metrics.classification_report(y_true=y_test, y_pred=pred,labels=[0, 1]))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

              precision    recall  f1-score   support

           0       0.97      0.74      0.84       913
           1       0.80      0.98      0.88       988

    accuracy                           0.86      1901
   macro avg       0.88      0.86      0.86      1901
weighted avg       0.88      0.86      0.86      1901



### 模型效果对比

<h4>结论：由于fake和real的样本各占比接近一半，分布比较均匀，使用classification_report显示CountVectorizer的accuracy比TfidfVectorizer要理想</h4>

## 2.多分类的问题，课上所讲的影评数据集实现对其他题材电影的分类

<h4>解决思路：考虑尝试使用pytorch进行文本分类</h4>

### 数据预处理

In [1]:
import pandas as pd
df = pd.read_csv('IMDb movies.csv')
df = df.loc[:, ['title','description', 'genre']]
df = df[df.description.notnull()]

def getClassification(x):
    if 'Romance' in [i.strip() for i in x.split(',')]:
        return 1
    elif 'Comedy' in [i.strip() for i in x.split(',')]:
        return 2
    elif 'Drama' in [i.strip() for i in x.split(',')]:
        return 3
    else:
        return 0

df['Classification'] = df['genre'].map(getClassification)
# df['Classification'].value_counts()  
df=df.drop(df[(df['Classification'].map(lambda d: d))==0].index)
# df.head()
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 62423 entries, 0 to 81270
Data columns (total 4 columns):
title             62423 non-null object
description       62423 non-null object
genre             62423 non-null object
Classification    62423 non-null int64
dtypes: int64(1), object(3)
memory usage: 2.4+ MB


In [2]:
col_name=df.columns.tolist() 
col_name.insert(0,'Classification2') 
df=df.reindex(columns=col_name) 
df['Classification2']=df['Classification'] 
df=df.drop(['Classification','genre'],axis=1)
df.head()

Unnamed: 0,Classification2,title,description
0,3,The Story of the Kelly Gang,True story of notorious Australian outlaw Ned ...
1,3,Den sorte drøm,Two men of high rank are both wooing the beaut...
2,3,Cleopatra,The fabled queen of Egypt's affair with Roman ...
3,3,L'Inferno,Loosely adapted from Dante's Divine Comedy and...
4,3,"From the Manger to the Cross; or, Jesus of Naz...","An account of the life of Jesus Christ, based ..."


In [7]:
from sklearn.model_selection import train_test_split

train_data,test_data=train_test_split(df,test_size=0.3,shuffle=True,random_state=53)
train_data.info()
test_data.info()

train_path="E:/KaiKeBa/基础班/Python/第六章/第八节/homework-data/train.csv"
test_path="E:/KaiKeBa/基础班/Python/第六章/第八节/homework-data/test.csv"

train_data.to_csv(train_path,index=False,header=False) #注意导出的文件后缀要写成.csv
test_data.to_csv(test_path,index=False,header=False)#index和header默认为True

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43696 entries, 5924 to 73824
Data columns (total 3 columns):
Classification2    43696 non-null int64
title              43696 non-null object
description        43696 non-null object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18727 entries, 58675 to 70269
Data columns (total 3 columns):
Classification2    18727 non-null int64
title              18727 non-null object
description        18727 non-null object
dtypes: int64(1), object(2)
memory usage: 585.2+ KB


### 创建Dataset

In [1]:
import torch
import torchtext
from torchtext.datasets import text_classification
from torchtext.utils import extract_archive, unicode_csv_reader
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets.text_classification import *
from torchtext.datasets.text_classification import _csv_iterator,_create_data_from_iterator
NGRAMS = 2

# 定义创建数据集函数
def _setup_datasets(root='E:/KaiKeBa/基础班/Python/第六章/第八节/homework-data', ngrams=NGRAMS, vocab=None, include_unk=False):
    train_csv_path = root+'/train.csv'
    test_csv_path = root+'/test.csv'
    if vocab is None:
        logging.info('Building Vocab based on {}'.format(train_csv_path))
        vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams)) #创建词典
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")
    logging.info('Vocab has {} entries'.format(len(vocab)))
    logging.info('Creating training data')
    train_data, train_labels = _create_data_from_iterator(   #创建训练数据
        vocab, _csv_iterator(train_csv_path, ngrams, yield_cls=True), include_unk) 
    logging.info('Creating testing data')
    test_data, test_labels = _create_data_from_iterator(   #创建测试数据
        vocab, _csv_iterator(test_csv_path, ngrams, yield_cls=True), include_unk)
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (TextClassificationDataset(vocab, train_data, train_labels),  #返回数据集实例
            TextClassificationDataset(vocab, test_data, test_labels))

train_dataset, test_dataset = _setup_datasets()


43696lines [00:04, 9496.00lines/s] 
43696lines [00:06, 7209.61lines/s]
18727lines [00:02, 7444.76lines/s]


### 定义模型

In [2]:
import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [3]:
BATCH_SIZE = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUN_CLASS = len(train_dataset.get_labels())
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

### 定义batch生成函数

In [4]:
def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

### 定义模型训练函数

In [5]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)


### 定义模型验证函数

In [6]:
def valid(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

### 定义模型测试函数

In [7]:
def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            save_model = torch.load('E:\KaiKeBa\基础班\Python\第六章\第八节\model\TextSentiment-model.pkl')
            output = save_model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

### 训练模型

In [8]:
import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 20
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = random_split(train_dataset, [train_len, len(train_dataset) - train_len])
best_acc=0
for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = valid(sub_valid_)
    
    if best_acc==0:
        best_acc=valid_acc
    if best_acc>0 and best_acc<valid_acc:
        best_acc=valid_acc
        torch.save(model, 'E:\KaiKeBa\基础班\Python\第六章\第八节\model\TextSentiment-model.pkl')
    
    
    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 29 seconds
	Loss: 0.2566(train)	|	Acc: 47.3%(train)
	Loss: 0.0009(valid)	|	Acc: 51.3%(valid)


  "type " + obj.__name__ + ". It won't be checked "


Epoch: 2  | time in 0 minutes, 28 seconds
	Loss: 0.2321(train)	|	Acc: 56.3%(train)
	Loss: 0.0010(valid)	|	Acc: 55.6%(valid)
Epoch: 3  | time in 0 minutes, 20 seconds
	Loss: 0.2073(train)	|	Acc: 63.1%(train)
	Loss: 0.0013(valid)	|	Acc: 57.1%(valid)
Epoch: 4  | time in 0 minutes, 19 seconds
	Loss: 0.1808(train)	|	Acc: 68.9%(train)
	Loss: 0.0013(valid)	|	Acc: 58.7%(valid)
Epoch: 5  | time in 0 minutes, 19 seconds
	Loss: 0.1505(train)	|	Acc: 75.3%(train)
	Loss: 0.0026(valid)	|	Acc: 56.8%(valid)
Epoch: 6  | time in 0 minutes, 19 seconds
	Loss: 0.1176(train)	|	Acc: 81.9%(train)
	Loss: 0.0015(valid)	|	Acc: 55.7%(valid)
Epoch: 7  | time in 0 minutes, 19 seconds
	Loss: 0.0866(train)	|	Acc: 87.9%(train)
	Loss: 0.0022(valid)	|	Acc: 55.3%(valid)
Epoch: 8  | time in 0 minutes, 20 seconds
	Loss: 0.0606(train)	|	Acc: 93.0%(train)
	Loss: 0.0030(valid)	|	Acc: 57.2%(valid)
Epoch: 9  | time in 0 minutes, 19 seconds
	Loss: 0.0411(train)	|	Acc: 96.7%(train)
	Loss: 0.0026(valid)	|	Acc: 57.4%(valid)
Epoch: 1

### 测试模型

In [9]:
print('Checking the results of test dataset...')
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Checking the results of test dataset...
	Loss: 0.0001(test)	|	Acc: 55.3%(test)


### 模型应用

In [13]:
import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

ag_news_label = {1 : "Romance",
                 2 : "Comedy",
                 3 : "Drama"}
best_model = torch.load('E:\KaiKeBa\基础班\Python\第六章\第八节\model\TextSentiment-model.pkl')
        
def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([vocab[token]
                            for token in ngrams_iterator(tokenizer(text), ngrams)])
        output = best_model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1

ex_text_str="Michael Adler has run away from his suburban home with his little brother Dylan. Hiding out in a quiet, rural town, Michael's convinced he can make a better life for both of them. While ..."
vocab = train_dataset.get_vocab()
best_model = best_model.to("cpu")
print("This is a %s movie" %ag_news_label[predict(ex_text_str, model, vocab, 2)])

This is a Drama movie


<h4>结论：本人尝试使用pytorch搭建模型进行文本分类，由于样本总量不足够，样本分布也不均匀，因此训练出来的模型效果不是很理想，测试集准确率只达到55%</h4>

## 3.实际工作中我们会结合中文分词工具和课上所讲的工具库完成中文的自然语言处理任务，中文分词工作本身并不复杂，可以借助jieba完成，请同学们自行学习和掌握；

In [4]:
# 导入 jieba
import jieba
import jieba.posseg as pseg #词性标注
import jieba.analyse as anls #关键词提取

### 分词

In [5]:
seg_list = jieba.cut("他来到了网易杭研大厦") #默认精确模式和启用 HMM
print("【分词】：" + "/ ".join(seg_list)) 

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\abby\AppData\Local\Temp\jieba.cache
Loading model cost 0.691 seconds.
Prefix dict has been built successfully.


【分词】：他/ 来到/ 了/ 网易/ 杭研/ 大厦


### 词性标注

In [9]:
words = pseg.cut("他改变了中国")
for word, flag in words:
    print("{0} {1}".format(word, flag))

他 r
改变 v
了 ul
中国 ns


### 关键词提取

In [6]:
s = "此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年，实现营业收入0万元，实现净利润-139.13万元。"

#### 基于 TF-IDF 算法进行关键词提取

In [7]:
# 通过 jieba.analyse.extract_tags 方法可以基于 TF-IDF 算法进行关键词提取，该方法共有 4 个参数：

# sentence：为待提取的文本
# topK：为返回几个 TF/IDF 权重最大的关键词，默认值为 20
# withWeight：是否一并返回关键词权重值，默认值为 False
# allowPOS：仅包括指定词性的词，默认值为空

for x, w in anls.extract_tags(s, topK=20, withWeight=True):
    print('%s %s' % (x, w))

欧亚 0.7300142700289363
吉林 0.659038184373617
置业 0.4887134522112766
万元 0.3392722481859574
增资 0.33582401985234045
4.3 0.25435675538085106
7000 0.25435675538085106
2013 0.25435675538085106
139.13 0.25435675538085106
实现 0.19900979900382978
综合体 0.19480309624702127
经营范围 0.19389757253595744
亿元 0.1914421623587234
在建 0.17541884768425534
全资 0.17180164988510638
注册资本 0.1712441526
百货 0.16734460041382979
零售 0.1475057117057447
子公司 0.14596045237787234
营业 0.13920178509021275


#### 基于 TextRank 算法的关键词提取

In [8]:
# 默认过滤词性（allowPOS=('ns', 'n', 'vn', 'v')）
for x, w in anls.textrank(s, withWeight=True):
    print('%s %s' % (x, w))

吉林 1.0
欧亚 0.9966893354178172
置业 0.6434360313092776
实现 0.5898606692859626
收入 0.43677859947991454
增资 0.4099900531283276
子公司 0.35678295947672795
城市 0.34971383667403655
商业 0.34817220716026936
业务 0.3092230992619838
在建 0.3077929164033088
营业 0.3035777049319588
全资 0.303540981053475
综合体 0.29580869172394825
注册资本 0.29000519464085045
有限公司 0.2807830798576574
零售 0.27883620861218145
百货 0.2781657628445476
开发 0.2693488779295851
经营范围 0.2642762173558316
