In [1]:
##-------------
#  Modify Date:
#         2018 - 12 -2 
#  利用朴素贝叶斯分类器实现简单语言种类分类器
# 
# 

#### 1 数据预处理 - 获取训练和测试数据集

In [1]:
# 载入数据并做简单处理
in_f  = open('./data/data.csv')
lines = in_f.readlines()
in_f.close()

# 数据和标签单独拿出来  data：[:-3], label: [-2:]
dataset = [(line.strip()[:-3], line.strip()[-2:]) for line in lines]  # 先取出空格strip 然后再取索引

# split dataset into test and training dataser
import sklearn
x,y    = zip(*dataset)  # dataset type is the list 

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state = 100)
# len(x_train) #  => 6799 same with len(y_train)

'''index = 0
for line in lines:
    if line.strip()[-2:] == 'es':
        index += 1
        if index == 1:
            print(line.strip()[:-3])'''

"index = 0\nfor line in lines:\n    if line.strip()[-2:] == 'es':\n        index += 1\n        if index == 1:\n            print(line.strip()[:-3])"

In [2]:
x_train[0:2]

['followfriday pour les fans du grand d茅tournement vous pouvez suivre laclasse',
 'wo bebt die erde twitter weiss']

#### 2 剔除噪音数据

In [3]:
# 通过正则表达式来剔除噪声
import re 
def noise_remove(inputs):
    pattern     = re.compile(" ".join(["@\w+",'#\w+','http\S+']) )
    cleand_text = re.sub(pattern, " ",inputs)
    return cleand_text.strip()

#noise_remove("Trump images are now more popular than cat gifs. @trump #trends http://www.trumptrends.html")
# >>> 'Trump images are now more popular than cat gifs.'

In [11]:
## 下一步要做的是抽取出有用的特征，抽取1-gram and 2-gram 的统计特征
from sklearn.feature_extraction.text import CountVectorizer
countervectorizer = CountVectorizer(
    lowercase    = True,        # lowercase the text 
    analyzer     = 'char_wb',   # tokenise by character ngrams 
    ngram_range  = (1,2),       # ngrams : 1-gram and 2-gram 统计特征
    max_features = 1000,        # only consider the top max_features ordered 
    preprocessor = noise_remove # preprocess - 预处理
)
# Instraction for CountVectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

#### 3 建立分类器 - 并计算准确率

In [16]:
# import the clasifier
from sklearn.naive_bayes import MultinomialNB # 多项式朴素贝叶斯
classifier = MultinomialNB()
'''import numpy as np 
print(np.array(x_train).shape)  # 6799*1 
print(np.array(y_train).shape)  # 6799*1 '''

classifier.fit(countervectorizer.fit_transform(x_train),y_train) # fit_transform 对训练集先拟合然后再归一化

# test score on test dataset - 准确率
print('score is: %s' %(classifier.score(countervectorizer.transform(x_test), y_test))) # transform 对测试集归一化

(6799,)
(6799,)


NameError: name 'transform' is not defined

###  规范化：写成一个类

In [22]:
import re 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

class LanguageClassifierByNaiveBayes():
    
    def __init__(self,classifier = MultinomialNB() ):  # __init__ 函数默认被调用
        self.classifier = classifier
        self.countervectorizer  = CountVectorizer(
                   lowercase    = True,      # lowercase the text 
                   analyzer     = 'char_wb', # tokenise by character ngrams 
                   ngram_range  = (1,4),     # ngrams : 1-gram and 2-gram 统计特征
                   max_features = 1000,      # only consider the top max_features ordered 
                   preprocessor = self.remove_noise # preprocess - 预处理
                   )
    # remove noise 
    def remove_noise(self,inputs):
        pattern      = re.compile("".join(["http\S+","\@\w+","\#\w+"]))
        cleaned_text = re.sub(pattern," ",inputs)
        return cleaned_text.strip()
    
    # data pre-processing:
    def data_preprocesing(self):
        open_data = open('./data/data.csv')
        readlines = open_data.readlines()
        dataset_withlabel = [(line.strip()[:-3], line.strip()[-2:]) for line in readlines] 
        x,y       =  zip(*dataset_withlabel)
        x_train,x_test,y_train,y_test = train_test_split(x,y,random_state = 100)
        return x_train,x_test,y_train,y_test
    
    # 数据归一化处理
    def data_transform(self,data):     # 针对训练数据先归一化
        return self.countervectorizer.transform(data)
    
    # 归一化处理后再拟合
    def data_transform_fit(self,data): # 后拟合，针对测试数据直接拟合无需归一化处理
        self.countervectorizer.fit(data)
    
    # 用分类器拟合数据
    def model_fitting(self,data_x,data_y):
        self.data_transform_fit(data_x)
        self.classifier.fit(self.data_transform(data_x),data_y)
    
    def test_score(self,data_x,data_y):
        return self.classifier.score(self.data_transform(data_x),data_y)
    
    def prediction(self,data):
        return self.classifier.predict(self.data_transform([data]))

In [23]:
language_detector = LanguageClassifierByNaiveBayes()
x_train,x_test,y_train,y_test = language_detector.data_preprocesing()
language_detector.model_fitting(x_train,y_train)
print("socre is %s" %language_detector.test_score(x_test,y_test))
print(language_detector.prediction('This is one football games!'))

socre is 0.9907366563740626
['en']
