## Model building with binary classifiers and a multilabel classifier
### 1. Build a multilabel classifier
#### 1.1 Build the multilabel classifier model using the sample data
#### 1.2 Doing a cross-validation using the full data from the China Daily English website
### 2. Build a binary classifier on each category for all the news data
#### 2.1 Building a business binary classifier
#### 2.2 Building a lifestyle binary classifier
#### 2.3 Building a tech binary classifier
#### 2.4 Building a politics binary classifier
#### 2.5 Building an education binary classifier

In [217]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import spacy
# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced by any cell shown in this notebook — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')


In [218]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [219]:
# Full cleaned China Daily (English) corpus; used later to apply the trained model by section.
df = pd.read_csv('ChinaDaily_EN_cleaned.csv')

In [220]:
# Labelled sample: carries the 'overall label' column and the per-category 0/1 flag columns used for training.
df_sample = pd.read_csv('EN_BJ_sample.csv')


In [221]:
# Show the first row to check the column layout of the full corpus.
df.head(1)

Unnamed: 0.1,Unnamed: 0,level_0,column_1,column_2,content,publishtime,source,title,url
0,0,0,Business,"100 cities, counties and companies",The artificial intelligence sector is expected...,2018-10-15 09:33,"QIU QUANLIN,CHINA DAILY",AI plays crucial role in industrial upgrading,http://www.chinadaily.com.cn/a/201810/15/WS5bc...


In [222]:
# Show the first row to check the column layout of the labelled sample.
df_sample.head(1)

Unnamed: 0.1,Unnamed: 0,column_1,column_2,content,publishtime,source,title,url,if_business,if_BJ,top nouns,top words,overall label,business,life style,tech,politics,education
0,7568,Business,Motoring,Beijing Automotive Group Co Ltd is building a ...,2018-09-11 10:13,"JING SHUIYU,CHINA DAILY",Carmaker taps growing off-road vehicle culture,http://www.chinadaily.com.cn/a/201809/11/WS5b9...,1.0,True,"['road', 'culture', 'vehicles', 'sales', 'car'...","['road', 'China', 'culture', 'vehicles', 'sale...",business,1.0,0.0,1.0,0.0,0.0


### 1.1 Build the multilabel classifier model using the sample data

In [223]:
# Features are the raw article text; the target is the single 'overall label' column.
X, y = df_sample['content'], df_sample['overall label']

# Hold out 30% of the sample for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

In [224]:
# Number of documents in the training split.
X_train.shape

(1094,)

In [225]:
# Number of documents in the held-out test split.
X_test.shape

(470,)

In [226]:
from sklearn.linear_model import LogisticRegression

In [227]:
# Text classifier over the 'overall label' classes: TF-IDF features feeding a
# logistic regression.
# NOTE(review): LogisticRegression() relies on the library's default solver
# (the recorded repr shows solver='warn'), so results may differ between
# scikit-learn versions — consider pinning it.
text_clf_LR = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

# Fit the vectorizer and the classifier on the training split
text_clf_LR.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [228]:
# Predict the overall label for every held-out test document.
predictions = text_clf_LR.predict(X_test)

In [229]:
# Rows are the true classes, columns the predicted classes (sorted label order).
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[ 84   0  14  17   5]
 [  3  26  15   6   0]
 [  6   0 125  10   0]
 [ 16   4  16  86   0]
 [  7   0   5  16   9]]


In [230]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

    business       0.72      0.70      0.71       120
   education       0.87      0.52      0.65        50
  life style       0.71      0.89      0.79       141
    politics       0.64      0.70      0.67       122
        tech       0.64      0.24      0.35        37

   micro avg       0.70      0.70      0.70       470
   macro avg       0.72      0.61      0.64       470
weighted avg       0.71      0.70      0.69       470



In [231]:
# Fraction of test documents whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.7021276595744681


### 1.2 Doing a cross-validation using the full data from the China Daily English website

In [232]:
# Count articles per value of 'column_1' in the full corpus.
df['column_1'].value_counts()

China        25731
Business     23064
Culture       6988
Travel         516
World          417
Lifestyle      254
Sports         239
Opinion         14
Food             2
Name: column_1, dtype: int64

In [233]:
# Run the trained overall-label model on every article whose 'column_1' is
# 'Travel'; the rows and the text column are selected in one .loc lookup.
result = text_clf_LR.predict(df.loc[df['column_1'] == 'Travel', 'content'])

# Tally how many Travel articles landed in each predicted label.
unique, counts = np.unique(result, return_counts=True)

# One row per predicted label: [label, count].
print(np.asarray((unique, counts)).T)

[['business' 68]
 ['life style' 419]
 ['politics' 27]
 ['tech' 2]]


### 2.1 Building a business binary classifier

In [234]:
# Features are the raw article text; the target is the 'business' flag column.
X, y = df_sample['content'], df_sample['business']

# Same 70/30 split and seed as the other sections.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

In [235]:
# Binary 'business' classifier: TF-IDF features feeding a logistic regression.
# The solver is pinned to 'liblinear', which is what the unset default resolved
# to when this notebook was recorded (the repr below shows solver='warn').
# Newer scikit-learn releases default to 'lbfgs', so pinning keeps the metrics
# below reproducible and avoids the default-solver FutureWarning.
business_clf_LR = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LogisticRegression(solver='liblinear')),
])

# Feed the training data through the pipeline
business_clf_LR.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [236]:
# Predict the 'business' flag for every held-out test document.
predictions = business_clf_LR.predict(X_test)

In [237]:
# Rows are the true classes, columns the predicted classes (sorted label order).
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[254  24]
 [ 73 119]]


In [238]:
# Per-class precision / recall / F1 for the 'business' flag.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         0.0       0.78      0.91      0.84       278
         1.0       0.83      0.62      0.71       192

   micro avg       0.79      0.79      0.79       470
   macro avg       0.80      0.77      0.78       470
weighted avg       0.80      0.79      0.79       470



In [239]:
# Fraction of test documents whose predicted flag matches the true flag.
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.7936170212765957


### 2.2 Building a lifestyle binary classifier

In [240]:
# Features are the raw article text; the target is the 'life style' flag column.
X, y = df_sample['content'], df_sample['life style']

# Same 70/30 split and seed as the other sections.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

In [241]:
# Binary 'life style' classifier: TF-IDF features feeding a logistic regression.
# The solver is pinned to 'liblinear', which is what the unset default resolved
# to when this notebook was recorded (the repr below shows solver='warn').
# Newer scikit-learn releases default to 'lbfgs', so pinning keeps the metrics
# below reproducible and avoids the default-solver FutureWarning.
lifestyle_clf_LR = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LogisticRegression(solver='liblinear')),
])

# Feed the training data through the pipeline
lifestyle_clf_LR.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [242]:
# Predict the 'life style' flag for every held-out test document.
predictions = lifestyle_clf_LR.predict(X_test)

In [243]:
# Rows are the true classes, columns the predicted classes (sorted label order).
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[106  70]
 [ 26 268]]


In [244]:
# Per-class precision / recall / F1 for the 'life style' flag.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         0.0       0.80      0.60      0.69       176
         1.0       0.79      0.91      0.85       294

   micro avg       0.80      0.80      0.80       470
   macro avg       0.80      0.76      0.77       470
weighted avg       0.80      0.80      0.79       470



In [245]:
# Fraction of test documents whose predicted flag matches the true flag.
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.7957446808510639


### 2.3 Building a tech binary classifier

In [246]:
# Features are the raw article text; the target is the 'tech' flag column.
X, y = df_sample['content'], df_sample['tech']

# Same 70/30 split and seed as the other sections.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

In [247]:
# Binary 'tech' classifier: TF-IDF features feeding a logistic regression.
# The solver is pinned to 'liblinear', which is what the unset default resolved
# to when this notebook was recorded (the repr below shows solver='warn').
# Newer scikit-learn releases default to 'lbfgs', so pinning keeps the metrics
# below reproducible and avoids the default-solver FutureWarning.
tech_clf_LR = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LogisticRegression(solver='liblinear')),
])

# Feed the training data through the pipeline
tech_clf_LR.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [248]:
# Predict the 'tech' flag for every held-out test document.
predictions = tech_clf_LR.predict(X_test)

In [249]:
# Rows are the true classes, columns the predicted classes (sorted label order).
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[386   0]
 [ 57  27]]


In [250]:
# Per-class precision / recall / F1 for the 'tech' flag.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         0.0       0.87      1.00      0.93       386
         1.0       1.00      0.32      0.49        84

   micro avg       0.88      0.88      0.88       470
   macro avg       0.94      0.66      0.71       470
weighted avg       0.89      0.88      0.85       470



In [251]:
# Fraction of test documents whose predicted flag matches the true flag.
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.8787234042553191


### 2.4 Building a politics binary classifier

In [252]:
# Features are the raw article text; the target is the 'politics' flag column.
X, y = df_sample['content'], df_sample['politics']

# Same 70/30 split and seed as the other sections.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

In [253]:
# Binary 'politics' classifier: TF-IDF features feeding a logistic regression.
# The solver is pinned to 'liblinear', which is what the unset default resolved
# to when this notebook was recorded (the repr below shows solver='warn').
# Newer scikit-learn releases default to 'lbfgs', so pinning keeps the metrics
# below reproducible and avoids the default-solver FutureWarning.
politics_clf_LR = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LogisticRegression(solver='liblinear')),
])

# Feed the training data through the pipeline
politics_clf_LR.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [254]:
# Predict the 'politics' flag for every held-out test document.
predictions = politics_clf_LR.predict(X_test)

In [255]:
# Rows are the true classes, columns the predicted classes (sorted label order).
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[152  56]
 [ 48 214]]


In [256]:
# Per-class precision / recall / F1 for the 'politics' flag.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         0.0       0.76      0.73      0.75       208
         1.0       0.79      0.82      0.80       262

   micro avg       0.78      0.78      0.78       470
   macro avg       0.78      0.77      0.77       470
weighted avg       0.78      0.78      0.78       470



In [257]:
# Fraction of test documents whose predicted flag matches the true flag.
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.7787234042553192


### 2.5 Building an education binary classifier

In [258]:
# Features are the raw article text; the target is the 'education' flag column.
X, y = df_sample['content'], df_sample['education']

# Same 70/30 split and seed as the other sections.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

In [259]:
# Binary 'education' classifier: TF-IDF features feeding a logistic regression.
# The solver is pinned to 'liblinear', which is what the unset default resolved
# to when this notebook was recorded (the repr below shows solver='warn').
# Newer scikit-learn releases default to 'lbfgs', so pinning keeps the metrics
# below reproducible and avoids the default-solver FutureWarning.
education_clf_LR = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LogisticRegression(solver='liblinear')),
])

# Feed the training data through the pipeline
education_clf_LR.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [260]:
# Predict the 'education' flag for every held-out test document.
predictions = education_clf_LR.predict(X_test)

In [261]:
# Rows are the true classes, columns the predicted classes (sorted label order).
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[394   1]
 [ 45  30]]


In [262]:
# Per-class precision / recall / F1 for the 'education' flag.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         0.0       0.90      1.00      0.94       395
         1.0       0.97      0.40      0.57        75

   micro avg       0.90      0.90      0.90       470
   macro avg       0.93      0.70      0.76       470
weighted avg       0.91      0.90      0.88       470



In [263]:
# Fraction of test documents whose predicted flag matches the true flag.
print(accuracy_score(y_true=y_test, y_pred=predictions))

0.902127659574468
