In [2]:
# # Code for hiding seaborn warnings
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
import numpy as np
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import numpy as np
import seaborn as sns
sns.set_style("whitegrid")
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem
from sklearn import metrics, pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV


In [3]:
def load_data(training_path, testing_path):
    df_train = pd.read_csv('training.csv')
    df_test = pd.read_csv('test.csv')
    return df_train, df_test

In [4]:
def pre_processing_data(df_train, df_test, stop_words):
    # remove irrelevant items
    df_train = df_train.loc[df_train['topic'] != 'IRRELEVANT']
    df_test = df_test.loc[df_test['topic'] != 'IRRELEVANT']

    # add topic codes for each topic
    # generate topic codes for each topic
    topic_codes = {
        'ARTS CULTURE ENTERTAINMENT': 0,
        'BIOGRAPHIES PERSONALITIES PEOPLE': 1,
        'DEFENCE': 2,
        'DOMESTIC MARKETS': 3,
        'FOREX MARKETS': 4,
        'HEALTH': 5,
        'MONEY MARKETS': 6,
        'SCIENCE AND TECHNOLOGY': 7,
        'SHARE LISTINGS': 8,
        'SPORTS': 9
    }

    # Category mapping
    df_train['topic_codes'] = df_train['topic']
    df_train = df_train.replace({'topic_codes': topic_codes})
    df_test['topic_codes'] = df_test['topic']
    df_test = df_test.replace({'topic_codes': topic_codes})

    # remove stop words
    df_train['content_parsed'] = df_train['article_words']
    df_test['content_parsed'] = df_test['article_words']

    for stop_word in stop_words:
        regex_stopword = r'\b' + ',' + stop_word + r'\b'
        regex2_stopword = r'\b' + stop_word + ',' + r'\b'
        df_train['content_parsed'] = df_train['content_parsed'].str.replace(regex_stopword, '')
        df_train['content_parsed'] = df_train['content_parsed'].str.replace(regex2_stopword, '')
        df_test['content_parsed'] = df_test['content_parsed'].str.replace(regex_stopword, '')
        df_test['content_parsed'] = df_test['content_parsed'].str.replace(regex2_stopword, '')
    
    # add article length and id information
    df_train['article_length'] = df_train['content_parsed'].str.len()
    df_test['article_length'] = df_test['content_parsed'].str.len()
    df_train['id'] = 1
    df_test['id'] = 1
    
    return df_train, df_test

In [5]:
def features_extract(df_train, df_test, nb_features):
    # for each class, get the most frequent  words as feature vectors
    words_set_train = set()
    for i in range(1, 11, 1):
        bag = df_train[df_train['topic_codes'] == i]['content_parsed']
        total_text = ""
        for text in bag:
            total_text += (text + ",")
        temp_set = set(pd.value_counts(total_text.split(","))[0:nb_features].keys())
        words_set_train = words_set_train.union(temp_set)

    words_set_test = set()
    for i in range(1, 11, 1):
        bag = df_test[df_test['topic_codes'] == i]['content_parsed']
        total_text = ""
        for text in bag:
            total_text += (text + ",")
        temp_set = set(pd.value_counts(total_text.split(","))[0:100].keys())
        words_set_test = words_set_test.union(temp_set)

    # get the intersection of two feature words(including training feature words and testing feature words)
    # and use the intersection as our feature vector X
    words_set = words_set_train.intersection(words_set_test)
    words_list = sorted(list(words_set))
#     print("words feature numbers : ", len(words_list), "gives the best performance!")
#     print("our words feature is : ", words_set)
    return words_list

In [6]:
def regularize_data(df_train, df_test, words_list):
    # regularize training data
    new_content_train = []
    for row in df_train['content_parsed']:
        temp_row = row.split(",")
        new_row = []
        for word in temp_row:
            if word in words_list:
                new_row.append(word)
        new_str = ",".join(new_row)
        new_content_train.append(new_str)
    df_train['content_parsed_2'] = new_content_train

    # regularize testing data
    new_content_test = []
    for row in df_test['content_parsed']:
        temp_row = row.split(",")
        new_row = []
        for word in temp_row:
            if word in words_list:
                new_row.append(word)
        new_str = ",".join(new_row)
        new_content_test.append(new_str)
    df_test['content_parsed_2'] = new_content_test

    return df_train, df_test

In [7]:
def multiclass_logloss(actual, predicted, eps=1e-15):
#     """
#     对数损失度量（Logarithmic Loss  Metric）的多分类版本。
#     :param actual: 包含actual target classes的数组
#     :param predicted: 分类预测结果矩阵, 每个类别都有一个概率
#     """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [8]:
def training_process_for_MultinomialNB_model(X_train, y_train, X_test, y_test, alpha=1.0):
    # # get prior probability (probability of each topic is not uniformed)
    # prior_pro = dict(df_train.groupby('topic_codes').count()['topic'] / df_train.shape[0])
    # class_prior = list(prior_pro.values())
#     # do cross validation:
#     # create a k-fold croos validation iterator of k folds
#     cv = KFold(n_splits=10, shuffle=True, random_state=0)
#     # by default the score used is the one returned by score method of the estimator (accuracy)
#     scores = cross_val_score(clf, X_train, y_train, cv=cv)
#     cv_mean_score = np.mean(scores)
#     print("Mean score: {0:.3f}").format(cv_mean_score)
#     return cv_mean_score
#     return clf.score(X_test, y_test)

    
    # create model and predict result
    print("MNB model created...:")
    clf = MultinomialNB(alpha=alpha, fit_prior=True, class_prior=None)    
    model = clf.fit(X_train, y_train)
    predicted_y = clf.predict(X_test)
    predictions = model.predict_proba(X_test)
    print("logloss: %0.3f " % multiclass_logloss(y_test, predictions))
    print("accuracy : ",clf.score(X_test, y_test))
#     print(accuracy_score(y_test, predicted_y[:,1]))
#     print(precision_score(y_test, predicted_y, average='macro'))
#     print(recall_score(y_test, predicted_y, average='macro'))
#     print(f1_score(y_test, predicted_y, average='macro'))
    print("here below is classification report:")
    print(classification_report(y_test, predicted_y))

In [24]:
def training_process_for_MultinomialLR_model(X_train, y_train, X_test, y_test, C=1.0):
    # create model and predict result
    print("MLR model created...:")
    clf = LogisticRegression(C=C, solver='lbfgs', multi_class='multinomial')
    model = clf.fit(X_train, y_train)
    predicted_y = clf.predict(X_test)
    predictions = clf.predict_proba(X_test)
    
    
    print("predictions : ",predictions)
    print("logloss: %0.3f " % multiclass_logloss(y_test, predictions))
    print("accuracy : ",clf.score(X_test, y_test))
#     print(accuracy_score(y_test, predicted_y[:,1]))
#     print(precision_score(y_test, predicted_y, average='macro'))
#     print(recall_score(y_test, predicted_y, average='macro'))
#     print(f1_score(y_test, predicted_y, average='macro'))
    print("here below is classification report:")
    print(classification_report(y_test, predicted_y))

In [10]:
# get stopwords
nltk.download('stopwords')
stop_words = list(stopwords.words('english'))  # get stop words
# print(stop_words)

# do some data cleaning and pre-processing
df_train, df_test = load_data("training.csv", "test.csv")
df_train, df_test = pre_processing_data(df_train, df_test, stop_words)

# nb_features = []
# accuracy_list = []
# for nb in range(50, 200, 5):
#     feature_words = features_extract(df_train, df_test, nb)
#     df_train, df_test = regularize_data(df_train, df_test, feature_words)
#     accuracy = training_process_for_MultinomialNB_model(df_train, df_test)
#     nb_features.append(nb)
#     accuracy_list.append(accuracy)

best_nb = 65
feature_words = features_extract(df_train, df_test, best_nb)
df_train, df_test = regularize_data(df_train, df_test, feature_words)


# create bag of words
text_data_train = np.array(df_train['content_parsed_2'])
count_train = CountVectorizer()
bag_of_words_train = count_train.fit_transform(text_data_train)

text_data_test = np.array(df_test['content_parsed_2'])
count_test = CountVectorizer()
bag_of_words_test = count_test.fit_transform(text_data_test)

# X_train, X_test, y_train, y_test = train_test_split(bag_of_words_train.toarray(), 
#                                                 np.array(df_train['topic_codes']), 
#                                                 test_size=0.15, 
#                                                 random_state=0)
    
# Create feature matrix and target, train model
X_train = bag_of_words_train.toarray()
y_train = np.array(df_train['topic_codes'])
X_test = bag_of_words_test.toarray()
y_test = np.array(df_test['topic_codes'])


# plt.plot(nb_features, accuracy_list, label="accuracy_trend", color='red')
# plt.xlabel("nb_features")
# plt.ylabel("accuracy score")
# plt.xticks(nb_features)  # show x-coordinate with details
# plt.legend()  # show label graphic
# plt.show()


[nltk_data] Downloading package stopwords to /Users/steve/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# training_process_for_MultinomialNB_model(X_train, y_train, X_test, y_test, alpha=1.0)
# training_process_for_MultinomialLR_model(X_train, y_train, X_test, y_test, C=1.0)

MLR model created...:
logloss: 0.655 
accuracy :  0.7435897435897436
here below is classification report:
              precision    recall  f1-score   support

           0       0.25      0.33      0.29         3
           1       0.82      0.60      0.69        15
           2       1.00      0.92      0.96        13
           3       1.00      1.00      1.00         2
           4       0.54      0.40      0.46        48
           5       0.69      0.79      0.73        14
           6       0.65      0.77      0.70        69
           7       0.25      0.33      0.29         3
           8       0.88      1.00      0.93         7
           9       0.98      0.98      0.98        60

    accuracy                           0.74       234
   macro avg       0.71      0.71      0.70       234
weighted avg       0.74      0.74      0.74       234



In [13]:
# create score fuction
mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)
nb_model = MultinomialNB()

# create pipeline 
clf = pipeline.Pipeline([('nb', nb_model)])

# search parameters
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Grid Search Model Initialization
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=6)

# fit Grid Search Model
model.fit(X_train, y_train)
# print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Fitting 6 folds for each of 6 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  17 out of  36 | elapsed:    2.4s remaining:    2.6s


Best parameters set:
	nb__alpha: 100


[Parallel(n_jobs=-1)]: Done  21 out of  36 | elapsed:    2.4s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done  25 out of  36 | elapsed:    2.4s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  29 out of  36 | elapsed:    2.4s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  33 out of  36 | elapsed:    2.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    2.5s finished


In [23]:
# training_process_for_MultinomialNB_model(X_train, y_train, X_test, y_test, alpha=1.0)
training_process_for_MultinomialNB_model(X_train, y_train, X_test, y_test, alpha=100)

MNB model created...:
logloss: 1.318 
accuracy :  0.7478632478632479
here below is classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.78      0.47      0.58        15
           2       0.71      0.92      0.80        13
           3       0.67      1.00      0.80         2
           4       0.57      0.50      0.53        48
           5       0.72      0.93      0.81        14
           6       0.69      0.74      0.71        69
           7       0.00      0.00      0.00         3
           8       0.67      0.86      0.75         7
           9       0.97      1.00      0.98        60

    accuracy                           0.75       234
   macro avg       0.58      0.64      0.60       234
weighted avg       0.73      0.75      0.73       234



In [17]:
lr_model = LogisticRegression(solver='lbfgs',multi_class='multinomial')

# create pipeline 
clf = pipeline.Pipeline([('lr', lr_model)])

# search parameters
param_grid = {'lr__C': [0.01, 0.1, 1.0, 10, 100]}

# Grid Search Model Initialization
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=6)

# fit Grid Search Model
model.fit(X_train, y_train)
# print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 6 folds for each of 5 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  11 out of  30 | elapsed:    1.6s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:    3.1s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done  19 out of  30 | elapsed:    3.1s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:    3.2s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:    3.8s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    3.8s finished


Best parameters set:
	lr__C: 0.1


In [25]:
# training_process_for_MultinomialLR_model(X_train, y_train, X_test, y_test, C=1.0)
training_process_for_MultinomialLR_model(X_train, y_train, X_test, y_test, C=0.1)


MLR model created...:
predictions :  [[3.24386089e-08 8.68931887e-08 1.42268060e-09 ... 1.08354356e-08
  1.52937581e-06 3.03303096e-09]
 [1.09829683e-09 2.63056358e-08 1.59822330e-10 ... 4.30331175e-11
  1.62565673e-08 3.20432110e-12]
 [6.32184479e-05 2.00389476e-04 4.92442081e-06 ... 7.25583224e-05
  4.56508053e-06 9.99628831e-01]
 ...
 [1.20296382e-10 3.60756916e-11 7.56610052e-16 ... 3.24590355e-11
  9.44041924e-16 1.00000000e+00]
 [2.57985249e-06 2.75701348e-06 1.93898624e-07 ... 4.10725938e-06
  1.20515613e-06 7.92717951e-07]
 [1.06881607e-02 8.27904479e-03 7.00471697e-03 ... 9.03275562e-03
  5.48411277e-01 2.67142215e-02]]
logloss: 0.550 
accuracy :  0.7435897435897436
here below is classification report:
              precision    recall  f1-score   support

           0       0.20      0.33      0.25         3
           1       0.90      0.60      0.72        15
           2       1.00      0.92      0.96        13
           3       1.00      1.00      1.00         2
        

In [38]:
pre_y = np.array([0,1,0])
labels = np.array([3, 5, 7])
pre_p = np.array([[0.56651809,0.43348191],[0.15598162,0.84401838], [0.86852502,0.13147498]])

result = np.c_[np.max(pre_p, axis=1), labels.T, pre_y.T]
result.

array([[0.56651809, 3.        , 0.        ],
       [0.84401838, 5.        , 1.        ],
       [0.86852502, 7.        , 0.        ]])

In [None]:
plt.figure(figsize=(15,5)) #整个现实图（框架）的大小
plt.plot(nb_features, accuracy_list, label="accuracy_trend", color='red')
plt.xlabel("nb_features")
plt.ylabel("accuracy score")
plt.xticks(nb_features)  # show x-coordinate with details
plt.legend()  # show label graphic
plt.show()

In [None]:
bars = alt.Chart(df_train).mark_bar(size=50).encode(
    x=alt.X("topic"),
    y=alt.Y("count():Q", axis=alt.Axis(title='Number of articles')),
    tooltip=[alt.Tooltip('count()', title='Number of articles'), 'topic'],
    color='topic'

)

text = bars.mark_text(
    align='center',
    baseline='bottom',
).encode(
    text='count()'
)

(bars + text).interactive().properties(
    height=500, 
    width=700,
    title = "Number of articles in each topic",
)

In [None]:
df_topic = pd.DataFrame(df_train.groupby('topic').count()['id']).reset_index()

bars = alt.Chart(df_topic).mark_bar(size=50).encode(
    x=alt.X('topic'),
    y=alt.Y('PercentOfTotal:Q', axis=alt.Axis(format='.0%', title='% of Articles')),
    color='topic'
).transform_window(
    TotalArticles='sum(id)',
    frame=[None, None]
).transform_calculate(
    PercentOfTotal="datum.id / datum.TotalArticles"
)

text = bars.mark_text(
    align='center',
    baseline='bottom',
    #dx=5  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text=alt.Text('PercentOfTotal:Q', format='.1%')
)

(bars + text).interactive().properties(
    height=500, 
    width=700,
    title = "% of articles in each topic",
)

In [None]:
plt.figure(figsize=(12.8,6))
sns.distplot(df_train['article_length']).set_title('article_length distribution');

In [None]:
df_test.groupby('topic').count()

In [None]:
class_prior = list(prior_pro.values())

X_train = bag_of_words_train.toarray()
y_train = np.array(df_train['topic_codes'])

X_test = bag_of_words_test.toarray()
y_test = np.array(df_test['topic_codes'])


clf = MultinomialNB(fit_prior=True, class_prior=None)
model = clf.fit(X_train, y_train)
predicted_y = model.predict(X_test)
print(accuracy_score(y_test, predicted_y))
print(precision_score(y_test, predicted_y, average='macro'))
print(recall_score(y_test, predicted_y, average='macro'))
print(f1_score(y_test, predicted_y, average='macro'))
print(classification_report(y_test, predicted_y))

