# Import libraries

In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

import spacy

from xgboost import XGBClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report

In [2]:
#it is required to download en_core_web_sm
#spacy.cli.download('en_core_web_sm')

# Data preprocessing

## Append text data to list
Note that data must be in the same directory as this notebook, in the bbc folder.

In [3]:
#create an empty list for storing the path which will be used for for-loop and read the text from
list_path = []

#construct a list that contains all the path for all txt files
for i in os.listdir('bbc/'):
    # for each path, if the path string contains dot symbol then continue
    if i.find('.')!=-1:
        continue
    else:
    # for each txt file in different folders, append its path to the empty list_path
        for j in os.listdir("bbc/{}".format(i)):
            list_path.append("bbc/{}/{}".format(i,j))

In [4]:
#exclude the path of each text contains dirty data 
list_path = list(filter(lambda x:x.find("DS_Store")==-1,list_path))

In [5]:
#create empty list for each class
list_business = []
list_entertainment = []
list_politics = []
list_tech = []
list_sport = []

In [6]:
#create a function that will read the text and append the test to the list
def append_list(list_name,category):
    for i in list_path:
        if i.find(category)!=-1:
            list_name.append(Path(i).read_text())
        else:
            continue

There is a issue about the pound sterling sign in the 199.txt file in bbc/sport folder. I replaced the orginal pound sterling sign with a new one.

In [7]:
# For each path, apply the append_list function
append_list(list_business,"business")
append_list(list_entertainment,"entertainment")
append_list(list_politics,"politics")
append_list(list_tech,"tech")
append_list(list_sport,"sport")

In [8]:
#check if all files were read
len(list_business) + len(list_entertainment) + len(list_politics) + len(list_tech) + len(list_sport)

2225

## Create pandas dataframes

In [9]:
# create pandas dataframes for each category, assign label to them.
df_business = pd.DataFrame({"content":list_business,
                            "label":0,
                            "label_en":"business"})

df_entertainment = pd.DataFrame({"content":list_entertainment,
                                 "label":1,
                                 "label_en":"entertainment"})

df_politics = pd.DataFrame({"content":list_politics,
                            "label":2,
                            "label_en":"politics"})

df_tech = pd.DataFrame({"content":list_tech,
                        "label":3,
                        "label_en":"tech"})

df_sport = pd.DataFrame({"content":list_sport,
                         "label":4,
                         "label_en":"sport"})

In [10]:
#combine all data into one dataframe
frames = [df_business,df_entertainment,df_politics,df_tech,df_sport]
dataset = pd.concat(frames)
#reset the index and drop the index column
dataset_final = dataset.reset_index().drop(columns='index')

In [11]:
#shuffle the dataset
dataset_final = shuffle(dataset_final)

## Tokenization, lemmatization, removing punctuation/stopwords, lowering words 

In [12]:
#load en_core_web_sm for further data clearning using spacy
nlp = spacy.load('en_core_web_sm')

In [13]:
def data_prep(x):
    """
    This function is to remove punctuation/space/determiners/particles/stop words in documents.
    Then words will be lemmatized and lowered.
    The function returns list of words.
    """
    doc = nlp(x)
    return [token.lemma_.lower() for token in doc if token.pos_ not in ('PUNCT','SPACE','DET','PART') and token.is_stop == False]

In [14]:
#map the function to content and then create a column for storing cleaned data
dataset_final['tokenization'] = dataset_final['content'].map(data_prep)

In [15]:
dataset_final.head()

Unnamed: 0,content,label,label_en,tokenization
975,Royal couple watch nation's mood\n\nPrince Cha...,2,politics,"[royal, couple, watch, nation, mood, prince, c..."
1273,Labour in constituency race row\n\nLabour's ch...,2,politics,"[labour, constituency, race, row, labour, choi..."
2222,Melzer shocks Agassi in San Jose\n\nSecond see...,4,sport,"[melzer, shock, agassi, san, jose, second, see..."
1797,Wright-Phillips to start on right\n\nEngland c...,4,sport,"[wright, phillips, start, right, england, coac..."
362,Making your office work for you\n\nOur mission...,0,business,"[make, office, work, mission, brighten, work, ..."


## Feature Extraction - Named Entity Recognition

In [16]:
# a function will be used to extract the named entity labels for each token.
def named_entity(x):
    mylist = [token.label_ for token in nlp(x).ents]
    return mylist

In [17]:
#extract named entity labels for each content
dataset_final["ner"] = dataset_final['content'].apply(named_entity)

In [18]:
#a list contains each type of labels is created.
ner_list = dataset_final["ner"].explode().value_counts().reset_index()['ner'].to_list()

In [19]:
# use for-loop to count the occurence of each label for all rows
for _ in ner_list:
    dataset_final[str(_)] = dataset_final.apply(lambda x: x['ner'].count(_), axis=1)

In [20]:
dataset_final.head()

Unnamed: 0,content,label,label_en,tokenization,ner,PERSON,ORG,DATE,GPE,CARDINAL,...,PERCENT,LOC,TIME,WORK_OF_ART,EVENT,PRODUCT,FAC,QUANTITY,LAW,LANGUAGE
975,Royal couple watch nation's mood\n\nPrince Cha...,2,politics,"[royal, couple, watch, nation, mood, prince, c...","[ORG, DATE, PERSON, ORG, FAC, ORG, CARDINAL, C...",25,8,8,6,3,...,3,0,2,0,0,0,2,0,0,0
1273,Labour in constituency race row\n\nLabour's ch...,2,politics,"[labour, constituency, race, row, labour, choi...","[ORG, CARDINAL, GPE, PERSON, GPE, GPE, CARDINA...",5,9,1,5,7,...,2,0,0,0,1,0,0,0,0,0
2222,Melzer shocks Agassi in San Jose\n\nSecond see...,4,sport,"[melzer, shock, agassi, san, jose, second, see...","[ORG, PERSON, GPE, ORDINAL, PERSON, PERSON, EV...",13,3,1,3,7,...,0,0,0,0,1,0,0,0,0,0
1797,Wright-Phillips to start on right\n\nEngland c...,4,sport,"[wright, phillips, start, right, england, coac...","[PERSON, NORP, ORG, PERSON, GPE, ORG, ORDINAL,...",13,6,3,4,3,...,0,0,0,1,0,0,0,0,0,0
362,Making your office work for you\n\nOur mission...,0,business,"[make, office, work, mission, brighten, work, ...","[DATE, DATE, ORG, GPE, TIME, PERSON, ORG, CARD...",1,3,2,1,2,...,0,0,1,0,0,0,0,0,0,0


## Join tokenized words and change int label to list of a int label

In [21]:
def join_(x):
    """
    The tokenized words will be joined as input for tf-idf.
    """
    return ' '.join(x)

In [22]:
#Apply the function to tokenizatio column, a new column named tokenization_joined is created
#and will be used for TF-IDF algorithm.
dataset_final['tokenization_joined'] = dataset_final['tokenization'].map(join_)

In [23]:
def int_to_list(x):
    """
    This function return int as list for tagged.
    """
    return [x]

In [24]:
#Apply the function to label column, a new column named label_list is created.
dataset_final['label_list'] = dataset_final.label.apply(int_to_list)

## Examine bias

In [25]:
def check_words_gender(x):
    """
    This function check if certain gender is appeared in the tokenization
    """
    if "woman" in x or "female" in x:
        return "woman"
    elif "man" in x or "male" in x:
        return "man"
    else:
        return "none"

In [26]:
#apply the function to tokenization column,a new column named check_words is created.
dataset_final['check_words'] = dataset_final['tokenization'].apply(check_words_gender)

In [27]:
#analyze the data
dataset_final.pivot_table(index=['label_en'],
                          columns=['check_words'],
                          values=['content'],
                          aggfunc=lambda x: len(x))

Unnamed: 0_level_0,content,content,content
check_words,man,none,woman
label_en,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
business,22,474,14
entertainment,62,265,59
politics,60,326,31
sport,80,388,43
tech,32,344,25


## Split the dataset into train, val and test

In [28]:
#select x and y for train test
X = dataset_final.drop(columns=['label','label_en','label_list'])
y = dataset_final[['label','label_list']]

In [29]:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# train is now 75% of the entire data set
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=1 - train_ratio)

# test is now 10% of the initial data set
# validation is now 15% of the initial data set
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio)) 

In [30]:
#merge the X with y again for feature engineering and other purposes
X_train_df = x_train.reset_index().merge(y_train.reset_index(),on=['index'])
X_val_df = x_val.reset_index().merge(y_val.reset_index(),on=['index'])
X_test_df = x_test.reset_index().merge(y_test.reset_index(),on=['index'])

# Tf-Idf(term frequency - inverse document frequency)

## test max features on validaiton test

In [31]:
#max features list for testing
tfidf_max_features = [400,800,1000,2000,4000,8000,10000,12000,14000,18000,20000,40000,60000,80000]

Below code cell is commented out because it takes time to compute.
Remove comment mark if execution is needed.

In [32]:
# test max_features parameter using XGBoost classification algorithm
"""
for feature_num in tfidf_max_features:
    
    tfidf_vec = TfidfVectorizer(ngram_range=(1,2), 
                                max_features=feature_num,
                               lowercase=False)
    X_tfidf = tfidf_vec.fit_transform(X_train_df['tokenization_joined'])
    X_val_tfidf = tfidf_vec.transform(X_val_df['tokenization_joined'])
    
    xgb_multi_class = XGBClassifier()
    xgb_multi_class.fit(X_tfidf,X_train_df['label'])
    
    y_pred = xgb_multi_class.predict(X_val_tfidf)
    y_test = X_val_df['label']

    score = precision_score(y_test,y_pred,average='macro')
    print("max feature is:{0}, macro averaged precision is:{1}".format(feature_num,score))
"""

'\nfor feature_num in tfidf_max_features:\n    \n    tfidf_vec = TfidfVectorizer(ngram_range=(1,2), \n                                max_features=feature_num,\n                               lowercase=False)\n    X_tfidf = tfidf_vec.fit_transform(X_train_df[\'tokenization_joined\'])\n    X_val_tfidf = tfidf_vec.transform(X_val_df[\'tokenization_joined\'])\n    \n    xgb_multi_class = XGBClassifier()\n    xgb_multi_class.fit(X_tfidf,X_train_df[\'label\'])\n    \n    y_pred = xgb_multi_class.predict(X_val_tfidf)\n    y_test = X_val_df[\'label\']\n\n    score = precision_score(y_test,y_pred,average=\'macro\')\n    print("max feature is:{0}, macro averaged precision is:{1}".format(feature_num,score))\n'

### train tfidf vector

In [33]:
#create an instance
tfidf_vec = TfidfVectorizer(ngram_range=(1,2),
                            max_features=20000,
                           lowercase=False)

In [34]:
#fit and transform training/validation/test sets
X_tfidf = tfidf_vec.fit_transform(X_train_df['tokenization_joined'])
X_val_tfidf = tfidf_vec.transform(X_val_df['tokenization_joined'])
X_test_tfidf = tfidf_vec.transform(X_test_df['tokenization_joined'])

In [35]:
print(X_tfidf.shape,X_val_tfidf.shape,X_test_tfidf.shape)

(1668, 20000) (334, 20000) (223, 20000)


# Paragraph Embedding - doc2vec

In [36]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [37]:
# a function that transforms data to taggeddcoument format
def tag_(x):
    return TaggedDocument(words = x['tokenization'],
                          tags = x['label_list'])

In [38]:
# apply the function to train/val/test datasets
X_train_df['tagged'] = X_train_df[['tokenization','label_list']].apply(tag_,axis=1)
#X_val_df['tagged'] = X_val_df[['tokenization','label_list']].apply(tag_,axis=1)
#X_test_df['tagged'] = X_test_df[['tokenization','label_list']].apply(tag_,axis=1)

In [39]:
#create a function for traning a doc2vec model
def train_doc2vec_model(x, vector_size_):
    #create an instance of Doc2Vec
    doc2vec = Doc2Vec(vector_size = vector_size_)
    #Using training data(x) to build vocabulary
    doc2vec.build_vocab(x)
    #train the model
    doc2vec.train(x,
                     total_examples = doc2vec.corpus_count,
                     epochs = doc2vec.epochs)
    return doc2vec

In [40]:
#create a function to use model to infer_vector and return vector lists
def vec_for_learning(model, tagged_docs):
    try:
        sents = tagged_docs.values
        regressors = [model.infer_vector(doc.words) for doc in sents]
    except:
        regressors = [model.infer_vector(doc) for doc in sents]
    return regressors

## testing parameters

In [41]:
vector_size = [10, 20, 50, 70, 100, 150, 200, 250, 300]

**Below code cell is <span style="color: red;">commented out</span> because it takes time to compute.
<span style="color: red;">Remove comment mark""" """</span> if execution is needed.**

In [42]:
"""
for vector_size_ in vector_size:
    
    doc2vec_model_test = train_doc2vec_model(X_train_df['tagged'],vector_size_)

    X_train_vec = vec_for_learning(doc2vec_model_test, X_train_df['tagged'])
    X_val_vec = vec_for_learning(doc2vec_model_test, X_val_df['tokenization'])
    
    xgb_multi_class = XGBClassifier()
    xgb_multi_class.fit(X_train_vec,X_train_df['label'])

    y_pred = xgb_multi_class.predict(X_val_vec)
    y_test = X_val_df['label']
    
    score = precision_score(y_test,y_pred,average='macro')
    
    print("vector_size is:{0}, macro averaged precision is:{1}".format(vector_size_,score))
"""

'\nfor vector_size_ in vector_size:\n    \n    doc2vec_model_test = train_doc2vec_model(X_train_df[\'tagged\'],vector_size_)\n\n    X_train_vec = vec_for_learning(doc2vec_model_test, X_train_df[\'tagged\'])\n    X_val_vec = vec_for_learning(doc2vec_model_test, X_val_df[\'tokenization\'])\n    \n    xgb_multi_class = XGBClassifier()\n    xgb_multi_class.fit(X_train_vec,X_train_df[\'label\'])\n\n    y_pred = xgb_multi_class.predict(X_val_vec)\n    y_test = X_val_df[\'label\']\n    \n    score = precision_score(y_test,y_pred,average=\'macro\')\n    \n    print("vector_size is:{0}, macro averaged precision is:{1}".format(vector_size_,score))\n'

## training

In [43]:
#train the mode using training set
doc2vec_model = train_doc2vec_model(X_train_df['tagged'],20)

In [44]:
#perform infer_vector for every dataset
X_train_vec = vec_for_learning(doc2vec_model, X_train_df['tagged'])
X_val_vec = vec_for_learning(doc2vec_model, X_val_df['tokenization'])
X_test_vec = vec_for_learning(doc2vec_model, X_test_df['tokenization'])

# combine features

In [45]:
#select NER features for all sets and combine all features
X_train_ner = X_train_df[['PERSON', 'ORG', 'DATE', 'GPE','CARDINAL', 'NORP', 'MONEY', 'ORDINAL', 'PERCENT', 
                          'LOC', 'TIME','EVENT', 'WORK_OF_ART', 'PRODUCT', 'FAC', 'QUANTITY', 'LAW', 'LANGUAGE']]

X_train_final = np.hstack((X_tfidf.toarray(),X_train_vec,X_train_ner))


X_val_ner = X_val_df[['PERSON', 'ORG', 'DATE', 'GPE','CARDINAL', 'NORP', 'MONEY', 'ORDINAL', 'PERCENT', 
                      'LOC', 'TIME','EVENT', 'WORK_OF_ART', 'PRODUCT', 'FAC', 'QUANTITY', 'LAW', 'LANGUAGE']]

X_val_final = np.hstack((X_val_tfidf.toarray(),X_val_vec,X_val_ner))


X_test_ner = X_test_df[['PERSON', 'ORG', 'DATE', 'GPE','CARDINAL', 'NORP', 'MONEY', 'ORDINAL', 'PERCENT', 
                        'LOC', 'TIME','EVENT', 'WORK_OF_ART', 'PRODUCT', 'FAC', 'QUANTITY', 'LAW', 'LANGUAGE']]

X_test_final = np.hstack((X_test_tfidf.toarray(),X_test_vec,X_test_ner))

In [46]:
print(X_train_final.shape,X_val_final.shape,X_test_final.shape)

(1668, 20038) (334, 20038) (223, 20038)


# Feature selection

## f_classif

In [47]:
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

In [48]:
k_num = [1000,2000,3000,4000,5000,6000,7000,8000,9000,10000,11000,12000,13000,
         14000,15000,16000,17000,18000,20000]

In [49]:
X_train_final.shape,X_train_df['label'].shape

((1668, 20038), (1668,))

In [50]:
X_val_final.shape,X_val_df['label'].shape

((334, 20038), (334,))

**Below code cell is <span style="color: red;">commented out</span> because it takes time to compute.
<span style="color: red;">Remove comment mark""" """</span> if execution is needed.**

In [51]:
"""
# test k number parameter on training set and validation set
for num in k_num:
    fs_ = SelectKBest(f_classif,k=num).fit(X_train_final,X_train_df['label'])
    
    X_train_final_new = fs_.transform(X_train_final)
    
    xgb_multi_class = XGBClassifier()
    xgb_multi_class.fit(X_train_final_new,X_train_df['label'])
    
    X_val_final_new = fs_.transform(X_val_final)
    
    y_pred = xgb_multi_class.predict(X_val_final_new)
    y_test = X_val_df['label']
    
    score = precision_score(y_test,y_pred,average='macro')
    
    print("K number is:{0}, macro averaged precision is:{1}".format(num,score))
"""

'\n# test k number parameter on training set and validation set\nfor num in k_num:\n    fs_ = SelectKBest(f_classif,k=num).fit(X_train_final,X_train_df[\'label\'])\n    \n    X_train_final_new = fs_.transform(X_train_final)\n    \n    xgb_multi_class = XGBClassifier()\n    xgb_multi_class.fit(X_train_final_new,X_train_df[\'label\'])\n    \n    X_val_final_new = fs_.transform(X_val_final)\n    \n    y_pred = xgb_multi_class.predict(X_val_final_new)\n    y_test = X_val_df[\'label\']\n    \n    score = precision_score(y_test,y_pred,average=\'macro\')\n    \n    print("K number is:{0}, macro averaged precision is:{1}".format(num,score))\n'

In [52]:
#select use f_classif function,which is ANOVA F-value because the features contain negative number.
#fit the data using X_train
fs_ = SelectKBest(f_classif,k=3000).fit(X_train_final,X_train_df['label'])
#Transform the training/validation/testing data
X_train_final_new = fs_.transform(X_train_final)
X_val_final_new = fs_.transform(X_val_final)
X_test_final_new = fs_.transform(X_test_final)

# Train and test model

## GridSearch best set hyperparameters
**Below code cell is <span style="color: red;">commented out</span> because it takes time to compute.
<span style="color: red;">Remove comment mark""" """</span> if execution is needed.**

In [53]:
"""

param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.5, 0.7, 1]
}

# Create the XGBoost model instance
xgb_model = XGBClassifier()

# Create the GridSearchCV instance
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train_final_new, X_train_df['label'])

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

"""

'\n\nparam_grid = {\n    \'max_depth\': [3, 5, 7],\n    \'learning_rate\': [0.1, 0.01, 0.001],\n    \'subsample\': [0.5, 0.7, 1]\n}\n\n# Create the XGBoost model instance\nxgb_model = XGBClassifier()\n\n# Create the GridSearchCV instance\ngrid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring=\'accuracy\')\n\n# Fit the GridSearchCV object to the training data\ngrid_search.fit(X_train_final_new, X_train_df[\'label\'])\n\n# Print the best set of hyperparameters and the corresponding score\nprint("Best set of hyperparameters: ", grid_search.best_params_)\nprint("Best score: ", grid_search.best_score_)\n\n'

## Train XGBClassifier model

In [54]:
#Create an instance of XGBClassifier
xgb_multi_class = XGBClassifier(learning_rate = 0.1,max_depth=3,subsample=0.5)

#Fit new train data and corresponded label
xgb_multi_class.fit(X_train_final_new, X_train_df['label'])

#predict on test set
y_pred = xgb_multi_class.predict(X_test_final_new)

#y true
y_test = X_test_df['label']

In [55]:
full_report = classification_report(y_test, y_pred)

print(full_report)

              precision    recall  f1-score   support

           0       0.89      0.95      0.92        57
           1       0.94      0.98      0.96        45
           2       0.95      0.93      0.94        41
           3       0.97      0.88      0.92        34
           4       1.00      0.96      0.98        46

    accuracy                           0.94       223
   macro avg       0.95      0.94      0.94       223
weighted avg       0.94      0.94      0.94       223



# K-Fold cross-validation

In [56]:
#construct a new X(contains features) and y(contains labels)
X_kfold = dataset_final.drop(columns=['label','label_en'])
y_kfold = dataset_final[['label','label_list']]

#create an instance of KFold and assign n_splits = 5,meaning there will be five fold cross-validation.
kf = KFold(n_splits=5)

#create four variables that will hold the results the cross-validation outputs
avg_accuracy = 0
macro_avg_precision = 0
macro_avg_recall = 0
macro_avg_f1 = 0

In [57]:
for i, (train_index, test_index) in enumerate(kf.split(X)):
    
    """
    split the data into train and test subsets
    """
    X_train_new, X_test_new = X_kfold.iloc[train_index] , X_kfold.iloc[test_index]
    y_train_new, y_test_new = y_kfold.iloc[train_index] , y_kfold.iloc[test_index]
    
    """
    train tf-idf feature
    """
    # create TfidfVectorizer
    tfidf_vec_kf = TfidfVectorizer(ngram_range=(1,2),max_features=20000,lowercase=False)
    #fit and transform X_train
    X_train_tfidf_kf = tfidf_vec_kf.fit_transform(X_train_new['tokenization_joined'])
    #transform X_test
    X_test_tfidf_kf = tfidf_vec_kf.transform(X_test_new['tokenization_joined'])
    
    """
    train doc2vec feature
    """
    X_train_new['tagged'] = X_train_new[['tokenization','label_list']].apply(tag_,axis=1)
    
    model_test = Doc2Vec(vector_size=20)
    #build vocabulary
    model_test.build_vocab(X_train_new['tagged'])
    #train the model
    model_test.train(X_train_new['tagged'],
                     total_examples = model_test.corpus_count,
                     epochs = model_test.epochs)
    #infer the vector for train and test datasets
    X_train_vec_kf = vec_for_learning(model_test, X_train_new['tagged'])
    X_test_vec_kf = vec_for_learning(model_test, X_test_new['tokenization'])
    
    """
    select NER feature
    """
    columns = ['PERSON', 'ORG', 'DATE', 'GPE','CARDINAL', 'NORP', 'MONEY', 'ORDINAL', 'PERCENT', 'LOC', 'TIME','EVENT', 
                'WORK_OF_ART', 'PRODUCT', 'FAC', 'QUANTITY', 'LAW', 'LANGUAGE']
    X_train_ner_kf = X_train_new[columns]
    X_test_ner_kf = X_test_new[columns]
    
    """
    combine features
    """
    X_train_final_kf = np.hstack((X_train_tfidf_kf.toarray(),X_train_vec_kf,X_train_ner_kf))
    X_test_final_kf = np.hstack((X_test_tfidf_kf.toarray(),X_test_vec_kf,X_test_ner_kf))
    
    """
    feature selection
    """
    
    fs_kf = SelectKBest(f_classif,k=3000).fit(X_train_final_kf,y_train_new['label'])
    X_train_final_new_kf = fs_kf.transform(X_train_final_kf)
    X_test_final_new_kf = fs_kf.transform(X_test_final_kf)
    
    """
    train the model
    """
    xgb_multi_class = XGBClassifier(learning_rate = 0.1,max_depth = 3,subsample = 0.5)
    xgb_multi_class.fit(X_train_final_new_kf, y_train_new['label'])
    
    """
    predict 
    """
    
    y_pred_kf = xgb_multi_class.predict(X_test_final_new_kf)
    y_test_kf = y_test_new['label']
    
    """
    accumulate results
    """
    avg_accuracy  += accuracy_score(y_test_kf, y_pred_kf)
    macro_avg_precision += precision_score(y_test_kf, y_pred_kf,average='macro')
    macro_avg_recall += recall_score(y_test_kf, y_pred_kf,average='macro')
    macro_avg_f1 += f1_score(y_test_kf, y_pred_kf,average='macro')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_new['tagged'] = X_train_new[['tokenization','label_list']].apply(tag_,axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_new['tagged'] = X_train_new[['tokenization','label_list']].apply(tag_,axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_new['tagged'] = X_train_n

In [58]:
print("avg accuracy is:",avg_accuracy/5)
print("macro avg precision is:",macro_avg_precision/5)
print("macro avg recall is:",macro_avg_recall/5)
print("macro avg f1 is:",macro_avg_f1/5)

avg accuracy is: 0.9608988764044943
macro avg precision is: 0.9600561379206676
macro avg recall is: 0.9610578007471011
macro avg f1 is: 0.9603562922634108
