In [1]:
import pandas as pd
import numpy as np

# Below option allows us to see the entire comment_text column
pd.set_option('display.max_colwidth', None)
# Read in the dataset
train = pd.read_csv("../../data/kaggle_train.csv")
train = train.drop(columns=['id'])

labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

print("Stats of training set: ", train.shape)
print("Labels:", labels)

Stats of training set:  (159571, 7)
Labels: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [2]:
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"Explanation\r\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nMore\r\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0


# Text Preprocessing

Below I have noticed some inconsistencies in the data and by preprocessing it, we can ensure a clean dataset.

In [3]:
# Convert comment to lowercase
def to_lowercase(text):
    return text.lower()

train['comment_text'] = train['comment_text'].apply(to_lowercase)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27",0,0,0,0,0,0
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nmore\r\ni can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it's listed in the relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember what page that's on?",0,0,0,0,0,0


In [4]:
import re
# Remove HTML tags from the comments
def remove_html(text):
    return re.sub(r"<.*>", "", text, flags=re.MULTILINE)
    
train['comment_text'] = train['comment_text'].apply(remove_html)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27",0,0,0,0,0,0
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nmore\r\ni can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it's listed in the relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember what page that's on?",0,0,0,0,0,0


In [5]:
# Remove links from the comments
def remove_links(text):
    text= re.sub(r"http\S+"," ",text, flags=re.MULTILINE)
    return re.sub(r"www\S+"," ",text, flags=re.MULTILINE)

train['comment_text'] = train['comment_text'].apply(remove_links)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,"explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27",0,0,0,0,0,0
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0
2,"hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,"""\r\nmore\r\ni can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it's listed in the relevant form eg wikipedia:good_article_nominations#transport """,0,0,0,0,0,0
4,"you, sir, are my hero. any chance you remember what page that's on?",0,0,0,0,0,0


In [6]:
import string
# Remove punctuation marks 
def remove_punctuation(text):
    for i in string.punctuation:
        text = text.replace(i, "")
    return text

train['comment_text'] = train['comment_text'].apply(remove_punctuation)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation\r\nwhy the edits made under my username hardcore metallica fan were reverted they werent vandalisms just closure on some gas after i voted at new york dolls fac and please dont remove the template from the talk page since im retired now892053827,0,0,0,0,0,0
1,daww he matches this background colour im seemingly stuck with thanks talk 2151 january 11 2016 utc,0,0,0,0,0,0
2,hey man im really not trying to edit war its just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info,0,0,0,0,0,0
3,\r\nmore\r\ni cant make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if noone else does first if you have any preferences for formatting style on references or want to do it yourself please let me know\r\n\r\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up its listed in the relevant form eg wikipediagoodarticlenominationstransport,0,0,0,0,0,0
4,you sir are my hero any chance you remember what page thats on,0,0,0,0,0,0


In [7]:
# Remove special characters such as: \n \r \t
def remove_special(text):
    return re.sub(r"[\n\t\\\/\r]"," ",text, flags=re.MULTILINE)

train['comment_text'] = train['comment_text'].apply(remove_special)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation why the edits made under my username hardcore metallica fan were reverted they werent vandalisms just closure on some gas after i voted at new york dolls fac and please dont remove the template from the talk page since im retired now892053827,0,0,0,0,0,0
1,daww he matches this background colour im seemingly stuck with thanks talk 2151 january 11 2016 utc,0,0,0,0,0,0
2,hey man im really not trying to edit war its just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info,0,0,0,0,0,0
3,more i cant make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if noone else does first if you have any preferences for formatting style on references or want to do it yourself please let me know there appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up its listed in the relevant form eg wikipediagoodarticlenominationstransport,0,0,0,0,0,0
4,you sir are my hero any chance you remember what page thats on,0,0,0,0,0,0


In [8]:
# Remove stopwords using nltk's stopwords package
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

train['comment_text'] = train['comment_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
train.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation edits made username hardcore metallica fan reverted werent vandalisms closure gas voted new york dolls fac please dont remove template talk page since im retired now892053827,0,0,0,0,0,0
1,daww matches background colour im seemingly stuck thanks talk 2151 january 11 2016 utc,0,0,0,0,0,0
2,hey man im really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info,0,0,0,0,0,0
3,cant make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc later noone else first preferences formatting style references want please let know appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipediagoodarticlenominationstransport,0,0,0,0,0,0
4,sir hero chance remember page thats,0,0,0,0,0,0


In [9]:
# As you can see above, there are numbers and/or dates
# I will remove those as they are not helpful

def remove_numbers(text):
    return re.sub(r'\d'," ",text, flags=re.MULTILINE)

train['comment_text'] = train['comment_text'].apply(remove_numbers)
train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation edits made username hardcore metallica fan reverted werent vandalisms closure gas voted new york dolls fac please dont remove template talk page since im retired now,0,0,0,0,0,0
1,daww matches background colour im seemingly stuck thanks talk january utc,0,0,0,0,0,0
2,hey man im really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info,0,0,0,0,0,0
3,cant make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc later noone else first preferences formatting style references want please let know appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipediagoodarticlenominationstransport,0,0,0,0,0,0
4,sir hero chance remember page thats,0,0,0,0,0,0


# Train, Test, and Validation Split

Below I need to split the dataset into train and test datasets.
However, sklearn's `train_test_split` function does not work for
multi-class classification.

Therefore, I will be creating a train, test, and validation split for each label in the classification.

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train[["comment_text"]], train[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]], test_size=0.20)
X_train

Unnamed: 0,comment_text
88613,idea blocked need go process sort either party blocked continue wrong others wrong continue thing caused block im also concerned admin go wpani write things untrue incident without censure
122580,sorry man sufferering schizophrenia
47891,go ahead change wiki everybody improve
16438,case someone thinks im wakefield apologist first changed discredited much stronger wording
42388,see formally nominated article deletion please state case discussion others respond general tried use registration bar notability political parties cases palmer united waived pup got major nationwide coverage months registered also two queensland state mps within ranks quickly new country party add extraordinarily unlikely achieve registration current name receive kind coverage get defectors ranks might make exception meantime simply case wptoosoon know long time covering politics many kinds parties fade away quickly without ever anything much im saying happen given party less month old think soon including land issue useful addition abc combine make claim serious coverage one inclined accept others might hope unduly discouraged encourage state case afd discussion linked
...,...
145570,glyph section could truncated far see equal justification way suggest god hes seen light jokes aside making philosophical implications threeness major point article inviting come along argue religion natural philosophy centered around number five means youre wrong know youre wrong priest docktor spent much time studying mystickal properties number relates number number important think overall philosophy overall much merit mr pierce whether interpret praise towards insult towards late cs entirely ultimately neither place page asked list stories number three title also doesnt belong maybe disambig page mean literature three little pigs three bears three blind mice etc three bears popular russian chocolate candy include well see cases numeral used common classification systems periodic table even positions sports first baseman literally referred phrases double play things like three witches macbeth seem much eehhhhhhhh maybe issue argued whole wikinumbers project issue page stumbled months ago actually completely ridiculous amount unnecessary marginally significant horribly formatted content actually want website good useful instead godawful pile crap sorry ot thats happens get going apologize advance certain individuals sensitive hear truth
156881,usage sorry late response youll look usage guide template intended pullquotes wpmosquotations recommends html element template equally effective misuse pet peeve mine wiki — talk • contribs
71961,lol well post ani thread response bbb s rr analogy seems shame let go waste particularly seems prescient heres even better analogy user makes reverts hours without valid reason user b tells violation rr user tells wasnt gives whole list gobbledygook links kind justification reverts normal user looks like nonsense borne fundemental failure understand rr links threw user proceeds make reverts user b tells hes incompetent advises buck ideas user gets upset comes ani user b done nothing wrong stands actions users cdefgh form kangaroo court expressing concern whatsoever user proceed focus solely user b full self importance chance fuck someone heavy doses selective hearingreading probable outcome user b productive editor decides wikipedia basket case completely backwards priorities lost project forever either force choice user continues incompent editor messes whole bunch articles propogates poor understanding policy even less experienced users spreading random talk pages long term user either finally learns ropes doubt destroyed unknowable amount content confused unknowable amount inexperienced editors meantime screw get blocked applying incompetence disupte involves power user finally gets block deserve still doubt idea theyve done wrong heres kicker incompetent along im fun editting articles others dont know blp loaf bread get way highlight dismissed nothing content dispute good job everyone
40185,deleted article dear disconspinster see youre pretty efficient contributorfixer wiki wellrecognised well article interrelated demand supply deleted redirected additional information present anywhere wikipedia focuses definition different types relationships complementary goods course original stuff wrote interrelated demand supply probably go somewhere elasticity complementary goods wanted provide information without sullying neat organized data two pages id created another page hope someone tidies bit merges articles didnt expect get deleted though im going revert original back hope someone better economic knowledge come along merge somewhere since im new wiki formatting thing


In [11]:
print("Train shape:",X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (127656, 1)
Test shape: (31915, 1)


# Vectorizing the Comment Text

*Logistic Regression can't take text values as input*

Since the independent variable I have is only text, we will need to use a vectorizer to convert the text into usable data for Logistic Regression.

```

# Max_features = Build a vocabulary that only consider the top max_features ordered by term frequency

# Analyzer = Whether the feature should be made of word or character n-grams. Option ‘char_wb’ creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space.

# ngram_range = (1,1) means only unigrams, (1,2) means unigrams and bigrams, (1,3) means unigrams, bigrams, and trigrams

# Further ngrams knowledge = bigrams means it will learn the occurence of every two words, trigrams would be every 3, etc.

# dtype = type of the matrix returned, default is float64
```

We will use a word and char n-grams as some people like to obfuscate words by using multiple characters, by using both we can hope to catch these.
The idea from this came from [here](https://www.kaggle.com/code/tunguz/logistic-regression-with-words-and-char-n-grams/comments) which has one of the best results for Logistic Regression. This user optimized the ngram_range.

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion

tfidf_word = TfidfVectorizer(max_features=1000, analyzer="word", ngram_range=(1,3), dtype=np.float32)
tfidf_char = TfidfVectorizer(max_features=1000, analyzer="char", ngram_range=(3,6), dtype=np.float32)

In [13]:
from scipy.sparse import hstack

X_train_word = tfidf_word.fit_transform(X_train["comment_text"])
X_test_word = tfidf_word.fit_transform(X_test["comment_text"])

X_train_char = tfidf_char.fit_transform(X_train["comment_text"])
X_test_char = tfidf_char.fit_transform(X_test["comment_text"])

X_train = hstack([X_train_char, X_train_word])
X_test = hstack([X_test_char, X_test_word])

# Importing Keras Libraries

In [14]:
# Importing libraries
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from tensorflow.keras.layers import Embedding
from keras.utils import pad_sequences


# Building the Model

In [17]:
embedding_dim = 64

model = Sequential()
model.add(Embedding(X_train.shape[0], embedding_dim, input_length=2000))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dense(6, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 2000, 64)          8169984   
                                                                 
 conv1d_1 (Conv1D)           (None, 1996, 128)         41088     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 6)                 390       
                                                                 
Total params: 8,219,718
Trainable params: 8,219,718
Non-trainable params: 0
____________________________________________

In [18]:
from keras.callbacks import EarlyStopping

results = {}

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, restore_best_weights=True, patience=5)
history = model.fit(np.asarray(X_train.toarray()), y_train.values,
                    epochs=50,
                    validation_data=(np.asarray(X_train.toarray()), y_train.values),
                    batch_size=10,
                    callbacks=[es])

loss, training_accuracy = model.evaluate(np.asarray(X_train.toarray()), y_train.values)
print(f"Training Accuracy: {training_accuracy}")

loss, testing_accuracy = model.evaluate(np.asarray(X_test.toarray()), y_test.values)
print(f" Testing Accuracy:  {testing_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 12: early stopping
Training Accuracy: 0.994163990020752
 Testing Accuracy:  0.9941720366477966


In [19]:
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix

predict = model.predict(np.asarray(X_test.toarray()))



In [20]:
predict

array([[0.09763435, 0.00966826, 0.0533089 , 0.00260989, 0.04747717,
        0.00805398],
       [0.09763435, 0.00966826, 0.0533089 , 0.00260989, 0.04747717,
        0.00805398],
       [0.16527717, 0.01800706, 0.08659099, 0.00299421, 0.08134115,
        0.01254456],
       ...,
       [0.09763435, 0.00966826, 0.0533089 , 0.00260989, 0.04747717,
        0.00805398],
       [0.09763435, 0.00966826, 0.0533089 , 0.00260989, 0.04747717,
        0.00805398],
       [0.09763435, 0.00966826, 0.0533089 , 0.00260989, 0.04747717,
        0.00805398]], dtype=float32)

In [21]:
y_predicted = []
for i in predict:
    test = []
    for j in i:
        if j > 0.5:
            test.append(1)
        else:
            test.append(0)
    y_predicted.append(test)

# y_predicted = np.argmax(predict, axis=1)
# y_predicted

In [22]:
for i in y_predicted:
    print(i)

[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0]
[0, 0, 0, 0,

In [23]:
from sklearn.metrics import multilabel_confusion_matrix
print(multilabel_confusion_matrix(y_test, y_predicted))

[[[28785     0]
  [ 3130     0]]

 [[31594     0]
  [  321     0]]

 [[30185     0]
  [ 1730     0]]

 [[31817     0]
  [   98     0]]

 [[30366     0]
  [ 1549     0]]

 [[31642     0]
  [  273     0]]]


# Saving the Best Model

In [24]:
model.save(filepath="F:/Thesis/models/cnn/cnn")



INFO:tensorflow:Assets written to: F:/Thesis/models/cnn/cnn\assets


INFO:tensorflow:Assets written to: F:/Thesis/models/cnn/cnn\assets


# Loading the Saved Models

In [None]:
# keras.models.load_model("F:/Thesis/models/cnn/cnn")