In [1]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv(r"C:\Users\ayush\Desktop\Toxic Comments\train.csv")
data = data.iloc[:20000]
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
comment = data['comment_text']
print(comment[:10])

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
5    "\n\nCongratulations from me as well, use the ...
6         COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
7    Your vandalism to the Matt Shirvington article...
8    Sorry if the word 'nonsense' was offensive to ...
9    alignment on this subject and which are contra...
Name: comment_text, dtype: object


In [4]:
#using regex to clean the data

# Replace numbers with 'number'
processed = comment.str.replace(r'\d+(\.\d+)?', 'number')

#Remove \n by white pace
processed = processed.str.replace(r'(\n+)(?=[A-Z])', r' ')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')

# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')





In [5]:
# change words to lower case - Hello, HELLO, hello are all the same word
processed = processed.str.lower()
print(processed)

0        explanation why the edits made under my userna...
1        d aww he matches this background colour i m se...
2        hey man i m really not trying to edit war it s...
3        more i can t make any real suggestions on impr...
4        you sir are my hero any chance you remember wh...
                               ...                        
19995    support asadullah is only two sentences and it...
19996    make me lost my faith about wikipedia so calle...
19997    notability of dem number boyz a tag has been p...
19998    because i didnt do anything wrong in the first...
19999    if you could explain how any private company c...
Name: comment_text, Length: 20000, dtype: object


In [6]:
from nltk.corpus import stopwords

# remove stop words from text messages

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

In [7]:
from nltk.tokenize import word_tokenize

In [8]:
# create bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [9]:
# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 52965
Most common words: [('number', 19682), ('article', 7027), ('page', 6060), ('wikipedia', 6023), ('talk', 4555), ('one', 3721), ('would', 3670), ('please', 3575), ('like', 3504), ('see', 2785), ('also', 2561), ('know', 2397), ('think', 2361), ('edit', 2336), ('people', 2197)]


In [10]:
# let's take first 5000 most common words as features
word_features = list(all_words.keys())[:1500]

In [11]:
# The find_features function will determine which of the 1500 word features are contained in the review
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features

# Lets see an example!
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

explanation
edits
made
username
hardcore
metallica
fan
reverted
vandalisms
closure
gas
voted
new
york
dolls
fac
please
remove
template
talk
page
since
retired
number


In [12]:
list_classes = ["toxic"]
y = data[list_classes].values

In [13]:
messages = list(zip(processed, y))

# define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)


In [14]:
# call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

In [15]:
# we can split the featuresets into training and testing datasets using sklearn
from sklearn import model_selection

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.20, random_state=seed)

In [16]:
print(len(training))
print(len(testing))

16000
4000


In [17]:
# using sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

  y = column_or_1d(y, warn=True)


SVC Accuracy: [94.05]


In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

  y = column_or_1d(y, warn=True)


K Nearest Neighbors Accuracy: [91.25]


  y = column_or_1d(y, warn=True)


Decision Tree Accuracy: [89.55]


  y = column_or_1d(y, warn=True)


Random Forest Accuracy: [93.225]


  y = column_or_1d(y, warn=True)


Logistic Regression Accuracy: [94.125]
SGD Classifier Accuracy: [94.15]
Naive Bayes Accuracy: [93.225]


  y = column_or_1d(y, warn=True)


SVM Linear Accuracy: [94.05]


In [20]:
# Ensemble methods - Voting classifier
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: [94.05]


In [21]:
# make class label prediction for testing set
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

In [24]:
# print a confusion matrix and a classification report
print(classification_report(labels, prediction))


              precision    recall  f1-score   support

           0       0.95      0.99      0.97      3603
           1       0.87      0.48      0.62       397

    accuracy                           0.94      4000
   macro avg       0.91      0.74      0.80      4000
weighted avg       0.94      0.94      0.93      4000



In [29]:
pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,3575,28
actual,spam,205,192
