### Importing Libraries

In [1]:
import pandas as pd
import sklearn
import re
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
nltk.download('stopwords')

nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
#reading the test data
df_test= pd.read_csv('test.csv')

FileNotFoundError: [Errno 2] File test.csv does not exist: 'test.csv'

In [None]:
df_test

In [None]:
#Reading the train data
df_train= pd.read_csv('train.csv')

In [None]:
df_train

In [None]:
df_train.describe()

In [None]:
df_test.describe()

In [None]:
df_train.isnull().sum()

In [None]:
df_test.isnull().sum()

In [None]:
#Visualizing the data set

In [None]:
# Pick out the six label columns of the training frame and total each one.
label_columns = ['toxic', 'severe_toxic', 'obscene',
                 'threat', 'insult', 'identity_hate']
train_labels = df_train.loc[:, label_columns]
label_count = train_labels.sum(axis=0)

# Bar chart of the per-column totals (last expression, so the axes are rendered).
label_count.plot.bar(title='Labels Frequency', rot=0, color='y')

The chart shows that the number of comments per category is highly imbalanced. Next, we check how many labels each comment carries, including how many comments have no label at all.

In [None]:
# Row-wise total over the columns at positions 2..7
# (presumably the six label columns -- confirm against the CSV layout).
rowsums = df_train.iloc[:, 2:8].sum(axis=1)
# How often each distinct per-comment total occurs.
valcount = rowsums.value_counts()

valcount.plot(kind='bar')
plt.title("Comments that have multiple labels tagged")
plt.xlabel("# of labels tagged to")
plt.ylabel("# of comments")
plt.show()

# Share of comments whose row total is 0, expressed as a percentage.
unlabeled_pct = valcount[0] * 100 / sum(valcount)
print(unlabeled_pct, "% comments have no labels associated to them.")

In [None]:
df_train.comment_text.shape

In [None]:
df_test.comment_text.shape

In [None]:
print(df_train.comment_text[9])
print(df_test.comment_text[9])

### Data Preprocessing

In [None]:

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
ps = PorterStemmer()  # not used in this cell; kept in case another cell references it
wordnet = WordNetLemmatizer()

# Build the stop-word set once, outside the loop. Constructing it inside the
# comprehension repeats the same work for every single word of every comment.
english_stop_words = set(stopwords.words('english'))

# corpus[k] is the cleaned text of the k-th training comment.
corpus = []
# Iterate over the values directly so the loop does not depend on index labels.
for comment in df_train['comment_text']:
    # Replace every non-letter character with a space, lowercase, split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', comment).lower().split()
    # Drop stop words and lemmatize the remaining tokens.
    kept = [wordnet.lemmatize(tok) for tok in tokens if tok not in english_stop_words]
    corpus.append(' '.join(kept))

In [None]:
corpus[9]

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, built once and cached."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text, lemmatizer=None, stop_words=None):
    """Clean one comment the same way the training corpus is cleaned.

    Steps: replace non-letter characters with spaces, lowercase, split on
    whitespace, drop stop words, lemmatize each remaining token, and join the
    tokens back together with single spaces.

    Parameters
    ----------
    text : str
        Raw comment text.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word) -> str``. Defaults to the
        notebook-level ``wordnet`` lemmatizer.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        The cleaned, space-separated text (empty string if nothing remains).
    """
    if lemmatizer is None:
        lemmatizer = wordnet
    if stop_words is None:
        stop_words = _english_stop_words()

    # Same character filter as the training-corpus loop, so both data sets
    # go through identical cleaning before vectorization.
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    kept = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words]
    # Join with a space: an empty separator would fuse every token into one word.
    return ' '.join(kept)
    

In [None]:
df_test['comment_text'] = df_test['comment_text'].apply(preprocess_text)

In [None]:
df_train['comment_text']=corpus

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [None]:
# Hold out 33% of df_train for evaluation; the fixed seed makes the split repeatable.
categories = ['toxic', 'severe_toxic', 'obscene',
              'threat', 'insult', 'identity_hate']
train, test = train_test_split(
    df_train,
    test_size=0.33,
    shuffle=True,
    random_state=42,
)

# The model input is the comment text column of each partition.
X_train, X_test = train['comment_text'], test['comment_text']
for features in (X_train, X_test):
    print(features.shape)

## Multinomial NB

In [None]:
# Define a pipeline combining a text feature extractor with a multi-label classifier.
# TfidfVectorizer documents stop_words as 'english', a list, or None; recent
# scikit-learn releases validate this and reject a set, so pass a list instead.
NB_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
])
# The same pipeline object is refit for each category, so once the loop ends
# NB_pipeline holds the model fitted on the last category only.
for category in categories:
    print('... Processing {}'.format(category))
    # train on the training comments against this category's label column
    NB_pipeline.fit(X_train, train[category])
    # compute the accuracy on the held-out comments
    prediction = NB_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))


## LinearSVC

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# TF-IDF features feeding a one-vs-rest linear SVM.
# TfidfVectorizer documents stop_words as 'english', a list, or None; recent
# scikit-learn releases validate this and reject a set, so pass a list instead.
SVC_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])
# The same pipeline object is refit for each category, so once the loop ends
# SVC_pipeline holds the model fitted on the last category only.
for category in categories:
    print('... Processing {}'.format(category))
    # train on the training comments against this category's label column
    SVC_pipeline.fit(X_train, train[category])
    # compute the accuracy on the held-out comments
    prediction = SVC_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))