In [None]:
import pandas as pd
# Read the zipped training CSV into a DataFrame and preview the first rows
train_csv_path = '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts, memory usage)
df.info()

# EDA

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Column-wise sum of everything from the third column onward; the resulting
# index holds the column names and the values hold the per-column totals.
label_totals = df.iloc[:, 2:].sum()

plt.figure(figsize=(10,6))
sns.barplot(x = list(label_totals.index), y = list(label_totals))
plt.title('Total comments in each category')
# The x axis carries the category names and the y axis the counts
# (the two labels were previously swapped).
plt.xlabel('Types of comments')
plt.ylabel('Number of comments')

In [None]:
# Count the comments that carry more than one label.
# A single vectorised row-wise sum replaces the Python loop that performed a
# separate .iloc lookup and sum for every row of the frame.
labels_per_comment = df.iloc[:, 2:].sum(axis=1)
count = int((labels_per_comment > 1).sum())

print('Total multi-labeled comments:',count)

In [None]:
# Distribution of labels per comment: index = number of labels on a comment,
# value = number of comments with that many labels.
multi_count = df.iloc[:,2:].sum(axis=1).value_counts()

# Remove the zero-label bucket by its key instead of by position.
# value_counts() orders by frequency, so slicing with [1:] only dropped the
# zero-label bucket while that bucket happened to be the most frequent one.
# NOTE: comments with exactly one label are still included, as before.
labelled_counts = multi_count[multi_count.index != 0].sort_index()

plt.figure(figsize=(10,6))
sns.barplot(x = labelled_counts.index, y = labelled_counts.values)
plt.title('Multi-Labelled comments')
plt.ylabel('No. of multi-label comments')
plt.xlabel('number of labels')

In [None]:
from wordcloud import WordCloud, STOPWORDS

plt.figure(figsize=(10,8))
text = df.comment_text.values

# Join the comments with a space: concatenating them with an empty separator
# fuses the last word of one comment with the first word of the next, which
# creates words that never occurred in the data.
cloud = WordCloud(stopwords=STOPWORDS,
                  background_color='black',
                  width=2500,
                  height=1800).generate(' '.join(text))
plt.imshow(cloud)

# Cleaning the text data:

In [None]:
import re
import string
from nltk.corpus import stopwords

# Cache for the stop-word set so the NLTK corpus is read once instead of once
# per token.
_STOPWORD_CACHE = {}

# Translation table that deletes every character of string.punctuation
# (this includes the apostrophe).
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _english_stopwords():
    '''Return the English NLTK stop words as a frozenset, loading them on first use.'''
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(x):
    '''
    Normalise one raw comment.

    Newlines are replaced by spaces, every punctuation character listed in
    string.punctuation (apostrophes included) is deleted, and words whose
    lower-cased form is an English stop word are dropped. The remaining
    words keep their original case and are joined with single spaces.

    Parameters
    ----------
    x : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned comment.
    '''
    text = re.sub(r'\n', r' ', x)
    # Apostrophes are part of string.punctuation, so no separate pass is needed.
    text = text.translate(_PUNCT_TABLE)
    stop_set = _english_stopwords()
    return ' '.join(word for word in text.split() if word.lower() not in stop_set)

In [None]:
import datetime
# Time the cleaning pass over every raw comment
start = datetime.datetime.now()
df['clean_comments'] = df['comment_text'].apply(clean_text)
print('Time consumed by text cleaning operation: ', datetime.datetime.now() - start)

In [None]:
from nltk.stem.snowball import SnowballStemmer
# Snowball stemmer configured for English; stemm() below calls stemmer.stem()
stemmer = SnowballStemmer('english')

def stemm(sentence):
    '''
    Stem every whitespace-separated word of ``sentence``.

    Each word is passed through the module-level ``stemmer`` and followed by
    a single space, so a non-empty result ends with a trailing space, while
    an empty or whitespace-only sentence yields ''.
    '''
    return ''.join(stemmer.stem(token) + ' ' for token in sentence.split())

In [None]:
import datetime
# Time the stemming pass.
# NOTE: 'clean_comments' is overwritten in place, so re-running this cell
# stems text that has already been stemmed.
start = datetime.datetime.now()
df['clean_comments'] = df['clean_comments'].apply(stemm)
print('Time consumed by stemming operation: ', datetime.datetime.now() - start)

# Text before and after applying stemming and cleaning text:

In [None]:
# First comment as loaded, a separator line, then the same comment after
# cleaning and stemming
before_after = (df['comment_text'][0], '-'*40, df['clean_comments'][0])
print(*before_after, sep='\n')

### The TF-IDF score of a word is the product of how frequently the word occurs in a document (term frequency) and how rare the word is across the entire corpus of documents (inverse document frequency).

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the rows first so the TF-IDF vocabulary and IDF weights are learned
# from the training comments only. Fitting the vectorizer on the full corpus
# before splitting leaks test-set statistics into the training features.
text_train, text_test, y_train, y_test = train_test_split(df['clean_comments'],
                                                          df.drop(['id','comment_text','clean_comments'],axis=1),
                                                          test_size=0.33,
                                                          random_state=42)

vectrizer = TfidfVectorizer(analyzer='word',ngram_range=(1,3))
# Learn the vocabulary on the training text, then reuse it unchanged for the
# test text.
X_train = vectrizer.fit_transform(text_train)
X_test = vectrizer.transform(text_test)

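### In a “one-vs-rest” strategy, we build multiple independent binary classifiers, one per label. In single-label multiclass problems, the class whose classifier is most confident is chosen; in a multi-label problem like this one, each classifier decides independently whether its label applies, so a comment can receive several labels or none.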

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss, accuracy_score

# Wrap a logistic regression in a one-vs-rest classifier and fit it on the
# training split
model = OneVsRestClassifier(estimator=LogisticRegression()).fit(X_train, y_train)
prediction = model.predict(X_test)

# Report both metrics for the test split
for caption, metric in (('Accuracy Score: ', accuracy_score),
                        ('hamming loss : ', hamming_loss)):
    print(caption, metric(y_test, prediction))