In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
data = pd.read_csv('/content/FinalBalancedDataset.csv')

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56745 entries, 0 to 56744
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  56745 non-null  int64 
 1   Toxicity    56745 non-null  int64 
 2   tweet       56745 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.3+ MB


In [None]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,Toxicity,tweet
0,0,0,@user when a father is dysfunctional and is s...
1,1,0,@user @user thanks for #lyft credit i can't us...
2,2,0,bihday your majesty
3,3,0,#model i love u take with u all the time in ...
4,4,0,factsguide: society now #motivation


In [None]:
data = data.drop("Unnamed: 0", axis=1)

In [None]:
data['Toxicity'].value_counts()

Unnamed: 0_level_0,count
Toxicity,Unnamed: 1_level_1
0,32592
1,24153


# **Lemmatizer**
1. Leaves
2. Leafs
Leaf

# **Text pre-processing**

In [None]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Download necessary NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('universal_tagset')  # Download universal tagset
nltk.download('omw-1.4') # Download open multilingual wordnet data, if necessary
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()

**text_preprocessing_explanation**

**Step 1: Tokenization:**
   - The code utilizes the `word_tokenize` function from the NLTK library to split the text into individual words or tokens.

**Step 2: Part-of-Speech (POS) Tagging:**
   - The `pos_tag` function from NLTK assigns a grammatical tag to each token, indicating its part of speech (e.g., noun, verb, adjective).
   - The code utilizes the `universal_tagset` for assigning tags.

**Step 3: Lemmatization:**
   - The code employs the `WordNetLemmatizer` from NLTK to reduce words to their base or dictionary form (lemma).
   - It iterates through the tokens and, based on their POS tags, applies lemmatization.
   - POS tags are important for the lemmatizer to identify the correct lemma (e.g., "running" lemmatizes to "run" if it is a verb).

**Step 4: Filtering and Cleaning:**
   - It processes the lemmatized words, removing any non-alphanumeric characters or specific characters, such as '_'.
   - The code might filter or remove punctuation, special characters, and other unwanted elements.

**Step 5: Joining Tokens:**
   - The final step is to join the processed tokens back into a string, which forms the pre-processed text.

In [None]:
def prepare_text(text):
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    text = text.split()
    text = ' '.join(text)

    # Use universal tagset to avoid potential LookupError
    text = pos_tag(word_tokenize(text), tagset='universal')

    lemma = []
    for i in text:
        lemma.append(wordnet_lemmatizer.lemmatize(i[0], pos=get_wordnet_pos(i[1])))
    lemma = ' '.join(lemma)
    return lemma

data['clean_tweets'] = data['tweet'].apply(lambda x: prepare_text(x))

In [None]:
data.head(5)

Unnamed: 0,Toxicity,tweet,clean_tweets
0,0,@user when a father is dysfunctional and is s...,user when a father be dysfunctional and be so ...
1,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i ca n't use ...
2,0,bihday your majesty,bihday your majesty
3,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ur
4,0,factsguide: society now #motivation,factsguide society now motivation


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
corpus = data['clean_tweets'].values.astype('U')

In [None]:
import nltk

# Download the 'stopwords' dataset
nltk.download('stopwords')

from nltk.corpus import stopwords as nltk_stopwords

stopwords = list(nltk_stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
count_tf_idf = TfidfVectorizer(stop_words = stopwords)
tf_idf = count_tf_idf.fit_transform(corpus)

In [None]:
import pickle

In [None]:
# Save the count_tf_idf object to a pickle file
with open('count_tf_idf.pkl', 'wb') as f:
    pickle.dump(count_tf_idf, f)

In [None]:
tf_idf_train, tf_idf_test, target_train, target_test = train_test_split(
    tf_idf, data['Toxicity'], test_size = 0.8, random_state= 42, shuffle=True
)

# **Create a Binary Classification Model**

In [None]:
model_bayes = MultinomialNB()

In [None]:
model_bayes = model_bayes.fit(tf_idf_train, target_train)

In [None]:
y_pred_proba = model_bayes.predict_proba(tf_idf_test)[::, 1]

In [None]:
y_pred_proba

array([0.90073737, 0.27897187, 0.79049494, ..., 0.09988327, 0.20646643,
       0.3208466 ])

In [None]:
fpr, tpr, _ = roc_curve(target_test, y_pred_proba)

In [None]:
final_roc_auc = roc_auc_score(target_test, y_pred_proba)

In [None]:
final_roc_auc

0.9659680682436473

In [None]:
test_text = "I hate you moron"
test_tfidf = count_tf_idf.transform([test_text])
display(model_bayes.predict_proba(test_tfidf))
display(model_bayes.predict(test_tfidf))

array([[0.398519, 0.601481]])

array([1])

In [None]:
with open('toxicity_model.pkl', 'wb') as file:
        pickle.dump(model_bayes, file)