**Dataset**
Labeled dataset collected from Twitter.

**Objective**
Classify tweets as containing hate speech or not. <br>
0 -> no hate speech <br>
1 -> contains hate speech <br>

**Total Estimated Time = 90-120 Mins**

In [1]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Import Libraries

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Load Dataset

In [3]:
# Read the labelled tweets from the mounted Drive folder and preview the first rows.
df = pd.read_csv('/content/drive/MyDrive/ITI/NLP/dataset.csv')
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


### EDA

- check NaNs

In [4]:
df.isnull().values.any()

False

- check duplicates

In [5]:
# To inspect the duplicated rows themselves:
#df[df.duplicated(['label','tweet'], keep=False)]
# Drop `id` so duplicates are detected on (label, tweet) only.
# `errors='ignore'` keeps this cell re-runnable once `id` is already gone.
df = df.drop(columns=['id'], errors='ignore')
num_duplicates = df.duplicated().sum()
print('Number of duplicates:', num_duplicates)

Number of duplicates: 2432


In [6]:
# Keep the first occurrence of every duplicated row.
df = df.drop_duplicates()

- show samples of data texts to find out required preprocessing steps

In [7]:
# helper functions and imports
from IPython.display import display
def highlight_col(x, df):
    """Build a per-cell CSS frame that shades each row of ``df`` by its label.

    Intended for ``Styler.apply(..., axis=None)``.

    Args:
        x: Frame handed over by the Styler; ignored, the styles are derived
            from ``df`` instead.
        df: DataFrame with a ``label`` column.

    Returns:
        DataFrame shaped like ``df`` holding a ``background-color`` rule in
        every cell of rows labelled 1 (green tint) or 0 (red tint), and an
        empty string for rows with any other label.
    """
    row_colours = {
        1: 'background-color: #e6ffe6',
        0: 'background-color: #ffe6e6',
    }
    styles = pd.DataFrame('', index=df.index, columns=df.columns)
    for label_value, css_rule in row_colours.items():
        styles.loc[df['label'] == label_value] = css_rule
    return styles

In [8]:
# Show the full tweet text instead of pandas' truncated preview.
pd.set_option('display.max_colwidth', 100000)
# Fixed seed so the previewed rows are the same on every run.
df_tmp = df.sample(5, random_state=42)
# axis=None hands the whole frame to the styling function at once.
df_tmp.style.apply(lambda x: highlight_col(x, df_tmp), axis=None)

Unnamed: 0,label,tweet
800,0,"if you want creative workers, give them enough time to play. #success #quote"
10613,0,èªå¿happy bihday..ððâ¨ bihday #å¢ç°èªå¿ #ããã§ã¨ã
30984,0,got me ticket yay ðð#stoneroses
19226,0,"this is ontiatÃ©n, who was in today for an exam and vaccines with dr. latimer. what a happy boy! #dogs #dah"
1159,0,daily #affirmation #motivation #inspiration #purpose #love #peace


- check dataset balancing

In [9]:
# Absolute and relative frequency of each label, to judge class imbalance.
label_column = df['label']
class_counts = label_column.value_counts()
class_proportions = label_column.value_counts(normalize=True)

for heading, stats in (
    ('Class counts:', class_counts),
    ('Class proportions:', class_proportions),
):
    print(heading)
    print(stats)


Class counts:
0    27517
1     2013
Name: label, dtype: int64
Class proportions:
0    0.931832
1    0.068168
Name: label, dtype: float64


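- Cleaning and preprocessing steps (applied by `clean_tweet` below):
    - tokenize the tweet and trim whitespace
    - strip non-ASCII characters, punctuation, emojis, URLs, user mentions and numbers
    - lowercase, expand contractions and drop stop words (negations are kept)
    - correct spelling and re-join the tokens into one string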

### Cleaning and Preprocessing

In [11]:
# Lift pandas' row and column-width limits, then show the first 100 rows.
for option_name in ("display.max_rows", "max_colwidth"):
    pd.set_option(option_name, None)
df.head(100)

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,0,bihday your majesty
3,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,0,factsguide: society now #motivation
5,0,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo
6,0,@user camping tomorrow @user @user @user @user @user @user @user dannyâ¦
7,0,the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams #hate #imagine #actorslife #revolutionschool #girl
8,0,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦
9,0,@user @user welcome here ! i'm it's so #gr8 !


In [12]:
import sys
!pip install autocorrect
!pip install git+https://github.com/MCFreddie777/language-check.git
!pip install pycontractions
!pip install emoji
!pip install acronym
!{sys.executable} -m pip install contractions
!pip install symspellpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.8/622.8 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l[?25hdone
  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622363 sha256=5dc69d52a6b57aacf4c9c61424f25d1567e159729beac7e2970ea4d07aca8c92
  Stored in directory: /root/.cache/pip/wheels/b5/7b/6d/b76b29ce11ff8e2521c8c7dd0e5bfee4fb1789d76193124343
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/MCFreddie777/lang

In [13]:
import nltk
from nltk.tokenize import TweetTokenizer
import re
import string
#import enchant
import nltk
from nltk.corpus import stopwords
import string
from autocorrect import Speller
#from pycontractions import Contractions
import emoji
import acronym
import sys
import contractions
import pkg_resources
from symspellpy import SymSpell, Verbosity


# Fetch the NLTK resources used below (stop-word corpus and punkt tokenizer data).
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)


def tokenize_tweet(tweet):
    """Split a tweet into tokens using a fresh ``TweetTokenizer``.

    Args:
        tweet: Text of a single tweet.

    Returns:
        Whatever ``TweetTokenizer().tokenize`` returns for ``tweet``.
    """
    return TweetTokenizer().tokenize(tweet)

def remove_urls(lst):
    """Strip URLs from each string and drop the strings that end up empty.

    A URL is anything matching ``https?://...`` or ``www. ...`` up to the
    next whitespace. Surviving strings are returned whitespace-trimmed.

    Args:
        lst: List of strings (tokens).

    Returns:
        New list with the URL parts removed and empty results dropped.
    """
    pattern = re.compile(r'https?://\S+|www\.\S+')
    # Substitute once per item; the result serves both the filter and the output.
    stripped = (pattern.sub('', item).strip() for item in lst)
    return [item for item in stripped if item != '']


def remove_non_english(lst):
    """Drop every non-ASCII character (code point >= 128) from each string.

    Strings that become empty are kept as ``''`` so the list length is
    unchanged.

    Args:
        lst: List of strings.

    Returns:
        New list of ASCII-only strings, one per input string.
    """
    ascii_only = []
    for item in lst:
        # Encoding to ASCII with errors='ignore' discards exactly the
        # characters whose code point is 128 or above.
        ascii_only.append(item.encode('ascii', 'ignore').decode('ascii'))
    return ascii_only

def remove_numbers(lst):
    """Remove digit runs from each string and drop strings left blank.

    A string is dropped when nothing but whitespace remains after the digits
    are removed. Surviving strings are returned as-is (not trimmed).

    Args:
        lst: List of strings (tokens).

    Returns:
        New list of digit-free strings.
    """
    pattern = re.compile(r'\d+')
    # Substitute once per item; the result serves both the filter and the output.
    without_digits = (pattern.sub('', item) for item in lst)
    return [item for item in without_digits if item.strip() != '']

def remove_punctuation(lst):
    """Delete punctuation characters from each string.

    Every character of ``string.punctuation`` is removed except ``@`` and
    the apostrophe. Strings that become empty (for example tokens made only
    of punctuation) are dropped.

    Args:
        lst: List of strings (tokens).

    Returns:
        New list of non-empty strings without the removed characters.
    """
    preserved = "@'"
    to_delete = ''.join(ch for ch in string.punctuation if ch not in preserved)
    table = str.maketrans('', '', to_delete)
    translated = (item.translate(table) for item in lst)
    return [item for item in translated if item != '']

def lowercase_strings(lst):
    """Return a new list holding the lowercase form of every string in ``lst``."""
    lowered = []
    for text in lst:
        lowered.append(text.lower())
    return lowered

def remove_stopwords(lst):
    """Filter English stop words out of a list of words, keeping negations.

    The words ``not``, ``no``, ``n't`` and ``never`` are always kept, even
    when the NLTK English stop-word list contains them.

    Args:
        lst: List of words; compared as-is against the stop-word list.

    Returns:
        New list without the removable stop words.
    """
    kept_negations = {'not', 'no', "n't", 'never'}
    droppable = set(stopwords.words('english')) - kept_negations
    return [word for word in lst if word not in droppable]

#def correct_spelling(words):
    #"""Corrects misspelled words in a list of words"""
    #spell = Speller(lang='en')
    #corrected_words = [spell(word) for word in words]
    #return corrected_words

# Shared SymSpell instance used by `correct_spelling`.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# Frequency dictionary resolved from inside the installed symspellpy package.
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt"
)
# term_index is the column of the term and count_index is the
# column of the term frequency.
# load_dictionary reports failure through its return value; fail loudly so a
# missing dictionary does not silently disable spelling correction.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(
        "SymSpell frequency dictionary could not be loaded: {}".format(dictionary_path)
    )


def correct_spelling(words,edit_distance):
    """Replace each word with SymSpell's closest dictionary suggestion.

    Args:
        words: List of single-word strings.
        edit_distance: Maximum edit distance per lookup; must not exceed the
            ``max_dictionary_edit_distance`` the shared ``sym_spell`` was
            built with.

    Returns:
        New list with one corrected word per input word. Because lookups use
        ``include_unknown=True``, a word without a suggestion comes back
        unchanged.
    """
    lst_of_words = []
    for word in words:
        suggestions = sym_spell.lookup(
            word,
            Verbosity.CLOSEST,
            max_edit_distance=edit_distance,
            include_unknown=True,
        )
        # Read the suggested term directly; parsing str(suggestion) on ","
        # would truncate any term that itself contains a comma.
        lst_of_words.append(suggestions[0].term)

    return lst_of_words

def expand_contractions(word_list):
    """Expand contractions in a list of tokens with the ``contractions`` library.

    Only tokens containing an apostrophe are passed to ``contractions.fix``.
    The expansion is split on whitespace, so one token may become several.
    An expanded ``cannot`` is further split into ``can`` and ``not`` so the
    negation survives as its own token.

    Args:
        word_list: List of string tokens.

    Returns:
        New list of tokens with contractions expanded.
    """
    expanded_word_list = []
    for word in word_list:
        if "'" not in word:
            expanded_word_list.append(word)
            continue
        for token in contractions.fix(word).split():
            # Match the exact token, not any "not" substring: slicing three
            # characters off the end of e.g. "another's" or "cannot have"
            # cuts unrelated words in half.
            if token.lower() == "cannot":
                expanded_word_list.extend([token[:3], token[3:]])
            else:
                expanded_word_list.append(token)
    return expanded_word_list

def remove_emojis(words):
    """Strip emoji characters from each word and drop words left empty.

    A character counts as an emoji when it is a key of ``emoji.EMOJI_DATA``.

    Args:
        words: List of strings.

    Returns:
        New list of non-empty strings without those characters.
    """
    stripped = (
        ''.join(ch for ch in word if ch not in emoji.EMOJI_DATA)
        for word in words
    )
    return [word for word in stripped if word != '']

def remove_extra_spaces(words):
    """Normalise whitespace inside every string of a list.

    Leading and trailing whitespace is removed and every internal run of
    whitespace is collapsed to a single space. Strings that are empty or
    all-whitespace become ``''`` and are kept.

    Args:
        words: List of strings.

    Returns:
        New list with one normalised string per input string.
    """
    return [' '.join(word.split()) for word in words]

def remove_user_mentions(words):
    """Return the words that do not start with ``@`` (i.e. drop user mentions)."""
    return [word for word in words if not word.startswith('@')]

def form_sentence(words):
    """Join a list of words into one space-separated string."""
    return ' '.join(words)


Initial downloading of word corpus


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [14]:
def clean_tweet(tweet):
    """Clean a single tweet and return it as one space-separated string.

    The tweet is tokenized, then URLs, non-ASCII characters, punctuation,
    emojis, user mentions and numbers are removed. The remaining tokens are
    lowercased, contractions are expanded, stop words (except negations) are
    dropped and spelling is corrected with a maximum edit distance of 2.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned tweet; an empty string when no token survives.
    """
    # tokenize tweet
    words = tokenize_tweet(tweet)

    # remove extra white-spaces
    words = remove_extra_spaces(words)

    # remove URLs -- must run before punctuation is stripped, otherwise
    # "://" and "." are already gone and the URL pattern can never match
    words = remove_urls(words)

    # remove non-English characters
    words = remove_non_english(words)

    # remove punctuation ('@' and apostrophes are kept for the steps below)
    words = remove_punctuation(words)

    # remove emojis
    words = remove_emojis(words)

    # remove user mentions
    words = remove_user_mentions(words)

    # remove numbers
    words = remove_numbers(words)

    # lowercase words
    words = lowercase_strings(words)

    # substitute contractions
    words = expand_contractions(words)

    # remove stop words
    words = remove_stopwords(words)

    # substitute abbreviations
    #words = expand_abbreviations(words)

    # substitute misspelled words
    # NOTE(review): correction runs after stop-word removal, so a corrected
    # token may itself be a stop word -- confirm whether that is intended.
    words = correct_spelling(words,2)

    # form a new sentence
    sentence = form_sentence(words)

    return sentence #words

In [15]:
# Run the cleaning pipeline on every raw tweet and keep the result next to the original text.
df['clean_tweet'] = df['tweet'].map(clean_tweet)
df.head(10)

Unnamed: 0,label,tweet,clean_tweet
0,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,father dysfunctional selfish drags kids dysfunction run
1,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks left credit not use cause not offer wheelchair vans pix disappointed getthanked
2,0,bihday your majesty,birthday majesty
3,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,model love a take a time or
4,0,factsguide: society now #motivation,factsguide society motivation
5,0,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo,huge fan fare big talking leave chaos pay disputes get allshowandnogo
6,0,@user camping tomorrow @user @user @user @user @user @user @user dannyâ¦,camping tomorrow danny
7,0,the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams #hate #imagine #actorslife #revolutionschool #girl,next school year year exams not think school exams hate imagine actorslife revolutionschool girl
8,0,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦,love land allen cars champions cleveland clevelandcavaliers
9,0,@user @user welcome here ! i'm it's so #gr8 !,welcome or


In [17]:
# Write the frame (with the clean_tweet column, without the index) to Drive.
df.to_csv('/content/drive/MyDrive/ITI/NLP/new_dataset.csv',index=False)

**If it takes 60 Mins till here, you are doing Great** <br>
**If not! You also are doing Great**

### Modelling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split so both parts keep the original class ratio; the
# fixed random_state makes the split and the scores below reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.clean_tweet.values,
    df.label.values,
    stratify=df.label.values,
    test_size=0.2,
    random_state=42,
)

#### Model 1

In [None]:
# Model 1: bag-of-words counts (English stop words dropped) fed to logistic regression.
vec = CountVectorizer(stop_words='english')
clf = LogisticRegression()
pipe = make_pipeline(vec, clf)
pipe.fit(X_train, y_train);

#### Evaluation

In [None]:
from sklearn import metrics

def print_report(pipe, x_test, y_test):
    """Print a classification report and the accuracy of ``pipe`` on a test set.

    Args:
        pipe: Fitted estimator exposing ``predict``.
        x_test: Test inputs passed to ``pipe.predict``.
        y_test: True labels for ``x_test``.

    Returns:
        None; the report and an ``accuracy: 0.xxx`` line go to stdout.
    """
    predictions = pipe.predict(x_test)
    print(metrics.classification_report(y_test, predictions))
    accuracy = metrics.accuracy_score(y_test, predictions)
    print("accuracy: {:0.3f}".format(accuracy))

In [None]:
# Evaluate the pipeline fitted in the previous cell on the held-out split.
print_report(pipe, X_test, y_test)

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      5503
           1       0.86      0.42      0.57       403

    accuracy                           0.96      5906
   macro avg       0.91      0.71      0.77      5906
weighted avg       0.95      0.96      0.95      5906

accuracy: 0.956


In [None]:
from sklearn.metrics import f1_score
# scikit-learn metrics take (y_true, y_pred), in that order.
f1_score(y_test, pipe.predict(X_test))

0.5657237936772047

In [None]:
# Pipeline score on the held-out split (matches the accuracy printed above).
pipe.score(X_test, y_test)

0.9558076532339993

#### Model 2

In [None]:
from sklearn.svm import LinearSVC

# Model 2: same bag-of-words features, classified with a linear SVM.
vec = CountVectorizer(stop_words='english')
lsvc = LinearSVC()
pipe = make_pipeline(vec, lsvc)
pipe.fit(X_train, y_train);

#### Evaluation

In [None]:
# Evaluate the LinearSVC pipeline on the held-out split.
print_report(pipe, X_test, y_test)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5503
           1       0.75      0.54      0.63       403

    accuracy                           0.96      5906
   macro avg       0.86      0.76      0.80      5906
weighted avg       0.95      0.96      0.95      5906

accuracy: 0.957


In [None]:
from sklearn.metrics import f1_score
# scikit-learn metrics take (y_true, y_pred), in that order.
f1_score(y_test, pipe.predict(X_test))

0.630057803468208

In [None]:
# Pipeline score on the held-out split (matches the accuracy printed above).
pipe.score(X_test, y_test)

0.9566542499153403

### Enhancement

- Using different N-gram ranges
- Using a different text representation technique (TF-IDF)

In [None]:
# Enhancement 1: count features over unigrams, bigrams and trigrams with logistic regression.
vec = CountVectorizer(stop_words='english',ngram_range=(1,3))
clf = LogisticRegression()
pipe = make_pipeline(vec, clf)
pipe.fit(X_train, y_train);

In [None]:
# Evaluate the n-gram count pipeline on the held-out split.
print_report(pipe, X_test, y_test)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5503
           1       0.90      0.38      0.54       403

    accuracy                           0.95      5906
   macro avg       0.93      0.69      0.76      5906
weighted avg       0.95      0.95      0.95      5906

accuracy: 0.955


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Enhancement 2: TF-IDF weights over 1- to 3-grams with logistic regression.
# `vectorizer` and `clf` are read again by the feature-importance cell.
vectorizer = TfidfVectorizer(stop_words='english',ngram_range=(1, 3))
clf = LogisticRegression()
pipe = make_pipeline(vectorizer, clf)
pipe.fit(X_train, y_train);

In [None]:
# Evaluate the TF-IDF pipeline on the held-out split.
print_report(pipe, X_test, y_test)

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      5503
           1       0.86      0.15      0.26       403

    accuracy                           0.94      5906
   macro avg       0.90      0.58      0.62      5906
weighted avg       0.94      0.94      0.92      5906

accuracy: 0.941


In [None]:
# scikit-learn metrics take (y_true, y_pred), in that order.
f1_score(y_test, pipe.predict(X_test))

0.26105263157894737

In [None]:
# Rank the n-gram features by their logistic-regression coefficient
# (largest first) and show the ten largest.
feature_importance = pd.Series(
    data=clf.coef_[0],
    index=vectorizer.get_feature_names_out(),
).sort_values(ascending=False)
feature_importance.head(10)

white        6.512898
allahsoil    6.325692
racist       5.763180
trump        5.730201
racism       4.713133
black        4.292426
obama        4.044788
women        4.007596
bigot        3.290791
comments     3.210937
dtype: float64

#### Done!