Accessing the dataset through web scraping

In [1]:
from bs4 import BeautifulSoup
import requests

In [22]:
# Fetch the Kaggle dataset page and parse the returned HTML.
url = 'https://www.kaggle.com/datasets/vkrahul/twitter-hate-speech?select=train_E6oV3lV.csv'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
page = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
page.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers (lxml, html5lib) happen to be installed.
soup = BeautifulSoup(page.text, 'html.parser')

In [73]:
# Show only the beginning of the parsed page; printing the whole document
# floods the cell output with markup and inline scripts.
print(str(soup)[:2000])


<!DOCTYPE html>

<html lang="en">
<head>
<title>Twitter hate speech | Kaggle</title>
<meta charset="utf-8"/>
<meta content="index, follow" name="robots"/>
<meta content="Kaggle is the world’s largest data science community with powerful tools and resources to help you achieve your data science goals." name="description"/>
<meta content="email and messaging" name="keywords"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=5.0, minimum-scale=1.0" name="viewport"/>
<meta content="#008ABC" name="theme-color">
<script nonce="KLDDanOQw5Z+kg68kHtQeQ==" type="text/javascript">
    window["pageRequestStartTime"] = 1724524174114;
    window["pageRequestEndTime"] = 1724524174204;
    window["initialPageLoadStartTime"] = new Date().getTime();
  </script>
<script async="" defer="" id="gsi-client" nonce="KLDDanOQw5Z+kg68kHtQeQ==" src="https://accounts.google.com/gsi/client"></script>
<script nonce="KLDDanOQw5Z+kg68kHtQeQ==">window.KAGGLE_JUPYTERLAB_PATH = "/static/jl2-static/ju

In [24]:
import numpy as np
import pandas as pd

In [48]:
# Load the training tweets from a CSV file located in the notebook's working directory.
df = pd.read_csv('train_E6oV3lV.csv')
# Bare trailing expression so the notebook renders a preview of the frame.
df

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


Removing unnecessary characters

In [49]:
import re

In [50]:
def remove_unnecessary_characters(tweet):
    """Replace every run of non-word characters in ``tweet`` with one space.

    Word characters (letters, digits and ``_``) are kept as they are. The
    pattern is a ``str`` pattern, so ``\\W`` is Unicode-aware and non-ASCII
    letters are kept too. Leading and trailing runs also become a single
    space; nothing is stripped.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The text with punctuation, symbols and whitespace runs collapsed
        to single spaces.
    """
    non_word_run = re.compile(r'\W+')
    return non_word_run.sub(' ', tweet)

# Clean every tweet; the function is passed directly because a wrapper
# lambda that only forwards its argument adds nothing.
df['tweet'] = df['tweet'].apply(remove_unnecessary_characters)


In [51]:
# Preview the first five rows to confirm punctuation and symbols were replaced by spaces.
df.head()

Unnamed: 0,id,label,tweet
0,1,0,user when a father is dysfunctional and is so...
1,2,0,user user thanks for lyft credit i can t use ...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in ur...
4,5,0,factsguide society now motivation


Standardization

In [52]:
import nltk

In [53]:
# Make sure the WordNet corpus needed by wordnet.morphy is available locally
# (the call reports "already up-to-date" when it is already installed).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/awwabahmed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [54]:
from nltk.corpus import wordnet     

In [56]:
def standardization(tweet):
    """Lower-case a tweet and replace each word with its WordNet base form.

    The text is lower-cased, stripped and split on whitespace. Each word is
    looked up with ``wordnet.morphy``; when the lookup yields nothing the
    word is kept unchanged. The words are then re-joined with single spaces.

    Parameters
    ----------
    tweet : str or None
        Text to normalise. Only ``None`` is short-circuited; any other
        non-string value (for example a float NaN) reaches ``.lower()``
        and raises ``AttributeError``.

    Returns
    -------
    str or None
        The normalised text, or ``None`` when ``tweet`` is ``None``.
    """
    if tweet is None:
        return None

    # split() with no separator also collapses repeated whitespace.
    words = tweet.lower().strip().split()
    # Look each word up exactly once; morphy returns a falsy value when it
    # has no base form, in which case the original word is kept.
    return ' '.join(wordnet.morphy(word) or word for word in words)

# Overwrite the tweet column with its standardized form
# (lower-cased, stripped, words replaced by their WordNet base form where one exists).
df['tweet'] = df['tweet'].apply(standardization)

In [59]:
# Preview the first five rows to inspect the standardized tweets.
df.head()

Unnamed: 0,id,label,tweet
0,1,0,user when a father be dysfunctional and be so ...
1,2,0,user user thanks for lyft credit i can t use c...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in urð...
4,5,0,factsguide society now motivation


Tokenization & Lemmatization

In [61]:
import spacy
# Load spaCy's 'en_core_web_sm' pipeline once; the helper functions below read this module-level `nlp`.
nlp = spacy.load('en_core_web_sm')

In [62]:
def tokenize_and_lemmatize(text):
    """Run ``text`` through the module-level ``nlp`` pipeline.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    tuple[list, list]
        ``(tokens, lemmas)``: the ``.text`` and the ``.lemma_`` of every
        token, in document order. Both lists have the same length.
    """
    surface_forms = []
    base_forms = []
    for tok in nlp(text):
        surface_forms.append(tok.text)
        base_forms.append(tok.lemma_)
    return surface_forms, base_forms

In [63]:
# Tokenize/lemmatize every tweet and store the two lists in new columns.
# The (tokens, lemmas) pairs are collected first and turned into one
# two-column frame, instead of constructing a separate pd.Series per row
# inside apply, which is very slow on tens of thousands of rows.
df[['tokens', 'lemmas']] = pd.DataFrame(
    [tokenize_and_lemmatize(tweet) for tweet in df['tweet']],
    index=df.index,  # keep the rows aligned with df on assignment
    columns=['tokens', 'lemmas'],
)

In [68]:
# Spot-check one row by position to see the new tokens/lemmas columns next to the tweet text.
df.iloc[31959]

id                                                    31960
label                                                     0
tweet     listening to sad song on a monday morning otw ...
tokens    [listening, to, sad, song, on, a, monday, morn...
lemmas    [listen, to, sad, song, on, a, monday, morning...
Name: 31959, dtype: object

Check for NA values (none found, so nothing needs to be removed)

In [69]:
# Count missing values per column; this only reports the counts and drops nothing.
df.isnull().sum()

id        0
label     0
tweet     0
tokens    0
lemmas    0
dtype: int64

Featurization: TF-IDF & Word Embeddings

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [79]:
# TF-IDF
# Learn the vocabulary from the cleaned tweets and compute one TF-IDF row
# per tweet; fit_transform returns a SciPy sparse matrix.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['tweet'])
# Keep the data sparse: calling .toarray() would allocate a dense cell for
# every (tweet, vocabulary term) pair, almost all of them zero.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_matrix,
    columns=vectorizer.get_feature_names_out(),
)

In [89]:
# Loads the same 'en_core_web_sm' pipeline that an earlier cell already bound to `nlp`,
# so on a top-to-bottom run this rebinding is redundant.
# NOTE(review): the small English model may not ship static word vectors, in which case
# token.vector is not a true word embedding - confirm, and consider a model with vectors.
nlp = spacy.load("en_core_web_sm")

In [90]:
def word_embeddings(text):
    """Collect the per-token vectors produced by the module-level ``nlp``.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    list
        The ``.vector`` attribute of every token, in document order
        (an empty list when the pipeline yields no tokens).
    """
    vectors = []
    for tok in nlp(text):
        vectors.append(tok.vector)
    return vectors