[Dataset](https://raw.githubusercontent.com/amankharwal/Website-data/master/twitter.csv)

Imports and initializations

In [93]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import re
import nltk
from nltk.util import pr
from nltk.corpus import stopwords
import string
from nltk.stem.snowball import stopwords

# Fetch the NLTK resources this notebook relies on. The captured output shows each
# call reporting "already up-to-date" when the package is present, so re-running is cheap.
nltk.download("stopwords")  # word lists read by stopwords.words("english")
nltk.download('punkt')  # presumably the tokenizer data behind nltk.word_tokenize -- confirm
nltk.download('wordnet')  # presumably the WordNet data behind WordNetLemmatizer -- confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [94]:
stemmer = nltk.SnowballStemmer("english")  # Snowball stemmer for English, used by clean()

# Read the word list through the fully qualified corpus reader instead of the
# imported `stopwords` name. This assignment rebinds `stopwords` to a plain set,
# so going through the imported name made the cell fail on a second run
# (a set has no `.words` attribute). The qualified lookup is safe to re-run.
stopwords = set(nltk.corpus.stopwords.words("english"))

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()  # WordNet lemmatizer, used by lemmatize_document()

In [95]:
def lemmatize_document(document):
    """Return `document` with every token replaced by its lemma.

    The text is split with nltk.word_tokenize, each token is passed through the
    module-level `lemmatizer`, and the results are joined with single spaces.
    """
    tokens = nltk.word_tokenize(document)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

In [96]:
# Load the tweets from a CSV in the working directory; later cells read its
# 'class' and 'tweet' columns.
# NOTE(review): the dataset link in the notebook points at a file named twitter.csv,
# while this reads twitter_data.csv -- presumably a renamed local copy; confirm.
df = pd.read_csv("twitter_data.csv")

In [97]:
# Translate the numeric 'class' codes into readable label strings.
class_to_label = {
    0: "Hate speech detected",
    1: "Offensive language detected",
    2: "No hate or offensive intent detected",
}
df['labels'] = df['class'].map(class_to_label)

In [98]:
# Keep only the model input (tweet text) and the target (label).
# .copy() makes this an independent frame, so the later `df["tweet"] = ...`
# assignment does not trigger pandas' SettingWithCopyWarning.
df = df[['tweet', 'labels']].copy()
df.head()

Unnamed: 0,tweet,labels
0,!!! RT @mayasolovely: As a woman you shouldn't...,No hate or offensive intent detected
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive language detected
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Offensive language detected
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Offensive language detected
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Offensive language detected


Cleaning the data

In [99]:
def clean(text):
    """Normalize one tweet into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; drop [bracketed] spans, URLs and HTML tags;
    strip ASCII punctuation; turn newlines into spaces; drop every token that
    contains a digit; remove stop words; stem what is left.

    Relies on the module-level `stopwords` set and `stemmer` object.

    Args:
        text: The raw tweet. Any non-string value is converted with str() first.

    Returns:
        The cleaned text. Tokens are joined with single spaces; empty tokens
        produced by consecutive spaces are kept, so runs of spaces can remain.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) out of Python's own
    # string-escape handling, which warns about them as invalid escape sequences.
    text = re.sub(r'\[.*?\]', '', text)  # text between square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    text = re.sub(r'<.*?>', '', text)  # HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # punctuation
    # Newlines become spaces rather than nothing, so the last word of one line
    # is not glued to the first word of the next.
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)  # tokens containing a digit
    # NOTE(review): punctuation is stripped before the stop-word lookup, so
    # contractions (e.g. "shouldnt" in the sample output) presumably no longer
    # match the stop-word list -- confirm whether that is intended.
    return ' '.join(
        stemmer.stem(word) for word in text.split(' ') if word not in stopwords
    )
# Run every tweet through clean() and store the result back in the 'tweet' column.
cleaned_tweets = df["tweet"].apply(clean)
df["tweet"] = cleaned_tweets
df.head()

Unnamed: 0,tweet,labels
0,rt mayasolov woman shouldnt complain clean ho...,No hate or offensive intent detected
1,rt boy dat coldtyga dwn bad cuffin dat hoe ...,Offensive language detected
2,rt urkindofbrand dawg rt ever fuck bitch sta...,Offensive language detected
3,rt cganderson vivabas look like tranni,Offensive language detected
4,rt shenikarobert shit hear might true might f...,Offensive language detected


Vectorizing the tweets and splitting the dataset

In [100]:
tweets = np.array(df["tweet"])
y = np.array(df["labels"])

# Split the raw text first, so the vectorizer only ever sees training tweets.
# Fitting it on the full corpus lets test-set words leak into the vocabulary.
# With the same random_state and sample count, the row partition is unchanged.
tweets_train, tweets_test, y_train, y_test = train_test_split(
    tweets, y, test_size = 0.33, random_state = 42
)

cv = CountVectorizer()

x_train = cv.fit_transform(tweets_train)  # vocabulary learned from the training tweets only
x_test = cv.transform(tweets_test)  # test tweets encoded with that same vocabulary

# Kept so that `x` is still the document-term matrix of the whole corpus,
# now encoded with the training vocabulary.
x = cv.transform(tweets)

Training the decision tree classifier

In [101]:
# A fixed random_state makes the fitted tree reproducible across kernel restarts;
# without it the tree can differ between runs. The value 42 matches the seed
# already used for the train/test split.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(x_train, y_train)

In [102]:
test_data = "i hate you"
# Run the sample through the same clean() step as the training tweets, so its
# tokens are in the form the vectorizer was fitted on (lowercased, stop words
# removed, stemmed).
# The result gets its own name instead of overwriting the DataFrame `df`,
# which made earlier cells fail when re-run after this one.
test_features = cv.transform([clean(test_data)])
print(clf.predict(test_features))

['Offensive language detected']
