In [2]:
# importing all the required libraries

import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet 
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import csv
from collections import Counter

from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from autocorrect import spell
lemmatizer = WordNetLemmatizer()


#import nltk.lemmetizer as lemmatize

In [4]:
# function to clean the text.  

stop_words = set(stopwords.words('english'))
# Negation words are kept even when they are in the stop-word list, because
# dropping them would invert the meaning of a tweet.
_NEGATIONS = frozenset({'not', 'neither', 'no', 'never'})

# Matches @mentions, any character that is not alphanumeric/space/tab, and URLs.
_NOISE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")
# Matches runs of anything other than letters, tabs and spaces (e.g. digits).
_NON_LETTER_RE = re.compile(r"[^A-Za-z\t ]+")


def _clean_text(text):
    """Return a single tweet de-noised, lower-cased, lemmatized and stop-word filtered."""
    if not isinstance(text, str):
        # Missing values (e.g. NaN produced by read_csv) contain no words.
        return ''
    # Replace mentions, punctuation and URLs by spaces, then collapse whitespace.
    text = ' '.join(_NOISE_RE.sub(' ', text).split())
    # Drop the remaining non-letter characters (digits) and lower-case.
    text = _NON_LETTER_RE.sub('', text).lower()
    lemmas = (lemmatizer.lemmatize(token).lower() for token in text.split())
    return ' '.join(
        lemma for lemma in lemmas
        if lemma not in stop_words or lemma in _NEGATIONS
    )


def clean_frame(df):
    """Clean the 'tweet' column of ``df`` in place and return ``df``.

    Each tweet has @mentions, URLs, punctuation and digits removed, is
    lower-cased, lemmatized with the module-level ``lemmatizer`` and stripped
    of the words in the module-level ``stop_words`` set (the negations
    'not', 'neither', 'no' and 'never' are always kept).

    Works for any index (not only 0..n-1); non-string tweets such as NaN
    become the empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'tweet' column.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with 'tweet' replaced by the cleaned text.
    """
    df['tweet'] = df['tweet'].map(_clean_text)
    return df

In [7]:
# Build the combined, shuffled corpus (hate_speech.csv + trainhate.csv) and
# extract the lexicon: the 1000 unigrams kept by a TF-IDF vectorizer.
import random

# Fixed seed so the row shuffle (and therefore the later train/test split) and
# the lexicon column order are reproducible across runs.
SEED = 42
random.seed(SEED)

df = pd.read_csv("hate_speech.csv")
# df labels ('class'): 0 = hate speech, 1 = offensive, 2 = clean text

df2 = pd.read_csv("trainhate.csv")
df = clean_frame(df)
# Only the first 10000 rows of trainhate.csv are used. .copy() detaches the
# slice so the column edits below never act on a view of the full frame.
df2 = clean_frame(df2.head(10000).copy())

# df2 labels ('label'): 0 = clean, 1 = hate speech.
# Remap to df's scheme: 1 -> 0 (hate speech), everything else -> 2 (clean).
df2['label'] = np.where(df2['label'] == 1, 0, 2)
df2 = df2.rename(columns={'label': 'class'})

# Corpus used to build the lexicon, captured before the two frames are merged.
# NOTE(review): this holds the hate_speech.csv tweets only; confirm whether the
# trainhate.csv tweets were meant to contribute to the lexicon as well.
l = list(df['tweet'])

# Merge both corpora and shuffle the rows once; the index is reset to 0..n-1,
# which later cells rely on when they address rows with df.at[i, ...].
df = pd.concat([df[['class', 'tweet']], df2[['class', 'tweet']]])
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Keep the 1000 unigrams selected by the TF-IDF vectorizer (max_features).
random.shuffle(l)
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=1000)
X = vectorizer.fit_transform(l)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    lexicon0 = vectorizer.get_feature_names_out().tolist()
else:
    lexicon0 = vectorizer.get_feature_names()

# Only unigrams contribute to the lexicon. `lexicon` aliases `lexicon0`, so the
# shuffle below reorders both names.
lexicon = lexicon0
random.shuffle(lexicon)
print(len(lexicon))
print(lexicon)
print(df.head())

1000
['may', 'play', 'wanna', 'rock', 'imma', 'moment', 'around', 'answer', 'fast', 'slit', 'cheat', 'park', 'nig', 'two', 'finally', 'via', 'gettin', 'bunch', 'wtf', 'brownie', 'sleep', 'start', 'walk', 'bottle', 'tv', 'crow', 'deserve', 'dress', 'yu', 'tcot', 'took', 'sick', 'la', 'hoe', 'big', 'lost', 'dope', 'idgaf', 'movie', 'find', 'turn', 'trippin', 'lick', 'long', 'da', 'honestly', 'social', 'idc', 'chill', 'basic', 'text', 'seat', 'ha', 'dis', 'ah', 'town', 'easy', 'jason', 'rick', 'trip', 'live', 'glad', 'bitter', 'ride', 'account', 'kind', 'bomb', 'ebola', 'tight', 'seriously', 'always', 'buy', 'dawg', 'speak', 'morning', 'quick', 'anyway', 'nip', 'turned', 'light', 'smith', 'college', 'hillbilly', 'ill', 'something', 'except', 'second', 'move', 'friend', 'xxx', 'team', 'welcome', 'club', 'monday', 'lucky', 'facebook', 'racist', 'yeah', 'bone', 'run', 'lose', 'nigs', 'de', 'cause', 'mama', 'claim', 'drug', 'faggot', 'people', 'line', 'bigger', 'swag', 'thick', 'nude', 'hate'

In [8]:
# Preview the first rows only instead of dumping the whole combined frame.
df.head(10)

Unnamed: 0,class,tweet
0,0,fuckin yankee month meet pussy
1,1,rt friend talking bitch heavily fuck walk past
2,1,broke hoe hating paid hoe pimpchampaign
3,2,officially going see dec eek
4,1,hate baldheadass bitch
5,1,rt know color skin stop friendly reminder bitch
6,1,bitch
7,1,mad cause yo bitch choosin put jacuzzi splash
8,1,st century lady spoil man snotty stuck bitch
9,2,thankful love thankful positive


In [9]:
# Build the feature set: one bag-of-words count vector per tweet, paired with
# the tweet's class label.
# The size of the input matrix is: no_of_examples x size_of_lexicon.

# word -> column lookup, so each token costs O(1) instead of a linear scan of
# the lexicon list. setdefault keeps the first position of a word, which is the
# position list.index() would report.
lexicon_index = {}
for position, term in enumerate(lexicon):
    lexicon_index.setdefault(term, position)

featureset = []
for i in range(df.shape[0]):
    wt = word_tokenize(df.at[i, 'tweet'])
    # Lower-case before lemmatizing: the WordNet lemmatizer is case-sensitive.
    lem_words = [lemmatizer.lemmatize(j.lower()) for j in wt]
    features = np.zeros(len(lexicon))
    for word in lem_words:
        index_value = lexicon_index.get(word)
        if index_value is None:
            # Word is not in the lexicon: fall back to the head word of its
            # WordNet synsets. Synsets are visited in the order WordNet returns
            # them, so the chosen synonym is the same on every run.
            for synset in wordnet.synsets(word):
                # 'black.n.01' -> 'black' (strip the '.pos.sense' suffix)
                head_word = synset.name().rsplit('.', 2)[0]
                index_value = lexicon_index.get(head_word)
                if index_value is not None:
                    break
        if index_value is not None:
            features[index_value] += 1

    features = list(features)
    featureset.append([features, df.at[i, 'class']])



In [10]:
# Train on the first 80% of the examples and hold out the last 20% for testing.

testing_size = int(0.2 * len(featureset))
# Explicit split position: slicing with [:-testing_size] would yield an empty
# training set whenever testing_size is 0.
split_at = len(featureset) - testing_size

# Each row pairs a feature list with a scalar label, i.e. the nested sequence
# is ragged. dtype=object keeps it as an (n, 2) array of Python objects;
# without it NumPy >= 1.24 raises ValueError.
featureset = np.array(featureset, dtype=object)

# Column 0 holds the feature vectors, column 1 the class labels.
train_x = list(featureset[:split_at, 0])
train_y = list(featureset[:split_at, 1])
test_x = list(featureset[split_at:, 0])
test_y = list(featureset[split_at:, 1])

# MLP with three hidden layers (32, 16, 8), logistic activation, L-BFGS solver;
# random_state fixes the weight initialisation.
clf1 = MLPClassifier(solver='lbfgs', activation='logistic', alpha=1e-5,
                     hidden_layer_sizes=(32, 16, 8), random_state=1)

# Fit the model on the training set.
clf1.fit(train_x, train_y)

MLPClassifier(activation='logistic', alpha=1e-05, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(32, 16, 8), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [11]:
# Mean accuracy of the MLP classifier on the held-out test set.
mlp_accuracy = clf1.score(test_x, test_y)
print(mlp_accuracy)

0.8946233467510063


In [12]:
# Multinomial Naive Bayes fitted on the same training split and scored on the
# same test split as clf1. fit() returns the estimator itself.
clf2 = MultinomialNB().fit(train_x, train_y)
nb_accuracy = clf2.score(test_x, test_y)
print(nb_accuracy)

0.8542265669925244


In [19]:
# Classify one user-supplied statement with the trained MLP (clf1).

# Named `statement` rather than `str` so the builtin str type is not shadowed
# for the rest of the session.
statement = input("Enter a statement: ")

# word -> column lookup (first position wins, like list.index()).
lexicon_index = {}
for position, term in enumerate(lexicon):
    lexicon_index.setdefault(term, position)

wt = word_tokenize(statement)
# Lower-case BEFORE lemmatizing: the WordNet lemmatizer is case-sensitive, so
# capitalised input such as "Dogs" would otherwise not be reduced to "dog".
lem_words = [lemmatizer.lemmatize(j.lower()) for j in wt]
features = np.zeros(len(lexicon))
for word in lem_words:
    index_value = lexicon_index.get(word)
    if index_value is None:
        # Checking the synonyms of the word, in the order WordNet returns the
        # synsets, so the chosen synonym is the same on every run.
        for synset in wordnet.synsets(word):
            # 'black.n.01' -> 'black' (strip the '.pos.sense' suffix)
            head_word = synset.name().rsplit('.', 2)[0]
            index_value = lexicon_index.get(head_word)
            if index_value is not None:
                break
    if index_value is not None:
        features[index_value] += 1
features = list(features)

# Predict once and map the class id to its name (0 = hateful, 1 = offensive,
# anything else = clean).
prediction = clf1.predict([features])[0]
if prediction == 0:
    print('hateful')
elif prediction == 1:
    print('offensive')
else:
    print('clean')

Enter a statement: shut up you mf bastard
hateful


In [20]:
# Scratch check: head words of the WordNet synsets of "black".
# `wordnet` is already imported in the notebook's import cell.
syns = wordnet.synsets("black")
# 'black.n.01' -> 'black' (strip the '.pos.sense' suffix)
g = [w.name().rsplit('.', 2)[0] for w in syns]
# sorted() makes the printed order stable between runs (set order is not).
print(sorted(set(g)))


['black', 'blacken', 'total_darkness', 'bootleg']
