# Data Processing for Hate Speech Classification

### Brent Read

#### This project modifies code taken from an NLP Twitter tutorial found at http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/

#### The Twitter Corpus and NLTK documentation were referenced:
http://www.nltk.org/howto/twitter.html

## Read in the data

In [None]:
import os
import pandas as pd
import numpy as np
import nltk
from sklearn import cross_validation

# Switch to the project directory so the relative CSV path below resolves.
# NOTE(review): this is an absolute, machine-specific path -- adjust it
# before running the notebook anywhere else.
os.chdir('/home/bread/424/HateSpeech/')

# Load the labelled tweet data set into a DataFrame.
hateData = pd.read_csv('twitter-hate-speech-classifier-DFE-a845520.csv')

#print hateData

## Divide into three sections

In [None]:
# Shorthand for the two columns this cell reads repeatedly.
_verdicts = hateData['does_this_tweet_contain_hate_speech']
_confidence_column = "does_this_tweet_contain_hate_speech:confidence"

# One sub-frame per annotation class.  Note that ``hatefulLanguage`` holds
# the tweets judged offensive but NOT hate speech.
hatefulPosts = hateData.loc[_verdicts == "The tweet contains hate speech"]
hatefulLanguage = hateData.loc[_verdicts == "The tweet uses offensive language but not hate speech"]
nonOffensiveTweets = hateData.loc[_verdicts == "The tweet is not offensive"]

# Spread and average of the annotation confidence over the whole data set.
listOfConfidences = hateData[_confidence_column]
print(listOfConfidences.std())
print(listOfConfidences.mean())
#hatefulPosts.head()
#hatefulLanguage.head()
#nonOffensiveTweets.head()

# Same two statistics, restricted to the non-offensive tweets (the variable
# name ``offensiveRate`` notwithstanding).
offensiveRate = nonOffensiveTweets[_confidence_column]
print(offensiveRate.std())
print(offensiveRate.mean())

# Raw tweet text for each class.
hatefulText = hatefulPosts['tweet_text']
nonHatefulOffensive = hatefulLanguage['tweet_text']
controlTweets = nonOffensiveTweets['tweet_text']
'''
for tweet in controlTweets:
    print tweet
'''


## How many tweets are we dealing with for each of these?

In [None]:
# Report how many tweets fall into each class, then the overall total.
for _template, _frame in (
        ("Hateful Tweets: %s", hatefulPosts),
        ("Offensive Language, but not hateful: %s", hatefulLanguage),
        ("Nonoffensive Tweets: %s", nonOffensiveTweets)):
    print(_template % len(_frame))

print("\n")
print("Total: %s" % len(hateData))

### Let's be super legit and make a word cloud!

In [None]:
%matplotlib inline
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Render every hateful tweet into one big string to feed the word cloud.
# NOTE(review): Series.to_string() also renders the index labels by default,
# so row numbers end up in the text -- confirm that is acceptable.
textDump = hatefulText.to_string()

# Build the cloud from the dumped text.
wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(textDump)
# Draw the cloud as an image in a fresh figure, with the axes hidden.
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

### Wordcloud for offensive Language

In [None]:
# Render every offensive-but-not-hateful tweet into one big string.
# NOTE(review): Series.to_string() also renders the index labels by default,
# so row numbers end up in the text -- confirm that is acceptable.
textDump = nonHatefulOffensive.to_string()

# Build the cloud from the dumped text.
wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(textDump)
# Draw the cloud as an image in a fresh figure, with the axes hidden.
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

### And for the control tweets!

In [None]:
# Render every non-offensive (control) tweet into one big string.
# NOTE(review): Series.to_string() also renders the index labels by default,
# so row numbers end up in the text -- confirm that is acceptable.
textDump = controlTweets.to_string()

# Build the cloud from the dumped text.
wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(textDump)
# Draw the cloud as an image in a fresh figure, with the axes hidden.
plt.figure()
plt.imshow(wordcloud)

plt.axis("off")
plt.show()

## Okay, let's actually do some NLP now!

In [None]:
from nltk.tokenize import TweetTokenizer

# Tweet-aware tokenizer, configured via strip_handles / reduce_len to drop
# @handles and to shorten long runs of a repeated character.
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

# Decode every raw tweet from ISO-8859-1 exactly once; these decoded strings
# are kept for the classifiers that want untokenized text.
untokenizedHate = [tweet.decode('ISO-8859-1') for tweet in hatefulText]
untokenizedOffensive = [tweet.decode('ISO-8859-1') for tweet in nonHatefulOffensive]
untokenizedControl = [tweet.decode('ISO-8859-1') for tweet in controlTweets]

# Tokenize the already-decoded text instead of decoding each tweet a second
# time; the resulting token lists are the same.
tokenizedHate = [tknzr.tokenize(tweet) for tweet in untokenizedHate]
tokenizedOffensive = [tknzr.tokenize(tweet) for tweet in untokenizedOffensive]
tokenizedControl = [tknzr.tokenize(tweet) for tweet in untokenizedControl]

# Spot-check one tokenized tweet from two of the classes.
print(tokenizedControl[1])
print(tokenizedHate[1])

## Getting a tweet corpus

In [None]:
from nltk.corpus import twitter_samples
# Ask the corpus reader for its file ids (the result is not used here).
twitter_samples.fileids()

# Load the tweet texts from 'negative_tweets.json' and print the first 15.
strings = twitter_samples.strings('negative_tweets.json')
for string in strings[:15]:
    print(string)

### Let's try our first classifier!

In [None]:
# Pair every tokenized tweet with the name of the class it came from.
labeled_control_tweets = [(tweet, 'control') for tweet in tokenizedControl]
labeled_offensive_tweets = [(tweet, 'offensive') for tweet in tokenizedOffensive]
labeled_hateful_tweets = [(tweet, 'hateful') for tweet in tokenizedHate]

# Spot-check one (tokens, label) pair per class.
print labeled_control_tweets[1]
print labeled_hateful_tweets[1]
print labeled_offensive_tweets[1]

In [None]:
def get_word_features(wordlist):
    """Return the distinct words found in *wordlist*.

    Args:
        wordlist: Iterable of word tokens; duplicates are allowed.

    Returns:
        A list with one entry per distinct word, in the order the
        frequency distribution reports its keys.
    """
    # Always hand back a real list: callers slice the result
    # (``word_features[:10]``), which a ``dict_keys`` view does not support.
    return list(nltk.FreqDist(wordlist).keys())
def get_words_in_tweets(tweets):
    """Flatten labelled tweets into a single list of words.

    Each item of *tweets* is a ``(words, label)`` pair.  The labels are
    ignored and the word sequences are concatenated in their original order.
    """
    return [word for words, _label in tweets for word in words]



### Cheeky Filtering time!

In [None]:
import random

# NOTE: despite its name, this is the number of tweets held out for *testing*;
# everything after the first TRAINING_SET_SIZE items is used for training.
# The name is kept so that existing references keep working.
TRAINING_SET_SIZE = 300

# Upper bounds on how many tweets are taken from each class.  Slicing past the
# end of a list is harmless, so a class with fewer tweets contributes them all.
PER_CLASS_LIMIT = 1000        # three-way data set
PAIRWISE_CLASS_LIMIT = 2300   # two-class comparison data sets

# Shuffle each class on its own so the slices below take a random sample.
random.shuffle(labeled_control_tweets)
random.shuffle(labeled_hateful_tweets)
random.shuffle(labeled_offensive_tweets)

all_tweets_labeled = (labeled_hateful_tweets[:PER_CLASS_LIMIT]
                      + labeled_control_tweets[:PER_CLASS_LIMIT]
                      + labeled_offensive_tweets[:PER_CLASS_LIMIT])
hate_vs_control_labeled = (labeled_hateful_tweets[:PAIRWISE_CLASS_LIMIT]
                           + labeled_control_tweets[:PAIRWISE_CLASS_LIMIT])
hate_vs_offensive_labeled = (labeled_hateful_tweets[:PAIRWISE_CLASS_LIMIT]
                             + labeled_offensive_tweets[:PAIRWISE_CLASS_LIMIT])

# Mix the classes together before cutting the list into test / train parts.
random.shuffle(all_tweets_labeled)

test_tweets = all_tweets_labeled[:TRAINING_SET_SIZE]
train_tweets = all_tweets_labeled[TRAINING_SET_SIZE:]

# The same construction again for the untokenized (raw text) tweets.
labeled_control_tweets_ut = [(tweet, 'control') for tweet in untokenizedControl]
labeled_offensive_tweets_ut = [(tweet, 'offensive') for tweet in untokenizedOffensive]
labeled_hateful_tweets_ut = [(tweet, 'hateful') for tweet in untokenizedHate]

random.shuffle(labeled_control_tweets_ut)
random.shuffle(labeled_offensive_tweets_ut)
random.shuffle(labeled_hateful_tweets_ut)

all_tweets_untokenized = (labeled_hateful_tweets_ut[:PER_CLASS_LIMIT]
                          + labeled_offensive_tweets_ut[:PER_CLASS_LIMIT]
                          + labeled_control_tweets_ut[:PER_CLASS_LIMIT])
# Both classes use the same limit, matching hate_vs_control_labeled above
# (the control slice previously stopped at 1000, unbalancing the two classes).
hate_vs_control_untokenized = (labeled_hateful_tweets_ut[:PAIRWISE_CLASS_LIMIT]
                               + labeled_control_tweets_ut[:PAIRWISE_CLASS_LIMIT])

random.shuffle(all_tweets_untokenized)

test_list_untokenized = all_tweets_untokenized[:TRAINING_SET_SIZE]
train_list_untokenized = all_tweets_untokenized[TRAINING_SET_SIZE:]


# With Cross Validation!

In [None]:
# Separate the (tweet, label) pairs into two parallel lists.
hate_vs_offensive_tweets = [text for text, _label in hate_vs_offensive_labeled]
hate_vs_offensive_labels = [label for _text, label in hate_vs_offensive_labeled]

# Both lengths should match.
print(len(hate_vs_offensive_tweets))
print(len(hate_vs_offensive_labels))

# Hold out 20% of the pairs for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    hate_vs_offensive_tweets,
    hate_vs_offensive_labels,
    test_size=0.2,
    random_state=42)

## Working on the classifier!

In [None]:
# Vocabulary: every distinct word that occurs in the training tweets.
word_features = get_word_features(get_words_in_tweets(train_tweets))

# Parallel lists holding the token list and the label of each training tweet.
textList = [pair[0] for pair in train_tweets]
labels = [pair[1] for pair in train_tweets]

print(len(textList))
print(len(labels))

# Peek at one training tweet and at the start of the vocabulary.
print(textList[1])
print(word_features[:10])

## Extracting Features!

In [None]:
###Taken from Laurent Luce (see citation at top of document)

def extract_features(document):
    """Build the bag-of-words feature dict for one tokenized tweet.

    For every word in the module-level ``word_features`` vocabulary, the
    result maps ``'contains{<word>}'`` to True when that word occurs in
    *document* and to False otherwise.
    """
    present = set(document)
    return {'contains{%s}' % term: (term in present) for term in word_features}

# Apply extract_features to every (tokens, label) pair in train_tweets.
# NOTE(review): NLTK documents apply_features as building the feature sets
# on demand rather than all at once -- confirm for the installed version.
training_set = nltk.classify.apply_features(extract_features, train_tweets)

## Cross Validation Version!

## Define the classifier!

In [None]:

# Pick ONE classifier to train on the feature sets; the alternatives are
# left commented out so they can be swapped in.
#classifier = nltk.maxent.ConditionalExponentialClassifier.train(training_set)
classifier = nltk.NaiveBayesClassifier.train(training_set)
#classifier = nltk.DecisionTreeClassifier.train(training_set)


## Optional: Use a SKLearn Wrapper!

In [None]:
#This currently struggles when run locally

from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier

# Wrap a scikit-learn linear SVM so it can be driven through the same NLTK
# classifier interface (train / classify) as the classifier defined above.
classifier = SklearnClassifier(LinearSVC())

# SklearnClassifier is trained with train() on (featureset, label) pairs; it
# has no fit() method.  Use the feature sets already built from train_tweets
# so the prediction cell below works unchanged.  (X_train / y_train hold raw
# token lists and would need extract_features applied before they could be
# used here.)
classifier.train(training_set)

## Get our predictions

In [None]:

# Gold labels and model predictions for the held-out tweets, in matching order.
actual_labels = [label for _tokens, label in test_tweets]
predicted_labels = [
    classifier.classify(extract_features(tokens))
    for tokens, _label in test_tweets
]

In [None]:
from sklearn.metrics import zero_one_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn import metrics

# Fraction misclassified, fraction correct, and the per-class confusion counts.
print(zero_one_loss(actual_labels, predicted_labels))
print(accuracy_score(actual_labels, predicted_labels))
print(confusion_matrix(actual_labels, predicted_labels))

# The labels cover three classes ('hateful', 'offensive', 'control'), so
# f1_score needs an explicit averaging mode: its default, average='binary',
# only accepts two-class targets.  'weighted' averages the per-class F1
# scores weighted by how many true instances each class has.
print(f1_score(actual_labels, predicted_labels, average='weighted'))

print(actual_labels)
print(predicted_labels)

#Necessary for f1_score in binary
'''
pList = []
for label in predicted_labels:
    if label is "offensive":
        pList.append(0)
    else:
        pList.append(1)
        
aList = []
for label in actual_labels:
    if label is "offensive":
        aList.append(0)
    else:
        aList.append(1)

print f1_score(aList, pList)
'''


## Trying TextBlob

In [None]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
    
    
# Show one training example: a (raw tweet text, label) pair.
print train_list_untokenized[0]
# Train TextBlob's NaiveBayesClassifier (imported just above) on the
# untokenized (text, label) pairs.
cl = NaiveBayesClassifier(train_list_untokenized)

# Try the trained classifier on one hand-written sentence.
print cl.classify("I love Christmas!")

### Just kidding, that breaks my computer. RIP.