# Detectie van ongepaste tekst in posts

## Importeren van packages

In [1]:
import pandas as pd
import numpy as np

from nltk.corpus import movie_reviews 
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy
from nltk.tokenize import word_tokenize

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')
import string 
import random

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eigenaar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Inlezen van data

In [2]:
training_temp= pd.read_csv('data/olid-training.tsv', sep='\t')

### A -> offensive/ not offensive

In [3]:
offensive = []
notOffensive = []
training_a = training_temp[['tweet', 'subtask_a']]
test_a_values = pd.read_csv('data/testset-levela.tsv', sep='\t')
test_a_labels = pd.read_csv('data/labels-levela.csv')
test_a = test_a_values.merge(test_a_labels, on="id")

for index, row in training_a.iterrows():
    if(row['subtask_a']) == 'OFF':
        offensive.append(row['tweet'])
    else:
        notOffensive.append(row['tweet'])

for index, row in test_a.iterrows():
    if(row['subtask_a']) == 'OFF':
        offensive.append(row['tweet'])
    else:
        notOffensive.append(row['tweet'])

### B -> targeted/ untargeted

In [4]:
targeted = []
notTargeted = []
training_b = training_temp[['tweet', 'subtask_b']]
test_b_values = pd.read_csv('data/testset-levelb.tsv', sep='\t')
test_b_labels = pd.read_csv('data/labels-levelb.csv')
test_b = test_b_values.merge(test_b_labels, on="id")

for index, row in training_b.iterrows():
    if(row['subtask_b']) == 'TIN':
        targeted.append(row['tweet'])
    else:
        notTargeted.append(row['tweet'])

for index, row in test_b.iterrows():
    if(row['subtask_b']) == 'TIN':
        targeted.append(row['tweet'])
    else:
        notTargeted.append(row['tweet'])

### C -> offense target

In [5]:
individual = []
group = []
other = []
training_c = training_temp[['tweet', 'subtask_c']]
test_c_values = pd.read_csv('data/testset-levelc.tsv', sep='\t')
test_c_labels = pd.read_csv('data/labels-levelc.csv')
test_c = test_c_values.merge(test_c_labels, on="id")

for index, row in training_c.iterrows():
    if(row['subtask_c']) == 'IND':
        individual.append(row['tweet'])
    elif(row['subtask_c']) ==  'GRP':
        group.append(row['tweet'])
    else:
        other.append(row['tweet'])

for index, row in test_c.iterrows():
    if(row['subtask_c']) == 'IND':
        individual.append(row['tweet'])
    elif(row['subtask_c']) ==  'GRP':
        group.append(row['tweet'])
    else:
        other.append(row['tweet'])

## Detecteren van ongepast taalgebruik

### A -> offensive/ not offensive

#### Een list creëren

In [6]:
offensive_list_clean = []
notOffensive_list_clean = []
words_clean_a = []

for o in offensive:
    offensive_temp = []
    for word in word_tokenize(o):
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation and len(word) > 1 and not word.isdigit():
            offensive_temp.append(word)
            words_clean_a.append(word)
    offensive_list_clean.append(offensive_temp)

for n in notOffensive:
    notOffensive_temp = []
    for word in word_tokenize(n):
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation and len(word) > 1 and not word.isdigit():
            notOffensive_temp.append(word)
            words_clean_a.append(word)
    notOffensive_list_clean.append(notOffensive_temp)

In [7]:
all_a = []

for o in offensive_list_clean:
    all_a.append((o,'offensive'))

for n in notOffensive_list_clean:
    all_a.append((n,'not offensive'))

random.shuffle(all_a)

#### De top 3000 woorden

In [8]:
word_frequency_a = nltk.FreqDist(words_clean_a)
print(word_frequency_a.most_common(20))
top_words_a = list(word_frequency_a.keys())[:3000]

[('user', 34024), ('url', 2602), ("''", 2219), ("'s", 1531), ('liberals', 1485), ('gun', 1427), ("n't", 1403), ('...', 1304), ('control', 1286), ('antifa', 1244), ('like', 1172), ('maga', 1067), ('conservatives', 1029), ('people', 968), ('get', 713), ('one', 688), ('trump', 682), ('amp', 682), ('know', 669), ('would', 581)]


#### De featureset creëren

In [9]:
def find_top_words(words):
    wordset = set(words)
    result = {}
    for w in top_words_a:
        result[w] = (w in wordset) # true if top_word is occurring in the wordset
    return result

In [10]:
featuresets_a = []
for (words, category) in all_a:
    featuresets_a.append((find_top_words(words), category))

#### De classifier trainen

In [11]:
index = int(len(featuresets_a)*0.7+1)
training_set_a = featuresets_a[:index]
test_set_a = featuresets_a[index:]

classifier_a = nltk.NaiveBayesClassifier.train(training_set_a)
print('\nAccuracy of the NaiveBayesClassifier:', nltk_accuracy(classifier_a, test_set_a))


Accuracy of the NaiveBayesClassifier: 0.7528966658784583


In [12]:
# de waardevolste woorden
classifier_a.show_most_informative_features(15)

Most Informative Features
                   bitch = True           offens : not of =     57.4 : 1.0
                   idiot = True           offens : not of =     56.4 : 1.0
                  idiots = True           offens : not of =     27.8 : 1.0
                    fuck = True           offens : not of =     23.8 : 1.0
                  stupid = True           offens : not of =     19.1 : 1.0
                 asshole = True           offens : not of =     18.3 : 1.0
                  coward = True           offens : not of =     18.3 : 1.0
                  fucked = True           offens : not of =     17.5 : 1.0
                    shit = True           offens : not of =     17.2 : 1.0
                    ugly = True           offens : not of =     16.7 : 1.0
                   pussy = True           offens : not of =     15.1 : 1.0
                 bitches = True           offens : not of =     14.3 : 1.0
                 fucking = True           offens : not of =     13.8 : 1.0

### B -> targeted/ untargeted

#### Een list creëren

In [13]:
targeted_list_clean = []
notTargeted_list_clean = []
words_clean_b = []

for t in targeted:
    targeted_temp = []
    for word in word_tokenize(t):
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation and len(word) > 1 and not word.isdigit():
            targeted_temp.append(word)
            words_clean_b.append(word)
    targeted_list_clean.append(targeted_temp)

for nt in notTargeted:
    notTargeted_temp = []
    for word in word_tokenize(nt):
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation and len(word) > 1 and not word.isdigit():
            notTargeted_temp.append(word)
            words_clean_b.append(word)
    notTargeted_list_clean.append(notTargeted_temp)

In [14]:
all_b = []

for t in targeted_list_clean:
    all_b.append((t,'targeted'))

for nt in notTargeted_list_clean:
    all_b.append((nt,'untargeted'))

random.shuffle(all_b)

#### De top 3000 woorden

In [15]:
word_frequency_b = nltk.FreqDist(words_clean_b)
print(word_frequency_b.most_common(20))
top_words_b = list(word_frequency_b.keys())[:3000]

[('user', 33561), ('url', 2173), ("''", 2155), ('liberals', 1446), ("'s", 1437), ('gun', 1372), ("n't", 1342), ('...', 1242), ('control', 1232), ('antifa', 1194), ('like', 1127), ('maga', 1023), ('conservatives', 972), ('people', 933), ('get', 686), ('amp', 677), ('one', 655), ('trump', 655), ('know', 643), ('would', 572)]


#### De featureset creëren

In [16]:
def find_top_words(words):
    wordset = set(words)
    result = {}
    for w in top_words_b:
        result[w] = (w in wordset) # true if top_word is occurring in the wordset
    return result

In [17]:
featuresets_b = []
for (words, category) in all_b:
    featuresets_b.append((find_top_words(words), category))

#### De classiefier trainen

In [18]:
index = int(len(featuresets_b)*0.7+1)
training_set_b = featuresets_b[:index]
test_set_b = featuresets_b[index:]

classifier_b = nltk.NaiveBayesClassifier.train(training_set_b)
print('\nAccuracy of the NaiveBayesClassifier:', nltk_accuracy(classifier_b, test_set_b))


Accuracy of the NaiveBayesClassifier: 0.7378184516448182


In [19]:
# de waardevolste woorden
classifier_b.show_most_informative_features(15)

Most Informative Features
                  coward = True           target : untarg =     23.5 : 1.0
                   bitch = True           target : untarg =     21.7 : 1.0
                  idiots = True           target : untarg =     18.7 : 1.0
                 asshole = True           target : untarg =     15.0 : 1.0
                  stupid = True           target : untarg =     15.0 : 1.0
                  rapist = True           target : untarg =     14.4 : 1.0
                assholes = True           target : untarg =     11.4 : 1.0
                 bitches = True           target : untarg =     11.4 : 1.0
               hypocrite = True           target : untarg =     11.4 : 1.0
                traitors = True           target : untarg =     11.4 : 1.0
                 fascist = True           target : untarg =     11.0 : 1.0
                   idiot = True           target : untarg =     10.5 : 1.0
              hypocrites = True           target : untarg =     10.5 : 1.0

### C -> offense target

In [20]:
individual_list_clean = []
group_list_clean = []
other_list_clean = []
words_clean_c = []

for i in individual:
    individual_temp = []
    for word in word_tokenize(i):
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation and len(word) > 1 and not word.isdigit():
            individual_temp.append(word)
            words_clean_c.append(word)
    individual_list_clean.append(individual_temp)

for g in group:
    group_temp = []
    for word in word_tokenize(g):
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation and len(word) > 1 and not word.isdigit():
            group_temp.append(word)
            words_clean_c.append(word)
    group_list_clean.append(group_temp)

for o in other:
    other_temp = []
    for word in word_tokenize(o):
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation and len(word) > 1 and not word.isdigit():
            other_temp.append(word)
            words_clean_c.append(word)
    other_list_clean.append(other_temp)

In [21]:
all_c = []

for i in individual_list_clean:
    all_c.append((i,'individual'))

for g in group_list_clean:
    all_c.append((g,'group'))

for o in other_list_clean:
    all_c.append((o,'other'))

random.shuffle(all_c)

#### De top 3000 woorden

In [22]:
word_frequency_c = nltk.FreqDist(words_clean_c)
print(word_frequency_c.most_common(20))
top_words_c = list(word_frequency_c.keys())[:3000]

[('user', 33558), ('url', 2162), ("''", 2153), ('liberals', 1446), ("'s", 1436), ('gun', 1372), ("n't", 1339), ('...', 1239), ('control', 1232), ('antifa', 1193), ('like', 1123), ('maga', 1022), ('conservatives', 972), ('people', 932), ('get', 684), ('amp', 677), ('trump', 655), ('one', 655), ('know', 642), ('would', 570)]


#### De featureset creëren

In [23]:
def find_top_words(words):
    wordset = set(words)
    result = {}
    for w in top_words_c:
        result[w] = (w in wordset) # true if top_word is occurring in the wordset
    return result

In [24]:
featuresets_c = []
for (words, category) in all_c:
    featuresets_c.append((find_top_words(words), category))

#### De classifier trainen

In [25]:
index = int(len(featuresets_c)*0.7+1)
training_set_c = featuresets_c[:index]
test_set_c = featuresets_c[index:]

classifier_c = nltk.NaiveBayesClassifier.train(training_set_c)
print('\nAccuracy of the NaiveBayesClassifier:', nltk_accuracy(classifier_c, test_set_c))


Accuracy of the NaiveBayesClassifier: 0.7154894671623296


In [26]:
# de waardevolste woorden
classifier_c.show_most_informative_features(15)

Most Informative Features
                  idiots = True            group : other  =     89.6 : 1.0
                  coward = True           indivi : other  =     35.1 : 1.0
                   hates = True            group : other  =     31.8 : 1.0
                   idiot = True           indivi : other  =     29.1 : 1.0
                   bitch = True           indivi : other  =     27.3 : 1.0
                 traitor = True           indivi : other  =     24.7 : 1.0
               hypocrite = True           indivi : other  =     22.6 : 1.0
                 bitches = True            group : other  =     22.5 : 1.0
                 dumbass = True            group : other  =     20.2 : 1.0
            hypocritical = True            group : other  =     20.2 : 1.0
                     neo = True            group : other  =     20.2 : 1.0
                    ugly = True           indivi : other  =     19.5 : 1.0
                 asshole = True           indivi : other  =     18.0 : 1.0