In [1]:
import pandas as pd
import urllib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pickle
import preprocessor as p
import numpy as np

In [2]:
import string

def load_data(filename="../data/twitter_data.pkl"):
    """Load the pickled tweet records and return their texts and labels.

    Parameters
    ----------
    filename : str, optional
        Path of the pickle file to read. Defaults to the path that was
        previously hard-coded, so existing ``load_data()`` calls are unchanged.

    Returns
    -------
    tuple of (list of bytes, list)
        ``x_text`` holds each record's ``'text'`` value encoded as UTF-8
        bytes; ``labels`` holds each record's ``'label'`` value. Both lists
        follow the order of the records in the file.
    """
    print("Loading data from file: " + filename)
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising -- only point this at files from a trusted source.
    # The context manager guarantees the handle is closed even if loading fails.
    with open(filename, 'rb') as handle:
        data = pickle.load(handle)

    x_text = []
    labels = []
    # Positional access (data[i]) is kept on purpose: it works whether the
    # payload is a list or an int-keyed mapping -- TODO confirm which it is.
    for i in range(len(data)):
        record = data[i]
        # The raw text is stored as-is (punctuation included); the previous
        # punctuation-stripping result was never used and has been removed.
        x_text.append(record['text'].encode('utf-8'))
        labels.append(record['label'])
    return x_text, labels

In [3]:
x_text, labels = load_data()
# Collapse the three source labels into a binary target:
# both 'racism' and 'sexism' count as bullying (1), 'none' is 0.
dict1 = dict(racism=1, sexism=1, none=0)
# __getitem__ (not .get) so an unexpected label still raises KeyError.
labels = np.array(list(map(dict1.__getitem__, labels)))

Loading data from file: ../data/twitter_data.pkl


In [4]:
# One row per tweet: the UTF-8 encoded text and its binary bullying label.
comments = pd.DataFrame(dict(comment=x_text, attack=labels))

In [5]:
# Preview the first rows to sanity-check the 'comment' and 'attack' columns.
comments.head()

Unnamed: 0,comment,attack
0,b'rt @colonelkickhead: another bloody instant ...,0
1,b'@azzamalirhabi @jihadia8 this video of the p...,0
2,"b""oh really ? no more instant restaurants ? ...",0
3,b'rt @benfrancisallen: it has not been a good ...,0
4,b'rt @notofeminism: i don\xe2\x80\x99t need fe...,0


In [6]:
# Path of the swear-word lexicon; the file is read as one entry per line.
swear_words = "../swear_words.txt"
word_list = []
with open(swear_words) as swear_file:
    lexicon_text = swear_file.read()
    # splitlines() drops the line terminators, leaving one entry per line.
    word_list = lexicon_text.splitlines()

In [7]:
from textblob import TextBlob

def tokens(post):
    """Decode a UTF-8 byte string and return the ``words`` of its TextBlob."""
    decoded = post.decode('utf8')
    blob = TextBlob(decoded)
    return blob.words
    
def get_bad_word_count(post):
    """Count the tokens of ``post`` that appear in the module-level ``word_list``.

    Returns a ``(count, matches)`` pair, where ``matches`` is the ``str()``
    of the list of matching tokens in order of appearance (repeats included).
    """
    matched = [token for token in tokens(post) if token in word_list]
    return len(matched), str(matched)

# Score every comment once, then unzip the (count, matches) pairs into two
# new columns on the existing frame.
(
    comments['bad_word_count'],
    comments['bad_word_list'],
) = zip(*comments['comment'].map(get_bad_word_count))

In [9]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when the denominator is 0."""
    return float(numerator) / denominator if denominator else float('nan')


def data_analysis(comments, sen_length=100):
    """Print bullying / swear-word probabilities and a comment-length split.

    Parameters
    ----------
    comments : pandas.DataFrame
        Must provide the columns ``'attack'`` (1 marks bullying),
        ``'bad_word_count'`` and ``'comment'``.
    sen_length : int, optional
        Length threshold separating "short" (<=) from "long" (>) comments,
        measured with ``Series.str.len()``. Defaults to 100, the value that
        was previously hard-coded.

    Notes
    -----
    Prints P(B), P(S), P(B|S) and P(S|B), where B = bullying and
    S = contains at least one swear word. A probability whose denominator
    is zero (empty frame, no swear rows, or no bullying rows) is printed as
    ``nan`` instead of raising ``ZeroDivisionError``.
    """
    # Build each boolean mask once and reuse it for the counts below.
    is_bully = comments['attack'] == 1
    has_swear = comments['bad_word_count'] > 0

    total = len(comments)
    bully = int(is_bully.sum())
    swear = int(has_swear.sum())
    swear_bully = int((has_swear & is_bully).sum())

    P_B = _safe_ratio(bully, total)
    P_S = _safe_ratio(swear, total)
    P_B_S = _safe_ratio(swear_bully, swear)
    P_S_B = _safe_ratio(swear_bully, bully)

    print("P(B): " + str(P_B))
    print("P(S): " + str(P_S))
    print("P(B|S): " + str(P_B_S))
    print("P(S|B): " + str(P_S_B))

    # Missing comments have a NaN length; NaN fails both comparisons, so
    # such rows are counted in neither bucket.
    lengths = comments['comment'].str.len()
    short_sentence = int((lengths <= sen_length).sum())
    long_sentence = int((lengths > sen_length).sum())

    print("Number of short sentences (sentence length <=" + str(sen_length) + "): " + str(short_sentence))
    print("Number of long sentences (sentence length >" + str(sen_length) + "): " + str(long_sentence))
    

In [10]:
# Print the bullying / swear-word probabilities and the short-vs-long
# comment counts for the full data set.
data_analysis(comments)

P(B): 0.31410814170292106
P(S): 0.135052827843381
P(B|S): 0.42982052462034054
P(S|B): 0.184804115552038
Number of short sentences (sentence length <=100): 7689
Number of long sentences (sentence length >100): 8401


In [11]:
# Show the 'comment' column; the values are still UTF-8 byte strings (b'...').
print(comments['comment'])

0        b'rt @colonelkickhead: another bloody instant ...
1        b'@azzamalirhabi @jihadia8 this video of the p...
2        b"oh really ?  no more instant restaurants ?  ...
3        b'rt @benfrancisallen: it has not been a good ...
4        b'rt @notofeminism: i don\xe2\x80\x99t need fe...
                               ...                        
16085    b'rt @mydearwormwood: "i want equal rights ,  ...
16086    b'rt @alexxxarich: go ahead and call me sexist...
16087    b'@irtsuki i have had the epic ,  but i always...
16088    b'@colonel_shami so do you think that the daes...
16089    b'rt @infosec_hulk: @freebsdgirl my skin green...
Name: comment, Length: 16090, dtype: object
