# Importing Required Libraries

In [1]:
import pandas as pd

# Defining Class Map

In [2]:
# label name -> integer class id; ids follow the order the names are listed in
class_map = {
    name: class_id
    for class_id, name in enumerate(('none', 'sexism', 'racism'))
}

# File Paths

In [3]:
# Train / dev / test split files. These are absolute paths under /content, so
# the files must exist at exactly these locations before the cells below run.
train_file_path = '/content/trainset.txt'
dev_file_path = '/content/devset.txt'
test_file_path = '/content/testset.txt'

# Lexicon files. Only the Wiegand lexicon path is used by the cells in this
# section; `small_lexicon` is defined here but not referenced below.
wiegand_lexicon_file_path = '/content/hate_lexicon_wiegand.txt'
small_lexicon = '/content/hate_lexicon_small.txt'

# Functions

## Read Contents of a File

In [4]:
def get_data_from_file(file_path, encoding='utf-8'):
  """Read a text file and return its lines with surrounding whitespace removed.

  Args:
    file_path: Path of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding.

  Returns:
    A list with one string per line of the file, in file order. Blank lines
    are kept as empty strings; callers filter them out if they need to.
  """
  with open(file_path, encoding=encoding) as f:
    # strip() removes the trailing newline together with any other leading or
    # trailing whitespace, so no separate rstrip('\n') is needed.
    return [line.strip() for line in f]

## Generate Feature Matrix

In [None]:
def get_feature_matrix(dataset, lexicons, label_map=None):
  """Build binary lexicon-presence feature vectors for labelled text lines.

  Each line is split on whitespace. The last token is treated as the class
  label and every token before it as the text. For each lexicon term the
  vector holds 1 if the term occurs as an exact (case-sensitive) token of the
  text and 0 otherwise; the numeric class id is appended as the final element.

  Args:
    dataset: Iterable of strings shaped like "<word> <word> ... <label>".
    lexicons: Sequence of lexicon terms; one feature is produced per term, in
      this order (duplicates produce duplicate features).
    label_map: Optional mapping from label token to integer class id. When
      omitted, the notebook-level `class_map` is used.

  Returns:
    A list of rows, one per non-blank input line, each a list of
    len(lexicons) 0/1 flags followed by the class id.

  Raises:
    KeyError: If the last token of a line is not a key of the label mapping.
  """
  if label_map is None:
    label_map = class_map

  feature_matrix = []
  for line in dataset:
    tokens = line.split()
    if not tokens:
      # A blank line has no label token to read, so it is skipped instead of
      # failing with an IndexError.
      continue

    # Set lookup keeps each membership test O(1) instead of rescanning the
    # token list once per lexicon term.
    text_tokens = set(tokens[:-1])
    row = [1 if term in text_tokens else 0 for term in lexicons]
    row.append(label_map[tokens[-1]])
    feature_matrix.append(row)

  return feature_matrix

# Extract Data

## Training Data

In [5]:
# loads the training split and drops blank lines in one pass
training_data = list(filter(None, get_data_from_file(train_file_path)))

In [6]:
# preview the first 5 training records; the last whitespace-separated token of
# each record is its class label (that is how get_feature_matrix reads it)
training_data[:5]

['These two are revolting MKR MKR sexism',
 'katieandnikki stop calling yourselves pretty and hot you re not and saying it a million times doesn t make you either STFU MKR sexism',
 'The menus look like they were made by a year old little girl in this case just the mental age of a year old girl I guess MKR sexism',
 'Wish these blondes were in that How To Get Away With Murder show MKR sexism',
 'Were butchers but can t tell red poultry is not cooked salmonellaqueens MKR sexism']

In [7]:
# loads the Wiegand lexicon, one list entry per line of the lexicon file.
# NOTE(review): unlike the dataset splits, blank lines are not filtered out
# here, so an empty line in the file would become an empty-string term.
wiegand_lexicons = get_data_from_file(wiegand_lexicon_file_path)

In [9]:
# preview the first 5 Wiegand lexicon terms
wiegand_lexicons[:5]

['Hun', 'Jap', 'Jihadi', 'Yardie', 'abhor']

In [10]:
# column names for the feature frames: every lexicon term, then 'label' last
# (built as a new list so wiegand_lexicons itself is left untouched)
all_columns = wiegand_lexicons + ['label']

In [11]:
# builds the training feature matrix: one row per training record, with a 0/1
# flag per Wiegand lexicon term followed by the numeric class label
train_wiegand_feature_matrix = get_feature_matrix(training_data, wiegand_lexicons)

In [12]:
# wraps the training feature matrix in a DataFrame; all_columns supplies one
# name per lexicon term plus the trailing 'label' column
df_train_wiegand = pd.DataFrame(train_wiegand_feature_matrix, columns = all_columns)

In [13]:
# view the first 5 rows of the training feature frame
df_train_wiegand.head()

Unnamed: 0,Hun,Jap,Jihadi,Yardie,abhor,abominable,abomination,abusive,affront,agitator,...,wolly,wop,worthless,wreck,wreck.1,yarpie,yokel,zealot,zigabo,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [14]:
# saves the training feature frame as train_wiegand.csv in the current working
# directory; index=False leaves the row index out of the file
df_train_wiegand.to_csv("train_wiegand.csv", index=False)

## Testing Data

In [15]:
# loads the test split and drops blank lines in one pass
testing_data = list(filter(None, get_data_from_file(test_file_path)))

In [16]:
# preview the first 5 test records
testing_data[:5]

['I believe the correct term is bortches gailsimone BenDay I m not sexist but some girls are just straight birches honestly sexism',
 'Yeah nothing to do with growing up being told as such liamkiniery Men s MMA is far better than women MMA notsexist justbiology sexism',
 'RT Vonta I m not sexist but I d never get my hair cut by a women just like I m not racist but I d never eat chicken fried by a white p sexism',
 'Turn up your hearing aid A thought C tWINO I m not sexist but it s nearly impossible to listen to a girl talking about football on ESPN sexism',
 'RT andythewookie YesYoureSexist your right im a wanker But I don t even have enough space in five tweets to point out how much of a cu sexism']

In [17]:
# builds the test feature matrix against the same Wiegand lexicon terms, so the
# feature order matches the training matrix
test_wiegand_feature_matrix = get_feature_matrix(testing_data, wiegand_lexicons)

In [18]:
# wraps the test feature matrix in a DataFrame with the shared column names
df_test_wiegand = pd.DataFrame(test_wiegand_feature_matrix, columns = all_columns)

In [19]:
# view the first 5 rows of the test feature frame
df_test_wiegand.head()

Unnamed: 0,Hun,Jap,Jihadi,Yardie,abhor,abominable,abomination,abusive,affront,agitator,...,wolly,wop,worthless,wreck,wreck.1,yarpie,yokel,zealot,zigabo,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [20]:
# saves the test feature frame as test_wiegand.csv in the current working
# directory; index=False leaves the row index out of the file
df_test_wiegand.to_csv("test_wiegand.csv", index=False)

## Development Data

In [21]:
# loads the development split and drops blank lines in one pass
devlopment_data = list(filter(None, get_data_from_file(dev_file_path)))

In [22]:
# preview the first 5 development records
devlopment_data[:5]

['RT KweezyKevin I m not sexist but Jesus Christ Sarah palin is a complete joke in politics Hillary is no different better but not much sexism',
 'RT Deanowen YesYoureSexist PhilDoran typical woman sticking her nose in sexism',
 'RT PhilDoran Deanowen YesYoureSexist go wash some dishes or make me something delicious too eat sexism',
 'RT MarchandsEgo I m not sexist in anyway also But some of y all feminists are off your rockers sexism',
 'I have some news for you BDJ Mauri I m not sexist but sexist jokes are the funniest jokes out there in my opinion sexism']

In [23]:
# builds the development feature matrix against the same Wiegand lexicon terms,
# so the feature order matches the training matrix
dev_wiegand_feature_matrix = get_feature_matrix(devlopment_data, wiegand_lexicons)

In [24]:
# wraps the development feature matrix in a DataFrame with the shared column names
df_dev_wiegand = pd.DataFrame(dev_wiegand_feature_matrix, columns = all_columns)

In [25]:
# view the first 5 rows of the development feature frame
df_dev_wiegand.head()

Unnamed: 0,Hun,Jap,Jihadi,Yardie,abhor,abominable,abomination,abusive,affront,agitator,...,wolly,wop,worthless,wreck,wreck.1,yarpie,yokel,zealot,zigabo,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [26]:
# saves the development feature frame as dev_wiegand.csv in the current working
# directory; index=False leaves the row index out of the file
df_dev_wiegand.to_csv("dev_wiegand.csv", index=False)