# Importing Required Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Defining Class Map

In [2]:
# Integer id assigned to each label word found at the end of a record.
class_map = {
    label_name: label_id
    for label_id, label_name in enumerate(('none', 'sexism', 'racism'))
}

# File Paths

In [3]:
# Locations of the training, development and testing splits (one file each).
train_file_path, dev_file_path, test_file_path = (
    '/content/trainset.txt',
    '/content/devset.txt',
    '/content/testset.txt',
)

# Functions

## Read Contents of a File

In [4]:
def get_data_from_file(file_path, encoding='utf-8'):
  """Read a text file and return its lines with surrounding whitespace removed.

  Args:
    file_path: Path of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding.

  Returns:
    A list with one string per line of the file, in file order. Leading and
    trailing whitespace (including the newline) is stripped from every line;
    blank lines are kept as empty strings.
  """
  with open(file_path, encoding=encoding) as f:
    # strip() already removes the trailing newline, so a separate
    # rstrip('\n') is unnecessary; iterating the handle yields the same
    # lines as readlines() without building an intermediate list.
    return [line.strip() for line in f]

## Split Labels From Data

In [5]:
def split_lables_from_data(data, label_map=None):
  """Split each record into its text and the numeric id of its trailing label.

  The last whitespace-separated word of every record is treated as the label
  and looked up in `label_map`; the remaining words are re-joined with single
  spaces to form the text.

  The input sequence is NOT modified: a new list of texts is returned. This
  keeps the call idempotent, so invoking it again on the same input gives the
  same result instead of stripping a further word from every record.

  Args:
    data: Iterable of non-empty strings, each ending with a label word.
    label_map: Optional mapping from label word to numeric id. When omitted,
      the module-level `class_map` is used.

  Returns:
    A `(texts, labels)` tuple of two lists of equal length, in input order.

  Raises:
    KeyError: If a record's last word is not a key of the label mapping.
    IndexError: If a record contains no words (empty or whitespace only).
  """
  if label_map is None:
    label_map = class_map

  texts = []
  labels = []

  for position, record in enumerate(data):
      words = record.split()
      label_word = words[-1]
      try:
          labels.append(label_map[label_word])
      except KeyError:
          # Same exception type as a plain lookup, but say which record failed.
          raise KeyError(
              f"record {position}: unknown label {label_word!r}") from None

      # Everything before the label word is the text of the record.
      texts.append(" ".join(words[:-1]))

  return texts, labels

## Get Transformed Data

In [6]:
def get_transformed_data(vec, data):
  """Apply a vectorizer to `data` and return the resulting feature matrix.

  Args:
    vec: Object exposing a `transform(data)` method.
    data: The documents handed to `vec.transform`.

  Returns:
    Whatever `vec.transform(data)` returns.
  """
  return vec.transform(data)

# Extract Data

## Training Data

In [7]:
# Load the raw training records and discard any empty lines.
training_data = [
    record for record in get_data_from_file(train_file_path) if record
]

In [8]:
# Preview the first five raw training records (label word still attached).
training_data[0:5]

['These two are revolting MKR MKR sexism',
 'katieandnikki stop calling yourselves pretty and hot you re not and saying it a million times doesn t make you either STFU MKR sexism',
 'The menus look like they were made by a year old little girl in this case just the mental age of a year old girl I guess MKR sexism',
 'Wish these blondes were in that How To Get Away With Murder show MKR sexism',
 'Were butchers but can t tell red poultry is not cooked salmonellaqueens MKR sexism']

In [9]:
# Separate every training record into its text and its numeric label.
train_data, train_label = split_lables_from_data(data=training_data)

In [10]:
# Feature extraction: count character n-grams whose length lies between
# ngram_low and ngram_high (both 3, i.e. trigrams only). The vocabulary is
# learned from the training texts.
ngram_low, ngram_high = 3, 3
vectorizer = CountVectorizer(
    ngram_range=(ngram_low, ngram_high),
    analyzer='char',
)
vectorizer.fit(train_data)

In [11]:
# Turn the training texts into character n-gram count features.
transformed_train = get_transformed_data(vec=vectorizer, data=train_data)

In [12]:
# Training feature table: one row per record, one column per n-gram.
# .toarray() materialises the matrix densely before it is wrapped.
df_train = pd.DataFrame(
    data=transformed_train.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [13]:
# Show the first five rows of the training feature table.
df_train.head(n=5)

Unnamed: 0,a,aa,ab,ac,ad,ae,af,ag,ah,ai,...,zzk,zzl,zzm,zzn,zzo,zzr,zzs,zzt,zzy,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Wrap the training labels in a single-column dataframe named 'label'.
df_train_label = pd.DataFrame(data=train_label, columns=['label'])

In [16]:
# Show the first five training labels.
df_train_label.head(n=5)

Unnamed: 0,label
0,1
1,1
2,1
3,1
4,1


In [33]:
# Place the label column to the right of the training features
# (column-wise concatenation, rows matched on the index).
df_train_data = pd.concat(objs=[df_train, df_train_label], axis='columns')

In [36]:
# Write the training features plus label to CSV, without the row index.
df_train_data.to_csv(path_or_buf="training_data_char3gram.csv", index=False)

## Testing Data

In [17]:
# Load the raw testing records and discard any empty lines.
testing_data = [
    record for record in get_data_from_file(test_file_path) if record
]

In [18]:
# Preview the first five raw testing records (label word still attached).
testing_data[0:5]

['I believe the correct term is bortches gailsimone BenDay I m not sexist but some girls are just straight birches honestly sexism',
 'Yeah nothing to do with growing up being told as such liamkiniery Men s MMA is far better than women MMA notsexist justbiology sexism',
 'RT Vonta I m not sexist but I d never get my hair cut by a women just like I m not racist but I d never eat chicken fried by a white p sexism',
 'Turn up your hearing aid A thought C tWINO I m not sexist but it s nearly impossible to listen to a girl talking about football on ESPN sexism',
 'RT andythewookie YesYoureSexist your right im a wanker But I don t even have enough space in five tweets to point out how much of a cu sexism']

In [19]:
# Separate every testing record into its text and its numeric label.
test_data, test_label = split_lables_from_data(data=testing_data)

In [20]:
# Turn the testing texts into count features using the vectorizer that was
# fitted on the training texts, so the columns match the training table.
transformed_test = get_transformed_data(vec=vectorizer, data=test_data)

In [21]:
# Testing feature table: one row per record, one column per n-gram.
# .toarray() materialises the matrix densely before it is wrapped.
df_test = pd.DataFrame(
    data=transformed_test.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [22]:
# Show the first five rows of the testing feature table.
df_test.head(n=5)

Unnamed: 0,a,aa,ab,ac,ad,ae,af,ag,ah,ai,...,zzk,zzl,zzm,zzn,zzo,zzr,zzs,zzt,zzy,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Wrap the testing labels in a single-column dataframe named 'label'.
df_test_label = pd.DataFrame(data=test_label, columns=['label'])

In [24]:
# Show the first five testing labels.
df_test_label.head(n=5)

Unnamed: 0,label
0,1
1,1
2,1
3,1
4,1


In [37]:
# Place the label column to the right of the testing features
# (column-wise concatenation, rows matched on the index).
df_test_data = pd.concat(objs=[df_test, df_test_label], axis='columns')

In [38]:
# Write the testing features plus label to CSV, without the row index.
df_test_data.to_csv(path_or_buf="testing_data_char3gram.csv", index=False)

## Development Data

In [25]:
# Load the raw development records and discard any empty lines.
devlopment_data = [
    record for record in get_data_from_file(dev_file_path) if record
]

In [26]:
# Preview the first five raw development records (label word still attached).
devlopment_data[0:5]

['RT KweezyKevin I m not sexist but Jesus Christ Sarah palin is a complete joke in politics Hillary is no different better but not much sexism',
 'RT Deanowen YesYoureSexist PhilDoran typical woman sticking her nose in sexism',
 'RT PhilDoran Deanowen YesYoureSexist go wash some dishes or make me something delicious too eat sexism',
 'RT MarchandsEgo I m not sexist in anyway also But some of y all feminists are off your rockers sexism',
 'I have some news for you BDJ Mauri I m not sexist but sexist jokes are the funniest jokes out there in my opinion sexism']

In [27]:
# Separate every development record into its text and its numeric label.
develop_data, develop_label = split_lables_from_data(data=devlopment_data)

In [28]:
# Turn the development texts into count features using the vectorizer that
# was fitted on the training texts, so the columns match the training table.
transformed_develop = get_transformed_data(vec=vectorizer, data=develop_data)

In [29]:
# Development feature table: one row per record, one column per n-gram.
# .toarray() materialises the matrix densely before it is wrapped.
df_develop = pd.DataFrame(
    data=transformed_develop.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [30]:
# Show the first five rows of the development feature table.
df_develop.head(n=5)

Unnamed: 0,a,aa,ab,ac,ad,ae,af,ag,ah,ai,...,zzk,zzl,zzm,zzn,zzo,zzr,zzs,zzt,zzy,zzz
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Wrap the development labels in a single-column dataframe named 'label'.
df_develop_label = pd.DataFrame(data=develop_label, columns=['label'])

In [32]:
# Show the first five development labels.
df_develop_label.head(n=5)

Unnamed: 0,label
0,1
1,1
2,1
3,1
4,1


In [39]:
# Place the label column to the right of the development features
# (column-wise concatenation, rows matched on the index).
df_develop_data = pd.concat(objs=[df_develop, df_develop_label], axis='columns')

In [40]:
# Write the development features plus label to CSV, without the row index.
df_develop_data.to_csv(path_or_buf="development_data_char3gram.csv", index=False)