In [None]:
import os 
os.chdir('./nlp_assignment-master')

In [19]:
import pandas as pd
import os

# Load the annotations metadata
annotations_path = './data/hate-speech-dataset/annotations_metadata.csv'
annotations_df = pd.read_csv(annotations_path)

# Base paths for train and test files
train_base_path = './data/hate-speech-dataset/sampled_train'
test_base_path = './data/hate-speech-dataset/sampled_test'

# Function to load text files and merge with metadata
def load_and_merge_texts(file_base_path, annotations_df):
    merged_data = []
    for file_id in os.listdir(file_base_path):
        file_path = os.path.join(file_base_path, file_id)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            metadata = annotations_df[annotations_df['file_id'] == file_id.split('.')[0]].iloc[0]
            merged_data.append({
                'file_id': metadata['file_id'],
                'user_id': metadata['user_id'],
                'subforum_id': metadata['subforum_id'],
                'num_contexts': metadata['num_contexts'],
                'label': metadata['label'],
                'text': text
            })
    return pd.DataFrame(merged_data)

# Load and merge train and test data
train_data = load_and_merge_texts(train_base_path, annotations_df)
test_data = load_and_merge_texts(test_base_path, annotations_df)

# # Save to CSV
# train_data.to_csv('./data/train_data.csv', index=False)
# test_data.to_csv('./data/test_data.csv', index=False)


In [None]:
import pandas as pd

# Assuming filtered_data is your DataFrame and it includes the 'label' column

# Convert the 'label' column to a categorical type if it isn't already
test_data['label'] = test_data['label'].astype('category')

# Use the category codes as numerical labels
test_data['label_id'] = test_data['label'].cat.codes
test_data.to_csv('./data/hate-speech-dataset/test_data.csv')

In [24]:
import pandas as pd
# Calculate lengths of each text entry
text_lengths = train_data['text'].apply(len)

# Calculate minimum, maximum, and average length
min_length = text_lengths.min()
max_length = text_lengths.max()
average_length = text_lengths.mean()

print(f'Minimum text length: {min_length}')
print(f'Maximum text length: {max_length}')
print(f'Average text length: {average_length:.2f}')


Minimum text length: 3
Maximum text length: 1582
Average text length: 104.37


In [25]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
train_data['text'] = train_data['text'].str.lower()

# Define a function to remove stop words from a single string
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Apply the 'remove_stopwords' function to the 'Text' column
train_data['text'] = train_data['text'].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/adham.ibrahim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
# Define a function to perform stemming on a single string
def perform_stemming(text):
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    stemmed_words = []
    for token in tokens:
        stemmed_words.append(stemmer.stem(token))
    return ' '.join(stemmed_words)

# Apply the 'perform_stemming' function to the 'Text' column
train_data['text'] = train_data['text'].apply(perform_stemming)

# Print the updated DataFrame
#print(data_text)

In [27]:
import pandas as pd
import re


# Define a function to remove non-alphabetic characters from a single string
def remove_non_alphabetic(text):
    pattern = r'[^a-zA-Z\s]'
    return re.sub(pattern, '', text)

# Apply the 'remove_non_alphabetic' function to the 'Text' column
train_data['text'] = train_data['text'].apply(remove_non_alphabetic)



In [35]:
train_data['text'].head()

0                                           br  tapio 
1                                         thank  c  r 
2    would like see white peopl get togeth get us w...
3    sad happen want take think want date one negro...
4                        http  rnebarkashovrufotoffjpg
Name: text, dtype: object

In [36]:
train_data['text'][4]

'http  rnebarkashovrufotoffjpg'

In [44]:
# Assuming combined_data is your DataFrame

# Keep rows where the 'text' column does NOT start with 'http'
filtered_data = train_data[~train_data['text'].str.startswith('http')]

# Now, filtered_data contains only the rows where the 'text' column doesn't start with "http"


In [49]:
filtered_data.head()

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label,text,type
0,14096493_3,589735,1381,0,noHate,br tapio,train
1,14097600_1,598929,1381,0,noHate,thank c r,train
2,13946095_2,594398,1388,0,noHate,would like see white peopl get togeth get us w...,train
3,13483062_1,575681,1346,1,hate,sad happen want take think want date one negro...,train
5,31706302_10,586694,1363,0,noHate,appli techniqu one mani occas seen consider su...,train


In [53]:
import pandas as pd

# Assuming filtered_data is your DataFrame and it includes the 'label' column

# Convert the 'label' column to a categorical type if it isn't already
filtered_data['label'] = filtered_data['label'].astype('category')

# Use the category codes as numerical labels
filtered_data['label_id'] = filtered_data['label'].cat.codes

# Now, filtered_data has a new column 'label_id' with numeric encoding of the 'label' column


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['label'] = filtered_data['label'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['label_id'] = filtered_data['label'].cat.codes


In [55]:
filtered_data

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label,text,type,label_id
0,14096493_3,589735,1381,0,noHate,br tapio,train,1
1,14097600_1,598929,1381,0,noHate,thank c r,train,1
2,13946095_2,594398,1388,0,noHate,would like see white peopl get togeth get us w...,train,1
3,13483062_1,575681,1346,1,hate,sad happen want take think want date one negro...,train,0
5,31706302_10,586694,1363,0,noHate,appli techniqu one mani occas seen consider su...,train,1
...,...,...,...,...,...,...,...,...
2387,30586792_1,739402,1348,0,noHate,next time see someon pant fall help,test,1
2388,14100101_1,599177,1381,0,noHate,get ride see time summer cool need get someon ...,test,1
2389,14672042_2,575713,1371,6,hate,likelyhood s jew paint road skank follow,test,0
2390,13494648_1,575123,1395,0,noHate,compar pictur pictur serb nt see much differ a...,test,1


In [None]:
data.dropna(subset=['text'], inplace=True)

In [62]:
train_data.to_csv('./data/hate-speech-dataset/train_data.csv')

In [63]:
train_data['label'].value_counts()

label
hate      953
noHate    938
Name: count, dtype: int64