In [1]:
#load libraries
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset


In [2]:
# Load  dataset
data = pd.read_csv('EnglishFinal.csv')  # Replace 'YourDataset.csv' with your dataset file
data

Unnamed: 0,text,label
0,You can go to auctomuckty and to drumnadrocit ...,Not Hate
1,Good luck. I wouldn't trust any women out ther...,Hate
2,what a pile of wank my cousin is,Not Hate
3,jews are Esau - direct quote,Not Hate
4,"like what the actual fuck, it was so infuriati...",Not Hate
...,...,...
11995,Very interesting paper. Although I was expecti...,Not Hate
11996,"As an Arab here on Reddit, I'm really scared o...",Not Hate
11997,I really hate our area since dirty fucking pak...,Hate
11998,I wish this incompetent cunt would just drop d...,Not Hate


In [3]:
# Split data into features (text) and labels
X = data['text']
y = data['label']


In [4]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
# Count Vectorization
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit the label encoder on your string labels and transform them to numerical labels
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


In [6]:
# Create PyTorch tensors for input data and labels
X_train_tensor = torch.tensor(X_train_vec.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_vec.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

In [7]:
# Define a deep feedforward NLP model
class DeepNLPModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_hidden_layers, output_size):
        super(DeepNLPModel, self).__init__()
        # Define your model architecture here
        self.fc_layers = nn.ModuleList([nn.Linear(input_size, hidden_size)])
        self.relu = nn.ReLU()
        for _ in range(num_hidden_layers - 1):
            self.fc_layers.append(nn.Linear(hidden_size, hidden_size))
        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        for layer in self.fc_layers:
            x = self.relu(layer(x))
        out = self.output_layer(x)
        return out

In [8]:
# Initialize the deep NLP model
input_size = X_train_vec.shape[1]
hidden_size = 128  # Adjust as needed
num_hidden_layers = 2  # Number of hidden layers (adjust as needed)
output_size = len(label_encoder.classes_)  # Number of unique labels

model = DeepNLPModel(input_size, hidden_size, num_hidden_layers, output_size)



In [9]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [10]:
# Create DataLoader for training
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)


In [11]:
# Training loop
num_epochs = 20  # Adjust as needed
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/20], Loss: 0.5213
Epoch [2/20], Loss: 0.3315
Epoch [3/20], Loss: 0.2826
Epoch [4/20], Loss: 0.0513
Epoch [5/20], Loss: 0.0122
Epoch [6/20], Loss: 0.0362
Epoch [7/20], Loss: 0.0549
Epoch [8/20], Loss: 0.0500
Epoch [9/20], Loss: 0.0030
Epoch [10/20], Loss: 0.0078
Epoch [11/20], Loss: 0.0728
Epoch [12/20], Loss: 0.0179
Epoch [13/20], Loss: 0.1063
Epoch [14/20], Loss: 0.0225
Epoch [15/20], Loss: 0.0498
Epoch [16/20], Loss: 0.0003
Epoch [17/20], Loss: 0.0003
Epoch [18/20], Loss: 0.0098
Epoch [19/20], Loss: 0.0105
Epoch [20/20], Loss: 0.0001


In [12]:
# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    _, predicted = torch.max(test_outputs, 1)
    correct = (predicted == y_test_tensor).sum().item()
    total = len(y_test_tensor)
    accuracy = correct / total

print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 61.86%


In [13]:
import joblib

In [14]:
joblib.dump(vectorizer , "English_Vectorizer.pkl")
joblib.dump(label_encoder, "English_Label_Encoder.pkl")
torch.save(model.state_dict , "English_Language_Model.pth")