<a href="https://colab.research.google.com/github/springboardmentor891v/HATE_SPEECH_DETECTION_INFOSYS_INTERNSHIP_OCT2024/blob/main/Thangirala_Pravallika_Hate_Speech_Detection_Infosys_Intern.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Step 1: Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string


In [3]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# Step 2: Load the dataset
data = pd.read_csv('/content/labeled_data.csv')

In [5]:
# Select features (text) and target (labels)
X = data['tweet']  # Text data column
y = data['class']  # Target column

In [6]:
# Step 3: Preprocessing function
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    # Remove punctuation
    tokens = [word for word in tokens if word not in string.punctuation]
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Apply preprocessing to all tweets
print("\nPreprocessing text data...")
X_preprocessed = X.apply(preprocess_text)


Preprocessing text data...


In [7]:
# Step 4: Convert text into numerical features using TF-IDF
print("\nConverting text to numerical features...")
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(X_preprocessed)



Converting text to numerical features...


In [8]:
# Step 5: Split the data into training, validation, and test sets
print("\nSplitting the data...")
X_train, X_temp, y_train, y_temp = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


Splitting the data...


In [9]:
# Step 6: Train a Random Forest model
print("Training the Random Forest model...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

Training the Random Forest model...


In [10]:
# Step 7: Evaluate the model
print("\nEvaluating the model...")
train_accuracy = accuracy_score(y_train, rf_model.predict(X_train))
val_accuracy = accuracy_score(y_val, rf_model.predict(X_val))
test_accuracy = accuracy_score(y_test, rf_model.predict(X_test))

print("Training Accuracy:", round(train_accuracy * 100), "%")
print("Validation Accuracy:", round(val_accuracy * 100), "%")
print("Test Accuracy:", round(test_accuracy * 100), "%")



Evaluating the model...
Training Accuracy: 99 %
Validation Accuracy: 90 %
Test Accuracy: 90 %


In [11]:
# Define the mapping of numeric predictions to labels
def map_prediction_to_label(prediction):
    if prediction == 1:
        return "Non-Hate Speech"
    elif prediction == 2:
        return "Hate Speech"
    else:
        return "Unknown"

# Test the model with a new tweet
test_data = "i am very happy!"

# Preprocess the test data
test_data_preprocessed = preprocess_text(test_data)

# Convert the preprocessed text to numerical features using TF-IDF
test_data_tfidf = tfidf_vectorizer.transform([test_data_preprocessed])

# Make the prediction using the trained model
prediction = rf_model.predict(test_data_tfidf)[0]

print("testing tweet(value)):", prediction)
print("testing tweet(label):", map_prediction_to_label(prediction))
# Predict on the test set
y_test_pred = rf_model.predict(X_test) # Changed rf.model to rf_model
# Calculate accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy on Test data:",round(test_accuracy * 100,8), "%")

testing tweet(value)): 2
testing tweet(label): Hate Speech
Accuracy on Test data: 89.83324368 %
