In [None]:
"""

This section of the code is responsible for importing necessary libraries, downloading required NLTK resources,
and loading the Twitter dataset for analysis.

"""


# Importing necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computing
from sklearn.feature_extraction.text import CountVectorizer  # For converting text data into a matrix of token counts
from sklearn.model_selection import train_test_split  # For splitting datasets into training and testing sets
from sklearn.tree import DecisionTreeClassifier  # For training a decision tree classifier
import re  # For regular expression operations
import nltk  # For natural language processing tasks
from nltk.corpus import stopwords  # For accessing common stopwords in various languages
import string  # For string constants

# Download the NLTK stopword corpus only when it is not already available locally.
# nltk.data.find raises LookupError for a missing resource, so the downloader
# (and its network round-trip) is skipped on every run after the first.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Creating a SnowballStemmer for stemming English words
stemmer = nltk.SnowballStemmer("english")

# Fetching English stopwords (kept as a set for fast membership tests)
english_stopwords = set(stopwords.words('english'))

# Loading the Twitter dataset; the path is relative to the notebook's working directory
twitter_data = pd.read_csv("twitter.csv")

# Displaying the first few rows of the dataset
print(twitter_data.head())

In [None]:
# Translate the numeric codes in the "class" column into readable category names.
# Codes that are not keys of this dictionary come out of Series.map as NaN.
class_label_mapping = {
    0: "Hate Speech",
    1: "Offensive Language",
    2: "No Hate and Offensive",
}

# Store the readable names next to the original codes in a new "labels" column
twitter_data["labels"] = twitter_data["class"].map(class_label_mapping)

# Preview the frame with the new column attached
print(twitter_data.head())

In [None]:
# Keep only the tweet text and its descriptive label in a separate two-column
# DataFrame, 'tweet_labels_data', leaving 'twitter_data' itself unchanged.
tweet_labels_data = twitter_data.loc[:, ["tweet", "labels"]]

# Preview the reduced frame
print(tweet_labels_data.head())

In [None]:
import re
import string
from nltk.corpus import stopwords

def preprocess_text(text, stop_words=None, stem_fn=None):
    """
    Clean a piece of text for bag-of-words modelling.

    The text is lowercased; URLs, HTML tags, punctuation and words containing
    digits are removed; the remainder is split on any whitespace, stopwords are
    dropped and every surviving word is stemmed.

    Parameters:
    text (str): The input text to be preprocessed. Non-string values are
        converted with str() first.
    stop_words (collection of str, optional): Words to discard. Defaults to the
        notebook-level `english_stopwords`.
    stem_fn (callable, optional): Function mapping a word to its stem. Defaults
        to `stemmer.stem` of the notebook-level `stemmer`.

    Returns:
    str: The preprocessed words joined by single spaces ("" when nothing is left).
    """
    # Fall back to the notebook-level resources when the caller supplies none
    if stop_words is None:
        stop_words = english_stopwords
    if stem_fn is None:
        stem_fn = stemmer.stem

    # Convert the text to lowercase
    text = str(text).lower()

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>+', '', text)

    # Remove punctuation
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)

    # Remove words containing digits
    text = re.sub(r'\w*\d\w*', '', text)

    # Split on any run of whitespace: newlines and tabs act as word separators
    # (deleting newlines outright would glue the neighbouring words together) and
    # no empty tokens are produced. Stopwords are dropped and the rest stemmed
    # in the same pass.
    words = [stem_fn(word) for word in text.split() if word not in stop_words]

    # Join the stemmed words back into a string
    return " ".join(words)

# Apply the preprocess_text function to the "tweet" column.
# The untouched text is stashed in "tweet_raw" the first time this cell runs and
# is always used as the input afterwards, so re-running the cell does not clean
# and stem text that has already been processed.
if "tweet_raw" not in twitter_data.columns:
    twitter_data["tweet_raw"] = twitter_data["tweet"]
twitter_data["tweet"] = twitter_data["tweet_raw"].apply(preprocess_text)
twitter_data


In [None]:
# One seed shared by the train/test split and the classifier, so that a fresh
# "Restart & Run All" reproduces the same fitted tree and the same metrics.
RANDOM_STATE = 42

# Convert the 'tweet' column into a numpy array for processing
tweet_data_array = np.array(twitter_data["tweet"])
# Convert the 'labels' column into a numpy array for processing
labels_data_array = np.array(twitter_data["labels"])

# Initialize a CountVectorizer to convert text data into a matrix of token counts
vectorizer = CountVectorizer()
# Fit the vectorizer to the tweet data and transform it into a matrix of token counts.
# NOTE: the vocabulary is learned from every row, including the rows that the
# split below assigns to the test set.
vectorized_tweet_data = vectorizer.fit_transform(tweet_data_array)

# Split the vectorized tweet data and labels into training (67%) and testing (33%) sets
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_tweet_data,
    labels_data_array,
    test_size=0.33,
    random_state=RANDOM_STATE,
)

# Initialize a DecisionTreeClassifier for classification; without random_state the
# fitted tree can differ between runs even though the split itself is seeded
decision_tree_classifier = DecisionTreeClassifier(random_state=RANDOM_STATE)
# Train the decision tree classifier on the training data
decision_tree_classifier.fit(X_train, y_train)

In [None]:
# Run the fitted decision tree over the held-out feature matrix to obtain one
# predicted label per test row
y_pred = decision_tree_classifier.predict(X=X_test)

# Show the predicted labels
print(y_pred)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Predict the labels for X_test again so this cell does not rely on an earlier
# cell's y_pred
y_pred = decision_tree_classifier.predict(X_test)

# Overall accuracy, followed by precision / recall / F1; the latter three use
# average='weighted' ('weighted' accounts for label imbalance)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score_fn(y_test, y_pred, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Confusion matrix of true labels against predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the headline scores, one per line, to four decimal places
print("\n".join(
    f"{title}: {value:.4f}"
    for title, value in (
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1 Score", f1),
    )
))

# Print the confusion matrix
print("\nConfusion Matrix:")
print(conf_matrix)

# Print the detailed per-class classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
