In [2]:
import pandas as pd

# Locations of the pre-split feature (X) and label (y) CSV files.
x_train_path, y_train_path = './data/X_train.csv', './data/y_train.csv'
x_test_path, y_test_path = './data/X_test.csv', './data/y_test.csv'

# Read every split into its own DataFrame (same read order as before:
# x_train, y_train, y_test, x_test).
x_train, y_train, y_test, x_test = (
    pd.read_csv(csv_path)
    for csv_path in (x_train_path, y_train_path, y_test_path, x_test_path)
)


In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [4]:
class SarcasmModel:
    """
    Interface for sarcasm detection models.

    Subclasses are expected to override ``train`` and ``test``; the
    implementations here do nothing and return ``None``.

    :param var_smoothing: smoothing factor, stored as ``self.var_smoothing``.
        This base class only stores and displays the value.
    """

    def __init__(self, var_smoothing):
        self.var_smoothing = var_smoothing

    def __str__(self):
        """
        :return: human-readable description of the model.
        """
        # Must *return* a str: a __str__ that returns None makes str(obj)
        # raise TypeError. The value shown is the stored smoothing factor.
        return f"SarcasmModel(Smoothing Factor='{self.var_smoothing}')"

    def __repr__(self):
        """
        :return: description of the model, identical to ``str(self)``.
        """
        return f"SarcasmModel(Smoothing Factor='{self.var_smoothing}')"

    def train(self):
        """
        Placeholder training hook; does nothing and returns ``None``.
        """
        pass

    def test(self):
        """
        Placeholder evaluation hook; does nothing and returns ``None``.
        """
        pass


In [5]:
class RandomForestSarcasm(SarcasmModel):
    """
    Sarcasm classifier that combines a TF-IDF text representation with a Random Forest.

    ``train`` fits a TfidfVectorizer on the training texts and then fits a
    RandomForestClassifier on the resulting matrix; ``test`` re-uses both fitted
    objects to predict labels for new texts.
    """

    def __init__(self, n_estimators=100, max_depth=None, random_state=None):
        """
        Stores the hyper-parameters later forwarded to RandomForestClassifier.
        :param n_estimators: Number of trees in the forest.
        :param max_depth: Maximum depth of each tree (None is passed through unchanged).
        :param random_state: Seed passed through to the classifier.
        """
        # The base class requires a smoothing value; Random Forest does not use it.
        super().__init__(var_smoothing=0)
        self.random_state = random_state
        self.max_depth = max_depth
        self.n_estimators = n_estimators

    def train(self, x_train, y_train):
        """
        Fits the TF-IDF vectorizer and the Random Forest on the training data.
        :param x_train: training texts handed to TfidfVectorizer.fit_transform.
        :param y_train: training labels handed to RandomForestClassifier.fit.
        :return: tuple (fitted TF-IDF vectorizer, fitted Random Forest model).
        """
        vectorizer = TfidfVectorizer()
        train_matrix = vectorizer.fit_transform(x_train)

        forest = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            random_state=self.random_state,
        )
        forest.fit(train_matrix, y_train)

        return vectorizer, forest

    def test(self, training_model, tfidf, x_test):
        """
        Predicts labels for the test texts with an already fitted model.
        :param training_model: fitted model, as returned by ``train``.
        :param tfidf: fitted TF-IDF vectorizer, as returned by ``train``.
        :param x_test: test texts to vectorize and classify.
        :return: predictions produced by ``training_model.predict``.
        """
        # Only transform here: the vocabulary was fixed when the vectorizer was fitted.
        return training_model.predict(tfidf.transform(x_test))


In [6]:
from sklearn.metrics import confusion_matrix


# Encode the string class labels as integers: 'notsarc' -> 0, 'sarc' -> 1.
y_train_mapped, y_test_mapped = (
    labels['class'].map({'notsarc': 0, 'sarc': 1}).values
    for labels in (y_train, y_test)
)

# Fit the TF-IDF + Random Forest pipeline on the training texts.
rf_sarcasm = RandomForestSarcasm()
tfidf, trained_rf_model = rf_sarcasm.train(x_train['text'], y_train_mapped)

# Predict on the held-out texts and tabulate true labels against predictions.
y_pred_rf = rf_sarcasm.test(trained_rf_model, tfidf, x_test['text'])
confusion_matrix_rf = confusion_matrix(y_test_mapped, y_pred_rf)

confusion_matrix_rf


array([[630, 296],
       [266, 686]], dtype=int64)