## Import data

In [3]:
import pandas as pd

# Path of the pre-processed headlines CSV, relative to the notebook's working directory.
data_file = 'data/processed_data.csv'

# Column 0 of the CSV becomes the DataFrame index instead of a regular column.
df = pd.read_csv(filepath_or_buffer=data_file, index_col=0)
df

Unnamed: 0,headline,is_sarcastic
0,thirtysomething scientists unveil doomsday clo...,1
1,dem rep totally nails congress falling short g...,0
2,eat veggies deliciously different recipes,0
3,inclement weather prevents liar getting work,1
4,mother comes pretty close using word streaming...,1
...,...,...
28614,jews celebrate rosh hashasha something,1
28615,internal affairs investigator disappointed con...,1
28616,beautiful acceptance speech week came queer ko...,0
28617,mars probe destroyed orbiting spielberggates s...,1


## Machine Learning - Naive Bayes

In [8]:
# Take the headline column and split each entry on whitespace (the default
# separator) so every row becomes a list of words; the result is displayed.
headlines = df.loc[:, 'headline']
split_headlines = headlines.str.split(pat=None)
split_headlines

0        [thirtysomething, scientists, unveil, doomsday...
1        [dem, rep, totally, nails, congress, falling, ...
2          [eat, veggies, deliciously, different, recipes]
3        [inclement, weather, prevents, liar, getting, ...
4        [mother, comes, pretty, close, using, word, st...
                               ...                        
28614         [jews, celebrate, rosh, hashasha, something]
28615    [internal, affairs, investigator, disappointed...
28616    [beautiful, acceptance, speech, week, came, qu...
28617    [mars, probe, destroyed, orbiting, spielbergga...
28618                         [dad, clarifies, food, stop]
Name: headline, Length: 28503, dtype: object

In [9]:
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline



def tokenize_basic(text):
    """Return the whitespace-separated pieces of ``text`` as a list."""
    tokens = text.split()
    return tokens

def tokenize_nltk(text):
    """Tokenise ``text`` with NLTK's ``word_tokenize`` and return its result."""
    tokens = word_tokenize(text)
    return tokens

def tokenize_tweet(text):
    """Tokenise ``text`` with a newly constructed NLTK ``TweetTokenizer``."""
    return TweetTokenizer().tokenize(text)

class TokenizerTransformer(TransformerMixin, BaseEstimator):
    """Pipeline step that re-tokenises each text with ``tokenizer_func``.

    Each input text is passed to ``tokenizer_func`` and the returned tokens
    are joined back together with single spaces, so the following step
    receives one plain string per document.

    Inheriting from ``BaseEstimator`` (listed after the mixin, as scikit-learn
    recommends) provides ``get_params``/``set_params``, which scikit-learn
    uses to clone and configure pipeline steps.

    Parameters
    ----------
    tokenizer_func : callable
        Called with a single text; must return an iterable of string tokens.
    """

    def __init__(self, tokenizer_func):
        # Stored unchanged under the parameter's own name so that
        # ``get_params`` can read it back.
        self.tokenizer_func = tokenizer_func

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Return a list with one space-joined token string per text in ``X``."""
        return [' '.join(self.tokenizer_func(text)) for text in X]

# Splitting the dataset: 20% of the rows are held out for evaluation and the
# fixed random_state keeps the split identical between runs.
# Note: pass the Series itself (df['headline']); its `.str` accessor is not
# array-like and scikit-learn rejects it with a TypeError.
X_train, X_test, y_train, y_test = train_test_split(
    df['headline'],
    df['is_sarcastic'],
    test_size=0.2,
    random_state=42,
)

# Naive Bayes Model with different tokenization methods
for tokenizer_func in [tokenize_basic, tokenize_nltk, tokenize_tweet]:
    # Create a pipeline with our custom tokenizer and a CountVectorizer.
    # TokenizerTransformer emits space-joined tokens, so the vectorizer must
    # split on whitespace only. With its default token_pattern it would
    # re-tokenise the text (dropping punctuation and one-character tokens)
    # and all three tokenizers would produce nearly the same features.
    # token_pattern=None marks the pattern as intentionally unused.
    pipeline = make_pipeline(
        TokenizerTransformer(tokenizer_func),
        CountVectorizer(tokenizer=str.split, token_pattern=None),
        MultinomialNB()
    )

    # Train the model
    pipeline.fit(X_train, y_train)

    # Predict on the test set
    y_pred = pipeline.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Tokenization Method: {tokenizer_func.__name__}")
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

TypeError: Expected sequence or array-like, got <class 'pandas.core.strings.accessor.StringMethods'>