In [1]:
import re
import string
import numpy as np 
import pandas as pd 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [2]:
from nb_classifier import NaiveBayesClassifier  

In [3]:
# Load data
train = pd.read_csv('processed_train_data.csv')
test = pd.read_csv('processed_test_data.csv')

In [4]:
# Preprocessing
train.dropna(inplace=True)
stop_words = set(stopwords.words('english'))
additional_stop_words = {'u', 'im'}
stop_words |= additional_stop_words

In [5]:
stop_words = set(stopwords.words('english'))

# Dictionary for common typos and slangs
typos_slangs = {
    "dont": "don't",
    "cant": "can't",
    "lol": "laugh out loud",
    "brb": "be right back",
    "jk": "just kidding",
    # Add more typos and slangs as needed
}

def clean_text(text):
    text = str(text).lower()
    
    # Use raw strings (prefix with r) to prevent SyntaxWarnings
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    
    # Replace typos and slangs
    for typo, correction in typos_slangs.items():
        text = text.replace(typo, correction)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(tokens)
    lemmatized_tokens = [
        lemmatizer.lemmatize(token, pos=pos[0].lower()) if pos[0].lower() in ['n', 'v', 'a']
        else lemmatizer.lemmatize(token)
        for token, pos in tagged_tokens
    ]

    # Handle negations
    processed_tokens = []
    negation = False
    for token in lemmatized_tokens:
        if token in {'not', 'no', 'never', 'neither', 'nor', "cannot", "won't"}:
            negation = True
        elif negation:
            token = 'not_' + token
            negation = False
        processed_tokens.append(token)
    
    processed_text = ' '.join([word for word in processed_tokens if word not in stop_words])
    
    return processed_text


In [6]:
train['text'] = train['text'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)

In [7]:
# Feature extraction
X_train = train['text']
X_test = test['text']
y_train = train['sentiment']
y_test = test['sentiment']


In [8]:
X_train

0                                            id respond go
1                                  sooo sad miss san diego
2                                                bos bully
3                                    interview leave alone
4                      son couldnt put release already buy
                               ...                        
27475    wish could come see u denver husband lose job ...
27476    ive wonder rake client make clear net nt force...
27477    yay good enjoy break probably need hectic week...
27478                                                worth
27479                           flirt go atg smile yay hug
Name: text, Length: 27413, dtype: object

In [9]:
# # Training Naive Bayes Classifier
# nb_classifier = NaiveBayesClassifier()
nb_classifier.train(X_train, y_train)

NameError: name 'nb_classifier' is not defined

In [None]:
# Prediction
y_pred = nb_classifier.predict(X_test)

In [None]:
# Model evaluation
accuracy = np.mean(y_pred == y_test)
print("Accuracy (Naive Bayes from scratch): ", accuracy * 100)


In [None]:
# Example predictions
input_sentences = ["highly appreciated", "hell"] 
for input_sentence in input_sentences:
    processed_input = clean_text(input_sentence)
    predicted_label = nb_classifier.predict([processed_input])[0]
    print(f"Input: {input_sentence}, Processed Input: {processed_input}, Predicted class label: {predicted_label}")

In [15]:
import pickle


with open('naive_b_classifier.pkl', 'wb') as f:
    pickle.dump(nb_classifier, f)
