In [1]:
import re
import string
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [2]:
# Load data
train = pd.read_csv('processed_train_data.csv')
test = pd.read_csv('processed_test_data.csv')

In [3]:
# Preprocessing
train.dropna(inplace=True)


In [4]:
stop_words = set(stopwords.words('english'))
additional_stop_words = {'u', 'im', 'not', 'no', 'never', 'neither', 'nor'}
stop_words |= additional_stop_words

In [6]:

stop_words = set(stopwords.words('english'))

# Dictionary for common typos and slangs
typos_slangs = {
    "dont": "don't",
    "cant": "can't",
    "lol": "laugh out loud",
    "brb": "be right back",
    "jk": "just kidding",
    # Add more typos and slangs as needed
}

def clean_text(text):
    text = str(text).lower()
    
    # Use raw strings (prefix with r) to prevent SyntaxWarnings
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    
    # Replace typos and slangs
    for typo, correction in typos_slangs.items():
        text = text.replace(typo, correction)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(tokens)
    lemmatized_tokens = [
        lemmatizer.lemmatize(token, pos=pos[0].lower()) if pos[0].lower() in ['n', 'v', 'a']
        else lemmatizer.lemmatize(token)
        for token, pos in tagged_tokens
    ]

    # Handle negations
    processed_tokens = []
    negation = False
    for token in lemmatized_tokens:
        if token in {'not', 'no', 'never', 'neither', 'nor', "cannot", "won't"}:
            negation = True
        elif negation:
            token = 'not_' + token
            negation = False
        processed_tokens.append(token)
    
    processed_text = ' '.join([word for word in processed_tokens if word not in stop_words])
    
    return processed_text


In [7]:
train['text'] = train['text'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)

In [8]:

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train['text'])
X_test_tfidf = tfidf_vectorizer.transform(test['text'])

In [9]:
X_train_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 191880 stored elements and shape (27413, 23963)>

In [10]:
y_train = train['sentiment']
y_test = test['sentiment']

In [11]:
# Model training
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, y_train)

In [12]:
# Model evaluation
test_pred = svm_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, test_pred)
print("Accuracy (SVM with TF-IDF): ", accuracy * 100)

Accuracy (SVM with TF-IDF):  71.58317872603587


In [13]:
# Example predictions
input_sentences = ["horror", "better"] 
for input_sentence in input_sentences:
    processed_input = clean_text(input_sentence)
    X_input = tfidf_vectorizer.transform([processed_input])
    predicted_label = svm_model.predict(X_input)
    print(f"Input: {input_sentence}, Processed Input: {processed_input}, Predicted class label: {predicted_label}")

Input: horror, Processed Input: horror, Predicted class label: ['negative']
Input: better, Processed Input: better, Predicted class label: ['positive']


In [14]:
train[100:200]

Unnamed: 0,text,sentiment
101,certainly notcheer huh,neutral
102,week post myhorrible traumatic jumping cholla ...,negative
103,realy want go cause nice everybodys busy,negative
104,awesome im ocean beach know way yourbiggestfan...,positive
105,least get watch time let go pen,neutral
...,...,...
196,im soo boredim deffo miss music channel,negative
197,nite nite bday girl fun concert,positive
198,nicotine replacement patch hour far good sleep...,neutral
199,sanderson whats twatter lately either ca nt ge...,neutral


In [15]:
import pickle


with open('svm_classifier.pkl', 'wb') as f:
    pickle.dump((tfidf_vectorizer, svm_model), f)


In [16]:
import pickle


# Load
with open('svm_classifier.pkl', 'rb') as f:
    tfidf_loaded, svm_model_loaded = pickle.load(f)

In [17]:
# Example predictions
input_sentences = ["best", "i am bad what the hell"] 
for input_sentence in input_sentences:
    processed_input = clean_text(input_sentence)
    X_input = tfidf_loaded.transform([processed_input])
    predicted_label = svm_model_loaded.predict(X_input)
    print(f"Input: {input_sentence}, Processed Input: {processed_input}, Predicted class label: {predicted_label}")

Input: best, Processed Input: best, Predicted class label: ['positive']
Input: i am bad what the hell, Processed Input: bad hell, Predicted class label: ['negative']
