<a href="https://colab.research.google.com/github/akash22ak/Sentiment-Polarity-Classification/blob/master/spc_linear.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

In [2]:
# SVM implementation from scratch with corrected label mapping
class SVM:
    """Linear soft-margin SVM trained by per-sample sub-gradient descent.

    The decision function is ``f(x) = np.dot(x, w) - b``. Each training step
    applies the gradient of the L2 penalty ``lambda_param * ||w||^2`` and, when
    the sample violates the margin (``y * f(x) < 1``), the hinge-loss
    sub-gradient as well.

    Parameters
    ----------
    learning_rate : float
        Step size used for every update of ``w`` and ``b``.
    lambda_param : float
        Weight of the L2 regularisation term.
    n_iters : int
        Number of full passes over the training samples.

    Attributes
    ----------
    w : numpy.ndarray or None
        Weight vector of length ``n_features``; ``None`` until ``fit`` runs.
    b : float or None
        Bias term; ``None`` until ``fit`` runs.
    """

    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None

    def fit(self, X, y):
        """Learn ``w`` and ``b`` from a dense feature matrix and binary labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Dense numeric features (nested lists are accepted).
        y : array-like of length n_samples
            Binary labels. Any value ``<= 0`` is treated as the negative class
            and any value ``> 0`` as the positive class, so both the ``{0, 1}``
            and the ``{-1, +1}`` encodings work.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or ``y`` does not hold exactly one
            label per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels"
            )

        # Map labels onto {-1, +1}. Using `<= 0` (rather than `== 0`) keeps
        # labels that are already -1/+1 from collapsing into a single class.
        y_ = np.where(y <= 0, -1, 1)

        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                # True when the sample lies on the correct side with margin >= 1.
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    # Only the regulariser contributes to the gradient.
                    self.w -= self.learning_rate * (2 * self.lambda_param * self.w)
                else:
                    # Margin violated: regulariser plus hinge-loss sub-gradient.
                    self.w -= self.learning_rate * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b -= self.learning_rate * y_[idx]

    def predict(self, X):
        """Return the sign of the decision function for every row of ``X``.

        The result uses the signed encoding: ``-1.0`` for the negative class,
        ``1.0`` for the positive class and ``0.0`` for a point lying exactly on
        the decision boundary. It is not mapped back to 0/1 labels here.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if self.w is None or self.b is None:
            raise ValueError("SVM instance is not fitted yet; call fit() before predict()")
        linear_output = np.dot(np.asarray(X, dtype=float), self.w) - self.b
        return np.sign(linear_output)


In [3]:
# Load data function to read the dataset and split it into training, validation, and test sets
def load_data(pos_path='rt-polarity.pos', neg_path='rt-polarity.neg',
              n_train=4000, n_val=500, random_state=42):
    """Read the two polarity files and split them into train/validation/test frames.

    Every line of ``pos_path`` becomes one row labelled 1 and every line of
    ``neg_path`` one row labelled 0. The split is taken per class, in file
    order: the first ``n_train`` lines go to training, the next ``n_val`` to
    validation and everything after that to test.

    Parameters
    ----------
    pos_path, neg_path : str
        Paths of the positive and negative sentence files (read as latin-1).
    n_train : int
        Number of sentences taken from each class for training.
    n_val : int
        Number of sentences taken from each class for validation.
    random_state : int
        Seed used to shuffle the training rows.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, val_data, test_data)``, each with columns ``text`` and
        ``label`` and a fresh 0..n-1 index. Only the training frame is
        shuffled; validation and test keep positives first, then negatives.

    Raises
    ------
    ValueError
        If ``n_train`` or ``n_val`` is negative.
    """
    if n_train < 0 or n_val < 0:
        raise ValueError("n_train and n_val must be non-negative")

    with open(pos_path, 'r', encoding='latin-1') as pos_file, \
            open(neg_path, 'r', encoding='latin-1') as neg_file:
        positive_sentences = pos_file.readlines()
        negative_sentences = neg_file.readlines()

    # Labels: 1 for positive, 0 for negative.
    pos_df = pd.DataFrame({'text': positive_sentences, 'label': [1] * len(positive_sentences)})
    neg_df = pd.DataFrame({'text': negative_sentences, 'label': [0] * len(negative_sentences)})

    val_end = n_train + n_val

    train_data = (
        pd.concat([pos_df[:n_train], neg_df[:n_train]])
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )
    # ignore_index avoids duplicated row labels (both classes contribute the
    # same original positions), which would make label-based lookups ambiguous.
    val_data = pd.concat([pos_df[n_train:val_end], neg_df[n_train:val_end]], ignore_index=True)
    test_data = pd.concat([pos_df[val_end:], neg_df[val_end:]], ignore_index=True)

    return train_data, val_data, test_data


In [4]:
# Feature extraction using TF-IDF
def feature_extraction(train_data, val_data, test_data, max_features=5000):
    """Turn the ``text`` column of each split into dense TF-IDF features.

    The vectorizer is fitted on the training texts only and then applied
    unchanged to the validation and test texts, so no statistics from the
    held-out splits enter the features.

    Parameters
    ----------
    train_data, val_data, test_data : pandas.DataFrame
        Frames providing a ``text`` column and a ``label`` column.
    max_features : int or None
        Value forwarded to ``TfidfVectorizer(max_features=...)``; defaults to
        the previously hard-coded 5000.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)`` where each ``X_*``
        is the dense array produced by ``.toarray()`` and each ``y_*`` is the
        ``label`` column as returned by ``.values``.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)

    # Fit on training text only; held-out splits reuse the fitted vocabulary.
    # .toarray() densifies the sparse result, which is what the row-by-row
    # SVM training loop in this notebook iterates over.
    X_train = vectorizer.fit_transform(train_data['text']).toarray()
    X_val = vectorizer.transform(val_data['text']).toarray()
    X_test = vectorizer.transform(test_data['text']).toarray()

    y_train = train_data['label'].values
    y_val = val_data['label'].values
    y_test = test_data['label'].values

    return X_train, y_train, X_val, y_val, X_test, y_test


In [6]:
def main():
    """Train the SVM on the training split and print reports for validation and test.

    Loads the three splits, vectorises them, fits the classifier and then, for
    each held-out split in turn, converts the signed predictions back to 0/1
    labels and prints a classification report.
    """
    train_df, val_df, test_df = load_data()
    X_train, y_train, X_val, y_val, X_test, y_test = feature_extraction(train_df, val_df, test_df)

    classifier = SVM(learning_rate=0.001, lambda_param=0.01, n_iters=1000)
    classifier.fit(X_train, y_train)

    held_out_splits = (
        ("Validation Results:", X_val, y_val),
        ("Test Results:", X_test, y_test),
    )
    for heading, features, targets in held_out_splits:
        signed_predictions = classifier.predict(features)
        # The model answers with -1/+1; the dataset labels are 0/1.
        predicted_labels = np.where(signed_predictions == -1, 0, signed_predictions)

        print(heading)
        print(classification_report(targets, predicted_labels, zero_division=1))


if __name__ == "__main__":
    main()


Validation Results:
              precision    recall  f1-score   support

           0       0.70      0.63      0.67       500
           1       0.67      0.73      0.70       500

    accuracy                           0.68      1000
   macro avg       0.68      0.68      0.68      1000
weighted avg       0.68      0.68      0.68      1000

Test Results:
              precision    recall  f1-score   support

           0       0.72      0.66      0.69       831
           1       0.69      0.74      0.71       831

    accuracy                           0.70      1662
   macro avg       0.70      0.70      0.70      1662
weighted avg       0.70      0.70      0.70      1662

