<a href="https://colab.research.google.com/github/abhinavmishra-0302/NLP_Assignment/blob/main/NLP_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import necessary libraries

In [None]:
import os
import tarfile
import urllib.request
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.svm import SVC

## Download and Preprocess the Dataset

In [None]:
# Download the rt-polarity archive only when it is not already on disk, so
# re-running the cell does not hit the network again.
dataset_url = "https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz"
archive_path = "rt-polaritydata.tar.gz"
if not os.path.exists(archive_path):
    # Fetch to a temporary name and rename afterwards: an interrupted download
    # must not leave a truncated file that later runs would mistake for the
    # complete archive.
    partial_path = archive_path + ".part"
    urllib.request.urlretrieve(dataset_url, partial_path)
    os.replace(partial_path, archive_path)

# Extract the dataset into the current working directory.
with tarfile.open(archive_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # The archive comes from the network: the "data" filter refuses
        # members that would be written outside the destination directory.
        tar.extractall(filter="data")
    else:
        # Older Python without extraction filters; fall back to the original
        # unfiltered behaviour.
        tar.extractall()

# Load data
def load_sentences(file_path):
    """Read a latin-1 encoded text file and return its lines as a list.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: One entry per line, each still carrying its line ending.
    """
    with open(file_path, mode='r', encoding='latin-1') as handle:
        # Iterating a text file yields the same lines that readlines() returns.
        lines = list(handle)
    return lines

# Locations of the extracted review files
pos_file = 'rt-polaritydata/rt-polarity.pos'
neg_file = 'rt-polaritydata/rt-polarity.neg'

# Read both polarity files, one sentence per line
positive_sentences = load_sentences(pos_file)
negative_sentences = load_sentences(neg_file)

# Single frame holding every sentence; label 1 = positive, 0 = negative
data = pd.DataFrame({
    'text': positive_sentences + negative_sentences,
    'label': [1] * len(positive_sentences) + [0] * len(negative_sentences),
})

# Filter each class once and slice the resulting views below
positive_rows = data[data['label'] == 1]
negative_rows = data[data['label'] == 0]

# Per class: first 4000 rows -> training, next 500 -> validation, rest -> test.
# ignore_index=True gives each split a fresh 0..n-1 index.
train_data = pd.concat(
    [positive_rows.iloc[:4000], negative_rows.iloc[:4000]],
    ignore_index=True,
)
validation_data = pd.concat(
    [positive_rows.iloc[4000:4500], negative_rows.iloc[4000:4500]],
    ignore_index=True,
)
test_data = pd.concat(
    [positive_rows.iloc[4500:], negative_rows.iloc[4500:]],
    ignore_index=True,
)

print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(validation_data)}")
print(f"Test set size: {len(test_data)}")

Training set size: 8000
Validation set size: 1000
Test set size: 1662


## Feature Extraction using TF-IDF

In [None]:
# TF-IDF features: English stop words removed, vocabulary capped at 5000 terms
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# The vocabulary and IDF weights are learned from the training text only ...
X_train = tfidf_vectorizer.fit_transform(train_data['text'])

# ... and then applied unchanged to the validation and test text.
X_validation, X_test = (
    tfidf_vectorizer.transform(split['text'])
    for split in (validation_data, test_data)
)

# Target labels for each split
y_train, y_validation, y_test = (
    split['label'] for split in (train_data, validation_data, test_data)
)

print("TF-IDF feature extraction complete.")

TF-IDF feature extraction complete.


## Build and train the model

### 1. Logistic Regression Model

In [None]:
# Base estimator. The liblinear solver supports both the L1 and L2 penalties
# searched below. random_state pins the solver's internal data shuffling so
# repeated runs give the same model (the SVC in this notebook is seeded the
# same way).
base_model = LogisticRegression(solver='liblinear', random_state=42)

# Hyperparameter grid for optimization
param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']  # L1 for Lasso, L2 for Ridge
}

# 5-fold cross-validated grid search, selecting on F1 and using all CPU cores
grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# retrained on the full training set with the winning parameters; a second
# explicit fit would only repeat that work.
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters: {'C': 1, 'penalty': 'l2'}


### 2. SVM Model

In [None]:
# Parameter grid, split per kernel. gamma is a kernel coefficient that the
# linear kernel does not use, so crossing it with 'linear' would only fit
# identical models twice. Listing the kernels separately searches the same
# distinct models with 12 candidates instead of 16.
param_grid_svm = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],  # Regularization parameter
    },
    {
        'kernel': ['poly'],
        'C': [0.1, 1, 10, 100],  # Regularization parameter
        'gamma': ['scale', 'auto'],  # Kernel coefficient
    },
]

# Define Support Vector Classifier
svm_model = SVC(random_state=42)

# 5-fold cross-validated grid search, selecting on F1 and using all CPU cores
grid_search_svm = GridSearchCV(svm_model, param_grid_svm, scoring='f1', cv=5, n_jobs=-1, verbose=2)
grid_search_svm.fit(X_train, y_train)

# best_estimator_ is already refit on the full training set (refit=True default)
best_svm_model = grid_search_svm.best_estimator_
print(f"Best SVM Hyperparameters: {grid_search_svm.best_params_}")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best SVM Hyperparameters: {'C': 1, 'gamma': 'scale', 'kernel': 'poly'}


## Evaluate on Validation and Test Sets

### 1. Logistic Regression

In [None]:
# Compute binary-classification metrics for a fitted model (nothing is printed here)
def evaluate_model(model, X, y_true):
    """Score a fitted binary classifier on one dataset.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y_true: True labels; this notebook encodes them as 0 (negative) and
            1 (positive).

    Returns:
        dict: Keys "TP", "TN", "FP", "FN" (plain ints) and "Precision",
        "Recall", "F1-Score", "Accuracy" (plain floats), in that order.
    """
    y_pred = model.predict(X)
    # Pin the label order so the matrix is always 2x2. Without it, data where
    # only one class appears yields a 1x1 matrix and the 4-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    # Convert NumPy scalars to built-in numbers so the dict prints cleanly.
    return {
        "TP": int(tp),
        "TN": int(tn),
        "FP": int(fp),
        "FN": int(fn),
        "Precision": float(precision),
        "Recall": float(recall),
        "F1-Score": float(f1),
        "Accuracy": float(accuracy),
    }

# Score the tuned logistic-regression model on both held-out splits
validation_results = evaluate_model(best_model, X_validation, y_validation)
test_results = evaluate_model(best_model, X_test, y_test)

# Report validation first, then test
print(f"Validation Results: {validation_results}")
print(f"Test Results: {test_results}")

Validation Results: {'TP': 364, 'TN': 399, 'FP': 101, 'FN': 136, 'Precision': 0.7827956989247312, 'Recall': 0.728, 'F1-Score': 0.7544041450777202, 'Accuracy': 0.763}
Test Results: {'TP': 619, 'TN': 613, 'FP': 218, 'FN': 212, 'Precision': 0.7395459976105138, 'Recall': 0.7448856799037304, 'F1-Score': 0.7422062350119905, 'Accuracy': 0.7412755716004813}


### 2. SVM Model

In [None]:
# evaluate_model is already defined in the Logistic Regression evaluation cell
# above. It is reused here rather than redefined: a second identical copy would
# silently shadow the first and let the two drift apart when one is edited.

# Evaluate on validation set
validation_results_svm = evaluate_model(best_svm_model, X_validation, y_validation)
print(f"Validation Results (SVM): {validation_results_svm}")

# Evaluate on test set
test_results_svm = evaluate_model(best_svm_model, X_test, y_test)
print(f"Test Results (SVM): {test_results_svm}")

Validation Results (SVM): {'TP': 364, 'TN': 397, 'FP': 103, 'FN': 136, 'Precision': 0.7794432548179872, 'Recall': 0.728, 'F1-Score': 0.7528438469493278, 'Accuracy': 0.761}
Test Results (SVM): {'TP': 614, 'TN': 604, 'FP': 227, 'FN': 217, 'Precision': 0.7300832342449465, 'Recall': 0.7388688327316486, 'F1-Score': 0.7344497607655502, 'Accuracy': 0.7328519855595668}


## Save and Report Results

In [None]:
# Save results to a CSV file for easy reporting.
# Only the logistic-regression results (validation_results / test_results) are
# written; the SVM results are reported by the print statements above.
import csv

# Metric columns in output order; row values are looked up by these names so
# they cannot drift out of alignment with the header.
result_columns = ["TP", "TN", "FP", "FN", "Precision", "Recall", "F1-Score", "Accuracy"]

# newline="" lets the csv module control line endings itself; without it,
# platforms that translate "\n" on write get a blank line after every row.
with open("optimized_evaluation_results.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Dataset"] + result_columns)
    writer.writerow(["Validation"] + [validation_results[column] for column in result_columns])
    writer.writerow(["Test"] + [test_results[column] for column in result_columns])

print("Optimized evaluation results saved to 'optimized_evaluation_results.csv'.")

Optimized evaluation results saved to 'optimized_evaluation_results.csv'.
