In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from scipy.sparse import csr_matrix, hstack
from imblearn.over_sampling import SMOTE

# Colab-only step: attach Google Drive so files below the mount point can be read and written.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Folder on the mounted Drive that holds the train/test CSV files.
dataset_dir = '/content/gdrive/MyDrive/Essentials in Text and Speech Processing'

# Read the four CSVs in a fixed order and unpack them into their conventional names.
x_train, x_test, y_train, y_test = (
    pd.read_csv(os.path.join(dataset_dir, csv_name))
    for csv_name in ('x_train.csv', 'x_test.csv', 'y_train.csv', 'y_test.csv')
)

## Preprocessing

The text columns (`title`, `company_profile`, `description`, `requirements`) are stored as plain text rather than as pre-computed TF-IDF vectors, so they are cleaned and vectorized here.

In [3]:
# Count nulls per column in both feature splits.
missing_values_train = x_train.isna().sum()
missing_values_test = x_test.isna().sum()

# Show only the columns that actually contain at least one null.
for split_name, null_counts in (
    ("x_train", missing_values_train),
    ("x_test", missing_values_test),
):
    print(f"Missing values in {split_name}:\n{null_counts[null_counts > 0]}")

Missing values in x_train:
requirements    3
dtype: int64
Missing values in x_test:
description     1
requirements    3
dtype: int64


In [4]:
# Replace nulls with empty strings in the text columns that were reported as
# incomplete, so later string concatenation does not yield NaN rows.
for frame, columns_with_gaps in (
    (x_train, ('requirements',)),
    (x_test, ('requirements', 'description')),
):
    for column in columns_with_gaps:
        frame[column] = frame[column].fillna('')

In [5]:
# Free-text columns that are merged into a single document per row.
TEXT_COLUMNS = ('title', 'company_profile', 'description', 'requirements')


def combine_text_columns(frame, columns=TEXT_COLUMNS):
    """Join the given text columns of ``frame`` into one space-separated string per row.

    Missing values are treated as empty strings: with plain ``+`` concatenation a
    single NaN in any column turns the whole row into NaN, which the vectorizer
    cannot process.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing every column named in ``columns``.
    columns : sequence of str
        Columns to concatenate, in order.

    Returns
    -------
    pandas.Series
        One combined string per row, indexed like ``frame``.
    """
    return frame[list(columns)].fillna('').astype(str).agg(' '.join, axis=1)


vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vocabulary on the training text only; the test text is merely transformed.
x_train_tfidf = vectorizer.fit_transform(combine_text_columns(x_train))
x_test_tfidf = vectorizer.transform(combine_text_columns(x_test))

# NOTE: x_train / x_test are rebound from the raw frames to dense TF-IDF frames
# here, so this cell cannot be re-run without reloading the CSVs first.
feature_names = vectorizer.get_feature_names_out()
x_train = pd.DataFrame(x_train_tfidf.toarray(), columns=feature_names)
x_test = pd.DataFrame(x_test_tfidf.toarray(), columns=feature_names)

print(x_train.shape, x_test.shape)

(14304, 5000) (3576, 5000)


## Oversampling

In [6]:
# Flatten the label containers to 1-D arrays. np.asarray accepts both the
# DataFrames loaded from CSV and already-flattened arrays, so re-running this
# cell does not fail the way `.values.ravel()` does on an ndarray.
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()

# Oversample the minority class on the training split only; the test split
# keeps its original class distribution.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# Check the new class distribution. tolist() yields native Python scalars so the
# printed dict does not depend on NumPy's scalar repr.
unique, counts = np.unique(y_train_resampled, return_counts=True)
print("Class distribution after SMOTE:", dict(zip(unique.tolist(), counts.tolist())))

Class distribution after SMOTE: {0: 13619, 1: 13619}


## SVM Training & Evaluation

In [7]:
# Fit a linear-kernel support vector classifier on the oversampled training data.
# TODO: compare against other kernels.
svm_model = svm.SVC(kernel='linear').fit(x_train_resampled, y_train_resampled)

# Predict labels for the original (non-resampled) test set.
y_pred = svm_model.predict(x_test)

## Results

In [8]:
# Overall accuracy plus weighted-average precision / recall / F1.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score_fn(y_test, y_pred, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Per-class report in two forms: text for saving, dict for full-precision printing.
report = classification_report(y_test, y_pred)
report_dict = classification_report(y_test, y_pred, output_dict=True)

conf_matrix = confusion_matrix(y_test, y_pred)

In [9]:
def _full_precision(value):
    """Format ``value`` positionally (no scientific notation) with up to 10 decimals."""
    return np.format_float_positional(value, precision=10)


print("Accuracy:", _full_precision(accuracy))
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("Classification Report (Full Precision):")
for label, metrics in report_dict.items():
    # Scalar entries (not dicts) are printed on a single line.
    if not isinstance(metrics, dict):
        print(f"{label}: {_full_precision(metrics)}")
        continue
    # Dict entries get one line per metric they contain.
    for metric_name, value in metrics.items():
        print(f"{label} - {metric_name}: {_full_precision(value)}")

print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9829418345
Precision: 0.9827
Recall: 0.9829
F1 Score: 0.9828
Classification Report (Full Precision):
0 - precision: 0.9900058789
0 - recall: 0.9920471281
0 - f1-score: 0.9910254524
0 - support: 3395.
1 - precision: 0.8448275862
1 - recall: 0.8121546961
1 - f1-score: 0.8281690141
1 - support: 181.
accuracy: 0.9829418345
macro avg - precision: 0.9174167326
macro avg - recall: 0.9021009121
macro avg - f1-score: 0.9095972332
macro avg - support: 3576.
weighted avg - precision: 0.9826576488
weighted avg - recall: 0.9829418345
weighted avg - f1-score: 0.9827824392
weighted avg - support: 3576.
Confusion Matrix:
 [[3368   27]
 [  34  147]]


## Pickle

In [10]:
output_dir = '/content/gdrive/MyDrive/Essentials in Text and Speech Processing/Results'

# Create the results folder if needed so the writes below cannot fail with
# FileNotFoundError on a fresh Drive.
os.makedirs(output_dir, exist_ok=True)

# Save the fitted SVM model.
# NOTE: only unpickle these files from a trusted location; loading a pickle can
# execute arbitrary code.
with open(os.path.join(output_dir, 'svm_model.pkl'), 'wb') as model_file:
    pickle.dump(svm_model, model_file)

# Save the fitted TF-IDF vectorizer alongside the model: the classifier expects
# features produced by this exact vocabulary, so it cannot be applied to new
# text without it.
with open(os.path.join(output_dir, 'tfidf_vectorizer.pkl'), 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Save the evaluation results as plain text (explicit encoding so the file does
# not depend on the platform default).
with open(os.path.join(output_dir, 'svm_classification_report.txt'), 'w', encoding='utf-8') as report_file:
    report_file.write("Classification Report:\n")
    report_file.write(report + "\n")

    report_file.write("Accuracy:\n")
    report_file.write(f"{accuracy:.4f}\n\n")

    # One confusion-matrix row per line, values separated by single spaces.
    report_file.write("Confusion Matrix:\n")
    for row in conf_matrix:
        report_file.write(" ".join(map(str, row)) + "\n")

print("Model and classification report have been saved.")

Model and classification report have been saved.
