# CS4248 Project - SVM classifier

### Import dependencies

In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
import time
import csv
import os
import sys
from nltk.corpus import stopwords
from sklearn import svm
from sklearn.metrics import f1_score, classification_report

### Feature Engineering
Tools to read and encode data

In [2]:
class FeatureEngineer:
    def __init__(self, tfidf=True, number_of_words=False, number_of_exclamatory=False):
        self.tfidf = tfidf
        self.number_of_words = number_of_words
        self.number_of_exclamatory = number_of_exclamatory
        self.tfidf_vectorizer = TfidfVectorizer(max_df=0.85,min_df=0.15)
    
    def encode(self, process_input_filename=None, original_input_filename=None, output_filename=None, istrain=True):
        data, labels = self.read_data(process_input_filename, col1='text', col2='label')
        odata, olabels = self.read_data(original_input_filename, col1=1, col2=0, header=False)
        combination = []
        if self.tfidf:
            tf = self.tf(data, labels, istrain)
            combination.append(tf)
        if self.number_of_words:
            combination.append(self.numofwords(data))
        if self.number_of_exclamatory:
            combination.append(self.numofexclam(odata))
            
        matrix = [[] for _ in range(len(data))]
        for k in range(len(combination)):
            for i in range(len(combination[k])):
                for j in combination[k][i]:
                    matrix[i].append(j)   
        self.generate(matrix, labels, output_filename)
        return tf, labels
    
    def generate(self, matrix, labels, output_filename):
        with open(output_filename, mode='w', newline='') as file:
            writer = csv.writer(file)
            fieldnames = ['id'] + [('d' + str(i)) for i in range(1, len(matrix[0]) + 1)] + ['label']
            writer.writerow(fieldnames)
            for i, row in enumerate(matrix):
                new_row = [i]
                for r in row:
                    new_row.append(r)
                new_row.append(labels[i])
                writer.writerow(new_row)
    
    def read_data(self, filename, col1, col2, header=True):
        if not header:
            raw_data = pd.read_csv(filename, header=None)
        else:
            raw_data = pd.read_csv(filename)
        data,labels=raw_data[col1].tolist(), raw_data[col2].tolist()
        return data, labels
    
    def tf(self, data, labels, istrain):
        if istrain:
            tfidf_matrix = self.tfidf_vectorizer.fit_transform(data)
            tfidf_matrix = tfidf_matrix.toarray()
        else:
            tfidf_matrix = self.tfidf_vectorizer.transform(data)
            tfidf_matrix = tfidf_matrix.toarray()
        return tfidf_matrix
    
    def numofwords(self, data):
        res = [[len(data[i].split())] for i in range(len(data))]
        return res
    
    def numofexclam(self, data):
        res = [[0] for i in range(len(data))]
        for i in range(len(data)):
            for ch in data[i]:
                if ch == '!':
                    res[i][0] += 1
        return res

### Processing the data and encoding training and testing sets

In [None]:
fe = FeatureEngineer(tfidf=True, number_of_words=True, number_of_exclamatory=True)

In [5]:
train_tfidf, train_labels = fe.encode(process_input_filename='process_train.csv', original_input_filename='../dataset/fulltrain.csv', output_filename='../dataset/feature_train.csv')

In [6]:
test_tfidf, test_labels = fe.encode(process_input_filename='process_test.csv', original_input_filename='../dataset/balancedtest.csv', output_filename='../dataset/feature_test.csv',istrain=False)

### Training the SVM model with several kernel methods
The best results are provided using `kernel='rbf'`

In [16]:
SVM = svm.SVC(kernel='rbf')
print('Started SVM training ...')
SVM.fit(train_tfidf, train_labels)
print('Started SVM testing ...')
predictions = SVM.predict(test_tfidf)
print('F1 score :', f1_score(predictions, test_labels, average='macro'))

Started SVM training ...
Started SVM testing ...
F1 score : 0.5218411741284723


### Testing the model with F1-score

In [11]:
TEXT_LABELS = {0: 'Satire', 1: 'Hoax', 2: 'Propaganda', 3: 'Reliable'}
class_names = list(TEXT_LABELS.values())
print(classification_report(y_pred, test_labels, target_names=class_names))

              precision    recall  f1-score   support

      Satire       0.60      0.56      0.57       805
        Hoax       0.16      0.49      0.25       251
  Propaganda       0.68      0.52      0.59       985
    Reliable       0.78      0.61      0.68       959

    accuracy                           0.55      3000
   macro avg       0.55      0.54      0.52      3000
weighted avg       0.64      0.55      0.58      3000



### Training and testing the model with other `kernel` values

In [9]:
SVM = svm.SVC(kernel='linear')
print('Started SVM training ...')
SVM.fit(train_tfidf, train_labels)
print('Started SVM testing ...')
predictions = SVM.predict(test_tfidf)
print('F1 score :', f1_score(predictions, test_labels, average='macro'))

Started SVM training ...
Started SVM testing ...
F1 score : 0.4812646952702081


In [10]:
SVM = svm.SVC(kernel='poly')
print('Started SVM training ...')
SVM.fit(train_tfidf, train_labels)
print('Started SVM testing ...')
predictions = SVM.predict(test_tfidf)
print('F1 score :', f1_score(predictions, test_labels, average='macro'))

Started SVM training ...
Started SVM testing ...
F1 score : 0.5174559993447944


In [11]:
SVM = svm.SVC(kernel='sigmoid')
print('Started SVM training ...')
SVM.fit(train_tfidf, train_labels)
print('Started SVM testing ...')
predictions = SVM.predict(test_tfidf)
print('F1 score :', f1_score(predictions, test_labels, average='macro'))

Started SVM training ...
Started SVM testing ...
F1 score : 0.40418482283010543
