#  SVM Text Classification

## Data source

data link:


 https://drive.google.com/drive/folders/1--TnLVN8hLLcOnkGP6EQzrrBCMpEuGS5?usp=share_link

##  Importing Related Modules


In [1]:
import re
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report, accuracy_score

## Data preprocessing

In [2]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. Replace every character that is not a letter, digit or one of
         ( ) , ! ? ' ` with a space.
      2. Split the clitics 's, 've, n't, 're, 'd and 'll off the preceding word.
      3. Surround , ! ( ) ? with single spaces.
      4. Collapse whitespace runs, strip the ends and lower-case the result.

    Unlike the upstream version, parentheses and question marks are emitted
    as-is; upstream's replacement templates inserted a literal backslash in
    front of each of them.

    Args:
        string: raw sentence text.

    Returns:
        The cleaned, lower-cased sentence with tokens separated by single spaces.
    """
    # 1. Keep only the whitelisted characters.
    string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)

    # 2. Detach clitics. Order matches the original sequence of substitutions.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)

    # 3. Pad punctuation so each mark becomes its own token.
    string = re.sub(r"([,!()?])", r" \1 ", string)

    # 4. Normalise whitespace and case.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


In [7]:
def load_data_and_labels(positive_data_file, negative_data_file, seed=None):
    """
    Loads MR polarity data from files, cleans each sentence and generates labels.

    Each file is read as ISO-8859-1 text with one example per line. Every line
    is stripped and passed through ``clean_str``. Positive examples get label 1,
    negative examples label 0, and both arrays are shuffled with the same
    random permutation.

    Args:
        positive_data_file: path to the file holding positive examples.
        negative_data_file: path to the file holding negative examples.
        seed: optional seed for the shuffle. When ``None`` (the default) the
            global NumPy random state is used, exactly as before; otherwise a
            dedicated ``np.random.default_rng(seed)`` generator is used so the
            order is reproducible.

    Returns:
        (shuffled_x, shuffled_y): NumPy array of cleaned sentences and the
        matching NumPy array of 0/1 labels.
    """
    def _read_lines(path):
        # `with` guarantees the handle is closed even if reading fails.
        with open(path, "r", encoding='iso-8859-1') as handle:
            return [line.strip() for line in handle]

    # Load data from files
    positive_examples = _read_lines(positive_data_file)
    negative_examples = _read_lines(negative_data_file)

    # Clean every sentence; positives first so the labels below line up.
    x = np.array([clean_str(sent) for sent in positive_examples + negative_examples])

    # Generate labels
    positive_labels = [1] * len(positive_examples)
    negative_labels = [0] * len(negative_examples)
    y = np.concatenate([positive_labels, negative_labels], 0)

    # Shuffle sentences and labels with one shared permutation.
    indices = np.arange(len(y))
    if seed is None:
        shuffle_indices = np.random.permutation(indices)
    else:
        shuffle_indices = np.random.default_rng(seed).permutation(indices)

    return x[shuffle_indices], y[shuffle_indices]

## Load data:



In [8]:
# Seed NumPy's global RNG first: load_data_and_labels shuffles with
# np.random.permutation, and the train/test split below is taken from that
# shuffled order, so without a seed every run reports different metrics.
np.random.seed(42)

positive_data_file = 'data/rt-polarity.pos'
negative_data_file = 'data/rt-polarity.neg'
x, y = load_data_and_labels(positive_data_file, negative_data_file)


Preview the first five cleaned sentences and their labels:


In [9]:
# First five cleaned (and already shuffled) sentences.
x[:5]

array(["a depressingly retrograde , 'post feminist' romantic comedy that takes an astonishingly condescending attitude toward women",
       "either you 're willing to go with this claustrophobic concept or you 're not",
       "gosling 's combination of explosive physical energy and convincing intelligence helps create a complex , unpredictable character",
       "with 'bowling for columbine , ' michael moore gives us the perfect starting point for a national conversation about guns , violence , and fear",
       'this action thriller dark comedy is one of the most repellent things to pop up in a cinematic year already littered with celluloid garbage'],
      dtype='<U266')

In [10]:
# Labels for the same five sentences: 1 = positive, 0 = negative.
y[:5]


array([0, 1, 1, 1, 0])

## Train/test split

In [11]:
# Hold out the last `test_size` examples of the shuffled data for testing.
# Change the split by editing this one value (must be > 0, otherwise the
# `[:-test_size]` slice would leave the training set empty).
test_size = 2000
x_train, y_train = x[:-test_size], y[:-test_size]
x_test, y_test = x[-test_size:], y[-test_size:]

# Readable names for the integer class ids.
label_map = {0: 'negative', 1: 'positive'}


## Define the main class of the classifier, define training, and test functions.


In [14]:

class SVM_Classifier(object):
    """Linear SVM text classifier on TF-IDF features.

    Texts are vectorised with ``TfidfVectorizer``; optionally the features are
    reduced with a chi-square ``SelectKBest`` step before being passed to a
    linear-kernel ``SVC``.

    Attributes:
        LABELS: mapping from predicted class id to a readable label.
        use_chi: whether chi-square feature selection is applied.
        model: the underlying ``svm.SVC`` instance.
        feature_processor: the ``TfidfVectorizer`` used to extract features.
        feature_selector: the ``SelectKBest`` instance; only present when
            ``use_chi`` is true.
    """

    # Kept on the class so prediction does not depend on a notebook global.
    LABELS = {0: 'negative', 1: 'positive'}

    def __init__(self, use_chi=False, k=10000, C=1.0):
        """Build the (unfitted) pipeline pieces.

        Args:
            use_chi: enable chi-square feature selection.
            k: number of top-scoring features kept when ``use_chi`` is true.
            C: regularisation parameter forwarded to ``svm.SVC``.
        """
        self.use_chi = use_chi  # Whether use chi-square test for feature selection
        # SVM. `degree` and `gamma` are not passed: they only affect the
        # non-linear kernels and are ignored when kernel='linear'.
        self.model = svm.SVC(C=C, kernel='linear')
        # use tf-idf extract features
        self.feature_processor = TfidfVectorizer()
        # chi-square test for feature selection
        if use_chi:
            self.feature_selector = SelectKBest(chi2, k=k)

    def _featurize(self, texts, y=None, fit=False):
        """Turn raw texts into the feature matrix the SVM consumes.

        With ``fit=True`` the vectoriser (and the selector, if enabled) are
        fitted on ``texts``/``y`` first; otherwise the already fitted
        transformers are applied unchanged.
        """
        if fit:
            features = self.feature_processor.fit_transform(texts)
        else:
            features = self.feature_processor.transform(texts)

        if self.use_chi:
            if fit:
                features = self.feature_selector.fit_transform(features, y)
            else:
                features = self.feature_selector.transform(features)
        return features

    def fit(self, x_train, y_train, x_test, y_test):
        """Fit on the training split and print train/test metrics.

        Args:
            x_train: training texts.
            y_train: training labels (keys of ``LABELS``).
            x_test: held-out texts, used for evaluation only.
            y_test: held-out labels.

        Returns:
            None. Training accuracy, test accuracy and a classification
            report are printed.
        """
        x_train_fea = self._featurize(x_train, y_train, fit=True)
        self.model.fit(x_train_fea, y_train)

        train_accuracy = self.model.score(x_train_fea, y_train)
        print("Training Accuracy：{}".format(round(train_accuracy, 3)))

        # Test features reuse the vocabulary / selection fitted on the training set.
        x_test_fea = self._featurize(x_test)
        y_predict = self.model.predict(x_test_fea)
        test_accuracy = accuracy_score(y_test, y_predict)
        print("Test Accuracy：{}".format(round(test_accuracy, 3)))
        print('Test set evaluate：')
        target_names = [self.LABELS[idx] for idx in sorted(self.LABELS)]
        print(classification_report(y_test, y_predict, target_names=target_names))

    def single_predict(self, text):
        """Predict the label name for one piece of text.

        Args:
            text: a single sentence/document as a string.

        Returns:
            The entry of ``LABELS`` for the predicted class id.
        """
        text_fea = self._featurize([text])
        predict_idx = self.model.predict(text_fea)[0]
        # int() normalises NumPy integer scalars before the dict lookup.
        return self.LABELS[int(predict_idx)]


## Train the SVM classifier without the chi-square test. 


In [15]:
 svm_classifier = SVM_Classifier()
svm_classifier.fit(x_train, y_train, x_test, y_test)


Training Accuracy：0.936
Test Accuracy：0.762
Test set evaluate：
              precision    recall  f1-score   support

    negative       0.76      0.76      0.76       996
    positive       0.76      0.76      0.76      1004

    accuracy                           0.76      2000
   macro avg       0.76      0.76      0.76      2000
weighted avg       0.76      0.76      0.76      2000



## Train the SVM classifier with chi-square feature selection


In [16]:
# Rebinds `svm_classifier` to a model with chi-square feature selection enabled.
# The cells below use this instance: feature_analysis() reads
# svm_classifier.feature_selector, which is only created when use_chi=True.
svm_classifier = SVM_Classifier(use_chi=True)
svm_classifier.fit(x_train, y_train, x_test, y_test)

Training Accuracy：0.92
Test Accuracy：0.757
Test set evaluate：
              precision    recall  f1-score   support

    negative       0.75      0.77      0.76       996
    positive       0.77      0.74      0.75      1004

    accuracy                           0.76      2000
   macro avg       0.76      0.76      0.76      2000
weighted avg       0.76      0.76      0.76      2000



## chi-square feature analysis


In [18]:

def feature_analysis(classifier=None):
    """Rank the TF-IDF features by their chi-square score.

    Args:
        classifier: a fitted classifier exposing ``feature_processor`` (with
            ``get_feature_names_out()``) and ``feature_selector`` (with
            ``scores_``). Defaults to the notebook-level ``svm_classifier``.

    Returns:
        A list of ``(feature_name, score)`` tuples sorted by score, highest first.

    Raises:
        AttributeError: if the classifier has no ``feature_selector``, i.e. it
            was built without chi-square selection.
    """
    if classifier is None:
        classifier = svm_classifier
    if not hasattr(classifier, "feature_selector"):
        raise AttributeError(
            "classifier has no feature_selector; build it with use_chi=True"
        )

    feature_names = classifier.feature_processor.get_feature_names_out()
    feature_scores = classifier.feature_selector.scores_
    # scores_ has one entry per vectoriser feature, so the two sequences pair up.
    return sorted(
        zip(feature_names, feature_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
# Display the 500 highest-scoring (feature, chi-square score) pairs.
feature_analysis()[:500]

[('too', 28.586492680391004),
 ('bad', 25.416566933865724),
 ('dull', 12.744288853976371),
 ('moving', 11.64661597885886),
 ('boring', 10.488514911373365),
 ('no', 8.773242362253436),
 ('touching', 8.77035351849974),
 ('and', 8.601428446471616),
 ('enjoyable', 8.54944626666883),
 ('heart', 8.485753513796631),
 ('solid', 8.368057896870553),
 ('best', 8.181289066117529),
 ('wonderful', 7.930132790445738),
 ('mess', 7.908352634764329),
 ('funny', 7.606281164956832),
 ('portrait', 7.533702554611895),
 ('engrossing', 7.515779641772426),
 ('tv', 7.321278137957245),
 ('entertaining', 7.3185184400184955),
 ('worst', 7.316132850085436),
 ('flat', 7.306739179044317),
 ('just', 7.262946019029469),
 ('feels', 7.134616985666686),
 ('smart', 7.0951487603566905),
 ('powerful', 7.07786029215524),
 ('video', 6.965263078443845),
 ('performances', 6.95795894024276),
 ('warm', 6.89704175082893),
 ('so', 6.83680508715214),
 ('thoughtful', 6.7558576581979155),
 ('silly', 6.730474554497938),
 ('pretentious',

In [None]:
# NOTE(review): unused scratch cell. Every line below is commented out, so it
# has no effect on the notebook. It also could not run as written: `text_data`
# is not defined in the visible cells and `CountVectorizer` is not among the
# imports at the top. Consider deleting this cell.
# from sklearn.feature_extraction.text import TfidfVectorizer

# # create a TfidfVectorizer object
# tfidf = TfidfVectorizer()

# # fit the vectorizer to your text data
# tfidf.fit(text_data)

# # convert the TfidfVectorizer object to a CountVectorizer object
# cv = CountVectorizer(vocabulary=tfidf.vocabulary_)
# cv_arr = cv.fit_transform(text_data).toarray()

# # get the feature names
# feature_names = cv.get_feature_names()


## Single sentence test


Test the prediction result of a single sentence: 


In [19]:

# Raw text is passed directly; single_predict does not call clean_str on it.
svm_classifier.single_predict("beautiful actors, great movie")

'positive'

In [20]:
# Second spot check, again on raw (uncleaned) input text.
svm_classifier.single_predict("it's really boring")


'negative'