### Import Library

In [1]:
import re                         
import nltk                      
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from pylab import rcParams
from nltk.tokenize import word_tokenize # Tokenizing 
from nltk.corpus import stopwords #Remove Stopwords

nltk.download('stopwords')
nltk.download('punkt')
stopwords = stopwords.words('english') 


%matplotlib inline
rcParams['figure.figsize'] = 10, 6

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Arie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Arie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Read Dataset 

In [2]:
df = pd.read_csv('comment-spam.csv')

In [3]:
df.head()

Unnamed: 0,No,Comment,Class
0,1,this song is racist,0
1,2,and how many subscribers compared to her over ...,1
2,3,HI! CHECK OUT OUR AWESOME COVERS! AND SAY WHAT...,1
3,4,well done shakira,0
4,5,:D subscribe to me for daily vines,1


In [4]:
df['Class'].value_counts()

1    669
0    631
Name: Class, dtype: int64

### Text Preprocessing

In [5]:
def clean_data(sentence):
  sentence = sentence.lower()                                         # convert to lowercase
  sentence = sentence.strip()                                         # remove whitespaces
  sentence = re.sub(r'[-+]?[0-9]+', '', sentence)                     # remove number
  sentence = re.sub(r'https?://\S+|www\.\S+', '', sentence)           # remove URL
  sentence = re.sub(r'[^\w\s]', '', sentence)                         # remove punctuations
  words = word_tokenize(sentence)                                     # tokenize words
  words = [w for w in words if w not in stopwords]                    # remove words
  emoji_removal = re.compile("["                                      # remove emoji & symbol
      u"\U0001F600-\U0001F64F"  # emoticons
      u"\U0001F300-\U0001F5FF"  # symbols & pictographs
      u"\U0001F680-\U0001F6FF"  # transport & map symbols
      u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
      u"\U00002500-\U00002BEF"  # chinese char
                          "]+", flags=re.UNICODE)
  words = ' '.join(words)
  return emoji_removal.sub(r'', words)

In [6]:
clean_comment = df['Comment'].apply(clean_data)
df.insert(1,"clean_comment", clean_comment)
df.head()

Unnamed: 0,No,clean_comment,Comment,Class
0,1,song racist,this song is racist,0
1,2,many subscribers compared million,and how many subscribers compared to her over ...,1
2,3,hi check awesome covers say think,HI! CHECK OUT OUR AWESOME COVERS! AND SAY WHAT...,1
3,4,well done shakira,well done shakira,0
4,5,subscribe daily vines,:D subscribe to me for daily vines,1


In [7]:
!pip install -q jcopml



In [8]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer           
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.feature_importance import mean_score_decrease

from jcopml.plot import plot_confusion_matrix
from jcopml.plot import plot_roc_curve
from jcopml.plot import plot_classification_report
from jcopml.plot import plot_pr_curve

SEED = 100

import warnings
warnings.simplefilter(action='ignore')

### Read Clean Data

In [9]:
df.head()

Unnamed: 0,No,clean_comment,Comment,Class
0,1,song racist,this song is racist,0
1,2,many subscribers compared million,and how many subscribers compared to her over ...,1
2,3,hi check awesome covers say think,HI! CHECK OUT OUR AWESOME COVERS! AND SAY WHAT...,1
3,4,well done shakira,well done shakira,0
4,5,subscribe daily vines,:D subscribe to me for daily vines,1


In [10]:
df.drop('No', axis = 1, inplace = True)
df.head()

Unnamed: 0,clean_comment,Comment,Class
0,song racist,this song is racist,0
1,many subscribers compared million,and how many subscribers compared to her over ...,1
2,hi check awesome covers say think,HI! CHECK OUT OUR AWESOME COVERS! AND SAY WHAT...,1
3,well done shakira,well done shakira,0
4,subscribe daily vines,:D subscribe to me for daily vines,1


In [11]:
df.isnull().sum()

clean_comment    0
Comment          0
Class            0
dtype: int64

### Split Data Train Test

In [12]:
X = df['clean_comment']
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1040,), (260,), (1040,), (260,))

### Text representation: TF-IDF

In [13]:
N_GRAM = (1,2)  # uni-gram + bi-g

tfidf = TfidfVectorizer(ngram_range=N_GRAM)
                        
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

X_train_tfidf.shape, X_test_tfidf.shape

((1040, 7527), (260, 7527))

### Modelling

In [14]:
svm = SVC()                         
svm.fit(X_train_tfidf, y_train)     

accuracy = accuracy_score(y_train, svm.predict(X_train_tfidf))

print(f'accuracy on training set: {accuracy}')

accuracy on training set: 0.9913461538461539


In [15]:
svm_predict = svm.predict(X_test_tfidf) # SVM predict on testing set
accuracy_test = accuracy_score(y_test, svm_predict)

print(f'accuracy on testing set: {accuracy_test}')

accuracy on testing set: 0.9192307692307692


### Hyperparameter Tuning

In [16]:
# Define pipeline
PIPELINE = Pipeline([
  ('tf-idf', TfidfVectorizer(ngram_range=N_GRAM)),   # Feature extraction: TF-IDF
  ('svm', SVC(probability=True, random_state=100))  # Algoritm: SVM
])

In [17]:
# Define parameter range
PARAM_GRID = {'svm__C': [0.1, 1, 10, 100, 1000],
              'svm__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'svm__kernel': ['rbf', 'linaer', 'poly']}

In [18]:
# Define cross-validation
KFOLD = KFold(n_splits=10)

In [19]:
model = RandomizedSearchCV(PIPELINE, PARAM_GRID, cv=KFOLD, n_iter=50, n_jobs=-1, verbose=1, random_state=100)

In [20]:
model.fit(X_train, y_train)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   42.2s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   46.7s finished


RandomizedSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('tf-idf',
                                              TfidfVectorizer(ngram_range=(1,
                                                                           2))),
                                             ('svm',
                                              SVC(probability=True,
                                                  random_state=100))]),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'svm__C': [0.1, 1, 10, 100, 1000],
                                        'svm__gamma': [1, 0.1, 0.01, 0.001,
                                                       0.0001],
                                        'svm__kernel': ['rbf', 'linaer',
                                                        'poly']},
                   random_state=100, verbose=1)

In [21]:
best_params = model.best_params_
best_score  = model.best_score_
train_score = model.score(X_train, y_train)
test_score  = model.score(X_test, y_test)

print(f'Best parameter  : {best_params}')
print(f'Best score      : {best_score}')
print(f'Train score     : {train_score}')
print(f'Test score      : {test_score}')

Best parameter  : {'svm__kernel': 'rbf', 'svm__gamma': 0.001, 'svm__C': 1000}
Best score      : 0.9134615384615385
Train score     : 0.9769230769230769
Test score      : 0.9307692307692308


### Evaluation

In [22]:
plot_classification_report(X_train, y_train, X_test, y_test, model, report=True)

Train report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       509
           1       1.00      0.96      0.98       531

    accuracy                           0.98      1040
   macro avg       0.98      0.98      0.98      1040
weighted avg       0.98      0.98      0.98      1040


Test report
              precision    recall  f1-score   support

           0       0.90      0.96      0.93       122
           1       0.96      0.91      0.93       138

    accuracy                           0.93       260
   macro avg       0.93      0.93      0.93       260
weighted avg       0.93      0.93      0.93       260



### Result on Test

In [23]:
df_test = pd.DataFrame(X_test)
df_test['actual'] = y_test
df_test['prediction'] = model.predict(X_test)

df_test.head(20)

Unnamed: 0,clean_comment,actual,prediction
310,subscribe pls eminem fans,1,1
202,hey guys ignore please give chance name yuliya...,1,1
1072,ima rapper trying get notice please check mixt...,1,1
1236,love,0,0
1210,goot,0,0
655,check video youtube,1,1
229,check berzerk video channel,1,1
364,,1,0
386,check video youtube,1,1
556,please buy new christmas shirts buy time decem...,1,1


### Predict Text

In [24]:
text = input ("Enter your text:  ")
print(text)
txt = [text]
print(model.predict(txt))

Enter your text:  check video youtube
check video youtube
[1]
