### Part 1: Import Libraries and Dataset

In [1]:
# Import libraries & packages
import numpy as np                  # Import Numpy for data statistical analysis
import pandas as pd                 # Import Pandas for data manipulation using dataframes
import seaborn as sns               # Statistical data visualization
import matplotlib.pyplot as plt     # Import matplotlib for data visualisation



In [2]:
# Load the YouTube ham/spam comment datasets (taken from UCI).
# Each file holds the comments of one artist's most-viewed video.
comment_files = (
    "dataset/Youtube01-Psy.csv",          # Psy
    "dataset/Youtube02-KatyPerry.csv",    # Katy Perry
    "dataset/Youtube03-LMFAO.csv",        # LMFAO
    "dataset/Youtube04-Eminem.csv",       # Eminem
    "dataset/Youtube05-Shakira.csv",      # Shakira
)
df1, df2, df3, df4, df5 = [pd.read_csv(csv_path) for csv_path in comment_files]

In [3]:
# Merge the five per-video datasets into a single DataFrame
frames = [df1, df2, df3, df4, df5]
keys = ["Psy", "KatyPerry", "LMFAO", "Eminem", "Shakira"]   # one label per frame, same order

# Plain concatenation: rows are stacked and each file keeps its own row index.
df_merged = pd.concat(frames)

# Keyed concatenation: same rows, plus an outer index level naming the source video.
df_with_keys = pd.concat(frames, keys=keys)

# The keyed frame is the working dataset for the rest of the notebook.
dataset = df_with_keys

In [4]:
# Information about the dataset, printed one item per line:
#   1. size    - total number of cells (rows x columns)
#   2. shape   - (rows, columns)
#   3. columns - attribute names (DataFrame.keys() returns the column index)
for summary in (dataset.size, dataset.shape, dataset.columns):
    print(summary)

9780
(1956, 5)
Index(['COMMENT_ID', 'AUTHOR', 'DATE', 'CONTENT', 'CLASS'], dtype='object')


### Part 2: Data Preprocessing

In [5]:
# Keep only the columns needed for text classification:
#   CONTENT = the viewer's comment text, CLASS = ham/spam label
dataset = dataset.loc[:, ["CONTENT", "CLASS"]]

In [6]:
# Split the frame into predictor (comment text) and target (class label)
dataset_X, dataset_y = dataset['CONTENT'], dataset['CLASS']

In [7]:
# Feature Extraction from Text using  TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer   # import TF-IDF model from scikit Learn

In [8]:
# Extract features from the comment text with the TF-IDF model
corpus = dataset_X                          # the comment text, one document per row
cv = TfidfVectorizer()                      # TF-IDF vectorizer with default settings
tfidf_sparse = cv.fit_transform(corpus)     # learn vocabulary + IDF weights, then vectorize
X = tfidf_sparse.toarray()                  # dense array of TF-IDF features
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split below, so the vocabulary and IDF weights also see the test
# comments. Consider fitting on the training text only when measuring accuracy.

In [9]:
# Hold out 20% of the samples for testing; the seed is fixed so the split is repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    dataset_y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Shape of the predictor matrix after feature extraction: (documents, TF-IDF features)
X.shape

(1956, 4454)

### Part 3: Building a Model

In [11]:
# Import the model from sklearn
from sklearn.svm import SVC             # import the Support Vector Machine Classifier model

In [12]:
# Initialize the model
# Linear-kernel SVM; random_state is pinned so reruns use the same seed.
classifier = SVC(kernel='linear', random_state=0)

In [13]:
# Train the classifier on the training split (fit returns the estimator, which is displayed)
classifier.fit(X_train, y_train)

SVC(kernel='linear', random_state=0)

### Part 4: Making Predictions and Evaluating the Results

In [14]:
# Predict the class label of every sample in the held-out test split
y_pred = classifier.predict(X_test)
print(y_pred)

[0 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 1 0 0 0 1 1 1 0 1 0
 0 0 0 1 0 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 0 0 0 1 0 0 0 1 1 1 1 0 1 0 0 1
 0 0 0 1 0 1 0 1 0 0 0 1 1 1 0 1 0 1 1 0 0 1 0 0 1 1 0 0 0 1 1 1 1 1 0 1 0
 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 1 0 1 1 1 1 0 0
 1 1 1 0 0 1 0 0 1 1 0 1 0 1 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0
 0 1 0 0 0 0 1 1 0 0 0 0 1 1 1 0 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0
 1 1 0 0 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 0 0 1 0 0 0 1 1 0 1 1 1 0 0 1
 1 1 1 0 0 0 0 1 0 1 0 0 1 1 1 0 1 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0
 1 0 1 0 1 0 1 1 0 0 1 0 1 0 1 0 0 0 0 1 1 0 1 0 0 1 1 1 1 1 0 0 0 0 0 1 1
 0 1 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 1 1 0 1 1 1 1 0 1 0 0 1 0 1 1 1 0 1 0 1
 1 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 0 1 1 1 0]


In [15]:
# Build the confusion matrix for the test-set predictions
from sklearn.metrics import confusion_matrix

# The result gets its own name: assigning it to `confusion_matrix` would shadow
# the imported function, so any later call to confusion_matrix(...) would fail
# with "'numpy.ndarray' object is not callable".
# labels=[0, 1] guarantees a 2x2 matrix even if one class is absent from y_test.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(conf_matrix)

# Layout is [true class (row), predicted class (column)]:
#   [[TN, FP],
#    [FN, TP]]
# ravel() flattens row by row, giving the four counts in that order.
TN, FP, FN, TP = conf_matrix.ravel()

[[178   5]
 [ 15 194]]


In [16]:
# Evaluate the test-set predictions with several classification metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    mean_absolute_error,
    roc_auc_score,
)

# Accuracy: share of test samples classified correctly
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy Score:', accuracy)

# Precision: TP / (TP + FP)
precision = precision_score(y_test, y_pred)
print('Precision Score:', precision)

# True Positive Rate (sensitivity / recall): TP / (TP + FN)
true_positive_rate = recall_score(y_test, y_pred)
print('True positive Rate:', true_positive_rate)

# False Positive Rate: FP / (TN + FP), taken from the confusion-matrix counts
actual_negatives = TN + FP
false_positive_rate = FP / float(actual_negatives)
print('False positive Rate', false_positive_rate)

# F1 score (F-measure): harmonic mean of precision and recall
f1 = f1_score(y_test, y_pred)
print('F1 Score:', f1)

# Specificity: TN / (TN + FP), i.e. 1 - false positive rate
specificity = TN / actual_negatives
print('Specificity:', specificity)

# Mean absolute error between the true and predicted labels
mae = mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error:', mae)

# ROC area
# NOTE(review): this is computed from the hard predictions in y_pred, not from
# continuous scores such as classifier.decision_function(X_test), so it
# reflects a single operating point rather than the full ROC curve.
roc_area = roc_auc_score(y_test, y_pred)
print('ROC Area:', roc_area)

Accuracy Score: 0.9489795918367347
Precision Score: 0.9748743718592965
True positive Rate: 0.9282296650717703
False positive Rate 0.0273224043715847
F1 Score: 0.9509803921568627
Specificity: 0.9726775956284153
Mean Absolute Error: 0.05102040816326531
ROC Area: 0.9504536303500928


### Part 5: Save, Load and Use the Model

In [17]:
# import pickle library 
import pickle               # pickle used for serializing and de-serializing a Python object structure

In [18]:
# Save the trained model (serialization using pickle).
# The `with` block closes the file even if pickle.dump raises, so the handle is
# never leaked and a partially written file is not left open.
with open("model.pkl", "wb") as model_file:
    pickle.dump(classifier, model_file)

In [19]:
# Load the model back (deserialization using pickle).
# The `with` block closes the file as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code while loading - only open
# model files that you created yourself or otherwise trust.
with open("model.pkl", "rb") as model_file:
    new_model = pickle.load(model_file)
new_model

SVC(kernel='linear', random_state=0)

In [20]:
# Use the reloaded model to classify a new comment
comment = ["Hey Music Fans I really appreciate all of you,but see this song too"]
# Vectorize with `cv`, the TF-IDF vectorizer fitted earlier in this notebook.
# NOTE(review): only the classifier was pickled; `cv` exists in memory only, so a
# fresh session would also need the fitted vectorizer to use model.pkl.
vect = cv.transform(comment).toarray()
new_model.predict(vect)

array([1], dtype=int64)

In [21]:
# Print a readable label for each prediction (1 -> "Spam", anything else -> "Ham").
# Iterating over the predictions works for any number of comments, whereas
# `if array == 1` raises "truth value of an array ... is ambiguous" as soon as
# more than one comment is classified.
for predicted_label in new_model.predict(vect):
    print("Spam" if predicted_label == 1 else "Ham")

Spam
