In [1]:
# Importing libraries for vectorization
from sklearn.feature_extraction.text import CountVectorizer
import operator

# Importing libraries for Naive Bayes Train
import pickle
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
import email.parser 
import os, sys, stat
import shutil
import re
from pathlib import Path

In [2]:
# Inputting training datasets

input_preprocessed = "./TR-mails/result3"
input_original = "./TR-mails/TR"

def getSingleDatasetFromDir ( filename ):
    if not os.path.exists(filename): # dest path doesnot exist
        print ("ERROR: input file does not exist:"), filename
        os.exit(1)
    fp = open(filename, errors="ignore")
    payload = fp.read()
    
    return payload

def getDatasetsFromDir ( srcdir ):
    datasets = []
    files = os.listdir(srcdir)
    for i in range(0,2500):
        file = "TRAIN_"+str(i+1)+".eml"
        srcpath = os.path.join(srcdir, file)
        src_info = os.stat(srcpath)
        body = getSingleDatasetFromDir (srcpath)
        datasets.append(body)
    return datasets

preprocessed_datasets = getDatasetsFromDir(input_preprocessed)
original_datasets = getDatasetsFromDir(input_original)

# Inputtting label training datasets
label_path = "./train-label.csv"
label_data = pd.read_csv(label_path)
label_data['Prediction']
label_data = np.asarray(label_data['Prediction'])

# Convert to dataframe
df_preprocessed = pd.DataFrame({'teks': preprocessed_datasets,
                  'label': label_data})
df_original = pd.DataFrame({'teks': original_datasets,
                  'label': label_data})

In [3]:
# Original Datasets
print("ORIGINAL DATASETS")

#Naive Bayes Train
x_train, x_test, y_train, y_test = train_test_split(df_original['teks'], df_original['label'], test_size = 0.1)

tfidf_clf = Pipeline([ ('count',CountVectorizer()),
                      ('tfidf',TfidfTransformer()),
                      ('clf', MultinomialNB())])
no_tfidf_clf = Pipeline([ ('count',CountVectorizer()),
                      ('clf', MultinomialNB())])

tfidf_clf.fit(x_train, y_train)
no_tfidf_clf.fit(x_train, y_train)

print("With TF-IDF: ", tfidf_clf.score(x_test,y_test))
print("Without TF-IDF: ", no_tfidf_clf.score(x_test,y_test))

ORIGINAL DATASETS
With TF-IDF:  0.876
Without TF-IDF:  0.952


In [4]:
# Preprocessed Datasets
print("PREPROCESSED DATASETS")

#Naive Bayes Train
x_train, x_test, y_train, y_test = train_test_split(df_preprocessed['teks'], df_preprocessed['label'], test_size = 0.1)

tfidf_clf = Pipeline([ ('count',CountVectorizer()),
                      ('tfidf',TfidfTransformer()),
                      ('clf', MultinomialNB())])
no_tfidf_clf = Pipeline([ ('count',CountVectorizer()),
                      ('clf', MultinomialNB())])

tfidf_clf.fit(x_train, y_train)
no_tfidf_clf.fit(x_train, y_train)

print("With TF-IDF: ", tfidf_clf.score(x_test,y_test))
print("Without TF-IDF: ", no_tfidf_clf.score(x_test,y_test))

PREPROCESSED DATASETS
With TF-IDF:  0.812
Without TF-IDF:  0.944
