# K-Nearest Neighbors (K-NN)

### 參考課程實作並在datasets_483_982_spam.csv的資料集中獲得90% 以上的 accuracy (testset)

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import codecs
import re

## Importing the dataset

In [2]:
# Read the latin-1 encoded CSV and keep only its first two columns,
# which are then named 'label' and 'content'.
dataset = pd.read_csv(r'datasets_483_982_spam.csv', encoding='latin-1').iloc[:, :2]
dataset.columns = ['label', 'content']

# Encode the text labels as integers: spam -> 1, ham -> 0.
mapping_spam = {'spam': 1, 'ham': 0}
dataset['label'] = dataset['label'].map(mapping_spam)

# Preview the first rows of the prepared frame.
dataset.head()

Unnamed: 0,label,content
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### 取出訓練內文與標註

In [3]:
# X = all_data[:,0]
# Y = all_data[:,1].astype(np.uint8)

# Pull the message texts out of the 'content' column as a plain Python list
# and show the first five of them.
X = list(dataset['content'].values)
print(f'Training Data Examples : \n{X[:5]}')

Training Data Examples : 
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'U dun say so early hor... U c already then say...', "Nah I don't think he goes to usf, he lives around here though"]


In [4]:
# print('Training Data Examples : \n{}'.format(X[:5]))

In [5]:
# Labels as an array (1 = spam, 0 = ham after the mapping applied to
# dataset['label']); show the first five.
Y = dataset['label'].values
print(f'Labeling Data Examples : \n{Y[:5]}')

Labeling Data Examples : 
[0 0 1 0 0]


In [6]:
# print('Labeling Data Examples : \n{}'.format(Y[:5]))

### 文字預處理

In [7]:
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords

import nltk

# Fetch every NLTK resource the cleaning code in this cell relies on, not just
# the stop-word list: nltk.word_tokenize needs the Punkt tokenizer models,
# nltk.pos_tag needs the averaged-perceptron tagger, and WordNetLemmatizer
# needs the WordNet corpus. Without them a fresh environment fails with
# LookupError the first time clean_content runs.
# Resource ids differ between NLTK releases (e.g. 'punkt' vs 'punkt_tab'), so
# both spellings are requested one by one; an id the installed release does
# not know only makes nltk.download() report an error and return False.
for _nltk_resource in ('stopwords',
                       'punkt', 'punkt_tab',
                       'wordnet', 'omw-1.4',
                       'averaged_perceptron_tagger',
                       'averaged_perceptron_tagger_eng'):
    nltk.download(_nltk_resource)

# Lemmatize with POS Tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# The text may be cleaned the way the course exercise does it, or in your own way.
# A single lemmatizer instance is shared by clean_content.
lemmatizer = WordNetLemmatizer()

def get_word_pos(word: str):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the upper-cased
    first letter of the resulting tag selects the WordNet category. The result
    is passed to the lemmatizer as the part of speech for that word.

    Args:
        word: the token to tag.

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB`` or ``wordnet.ADV`` when the tag starts
        with ``J``, ``V`` or ``R`` respectively; ``wordnet.NOUN`` when it starts
        with ``N`` and also for every other tag.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'R':
        return wordnet.ADV
    # 'N' and any unmapped tag both resolve to NOUN.
    return wordnet.NOUN

def clean_content(X: list) -> list:
    """Normalize raw message strings into space-joined lemma strings.

    Each message goes through these steps:
      1. every character outside ``a-z`` / ``A-Z`` is replaced by a space and
         the text is lower-cased;
      2. the text is tokenized with ``nltk.word_tokenize``;
      3. tokens found in NLTK's English stop-word list are dropped;
      4. each remaining token is lemmatized with the module-level
         ``lemmatizer``, using the part of speech from ``get_word_pos``.

    Args:
        X: iterable of message strings.

    Returns:
        A list with one cleaned string per input message, its lemmas joined by
        single spaces (an empty string when no token survives).
    """
    stop_words = set(stopwords.words('english'))

    # get_word_pos tags each word in isolation, so the lemma of a word depends
    # only on the word itself. Caching it per distinct word avoids re-running
    # the tagger and lemmatizer for every repeated occurrence.
    lemma_cache = {}

    X_output = []
    for x in X:
        # Strip symbols and digits, then lower-case before tokenizing.
        text = re.sub('[^a-zA-Z]', ' ', x).lower()
        lemmas = []
        for word in nltk.word_tokenize(text):
            if word in stop_words:
                continue
            if word not in lemma_cache:
                lemma_cache[word] = lemmatizer.lemmatize(word, get_word_pos(word))
            lemmas.append(lemma_cache[word])
        X_output.append(' '.join(lemmas))
    return X_output
                

                 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Replace the raw messages with their cleaned, lemmatized form.
# NOTE(review): X is overwritten here, so re-running this cell on its own feeds
# already-cleaned text back through clean_content; re-run the cell that builds
# X from the dataset first.
X = clean_content(X)
# Show the first two cleaned messages.
X[:2]

['go jurong point crazy available bugis n great world la e buffet cine get amore wat',
 'ok lar joking wif u oni']

In [9]:
# 確認總共有幾個字
def how_many_words(X: list) -> int:
    """Count the distinct words across all documents.

    Args:
        X: iterable of strings whose words are separated by whitespace.

    Returns:
        The number of unique words found in ``X``; 0 when ``X`` is empty or
        contains only empty documents.
    """
    total = set()
    for x in X:
        # split() with no argument never yields empty strings, so an empty
        # document or a run of consecutive spaces does not add '' to the
        # vocabulary (split(' ') would).
        total.update(x.split())
    return len(total)

# Number of distinct words in the cleaned messages.
how_many_words(X)

6540

### Bag of words

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# max_features sets how many columns (vocabulary terms) are built; the terms
# are chosen by how frequently they occur in the corpus.
# NOTE(review): the vectorizer is fitted on all of X before the train/test
# split, so the vocabulary is also learned from messages that end up in the
# test set.
cv=CountVectorizer(max_features = 3000)
# Dense count matrix: one row per message, one column per kept term.
X_BOW=cv.fit_transform(X).toarray()

In [11]:
# (number of messages, number of bag-of-words columns)
X_BOW.shape

(5572, 3000)

### TFIDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features with the same 3000-column cap as the bag-of-words matrix.
# Unlike X_BOW, the result is not converted with .toarray().
# NOTE(review): fitted on all of X before the train/test split, so the
# vocabulary and IDF weights are also learned from test messages.
vectorizer = TfidfVectorizer(max_features=3000, smooth_idf=True)
X_tfidf = vectorizer.fit_transform(X)

# (number of messages, number of TF-IDF columns)
X_tfidf.shape

(5572, 3000)

## Splitting the dataset into the Training set and Test set

In [13]:
from sklearn.model_selection import train_test_split
# 80/20 split, stratified on Y so both parts keep the spam/ham ratio.
# Both calls use the same Y, test_size, stratify and random_state, so the
# bag-of-words and TF-IDF splits are expected to hold the same messages
# (NOTE(review): confirm if the two models are compared row by row).
X_BOW_train, X_BOW_test, y_BOW_train, y_BOW_test = train_test_split(X_BOW, Y, test_size = 0.2, stratify = Y, random_state = 0)
X_TF_train, X_TF_test, y_TF_train, y_TF_test = train_test_split(X_tfidf, Y, test_size = 0.2, stratify = Y, random_state = 0)


## Training the K-NN model on the Training set

In [14]:
from sklearn.neighbors import KNeighborsClassifier
# Two identically configured 5-nearest-neighbour classifiers, one per feature
# representation. Minkowski distance with p=2 is the Euclidean distance.
classifier_BOW = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier_TF = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
# Fit each classifier on its own training split.
classifier_BOW.fit(X_BOW_train, y_BOW_train)
classifier_TF.fit(X_TF_train, y_TF_train)

KNeighborsClassifier()

In [31]:
# from sklearn.neighbors import KNeighborsClassifier
# classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
# classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

## Evaluating accuracy on the training and test sets

In [15]:
# Mean accuracy of the bag-of-words model on its training and test splits.
print(f'Trainset Accuracy: {classifier_BOW.score(X_BOW_train, y_BOW_train)}')
print(f'Testset Accuracy: {classifier_BOW.score(X_BOW_test, y_BOW_test)}')

Trainset Accuracy: 0.9452546555979359
Testset Accuracy: 0.9183856502242153


In [16]:
# Mean accuracy of the TF-IDF model on its training and test splits.
print(f'Trainset Accuracy: {classifier_TF.score(X_TF_train, y_TF_train)}')
print(f'Testset Accuracy: {classifier_TF.score(X_TF_test, y_TF_test)}')

Trainset Accuracy: 0.9356069104778999
Testset Accuracy: 0.9112107623318386


In [32]:
# print('Trainset Accuracy: {}'.format(classifier.score(X_train, y_train)))

Trainset Accuracy: 0.9421135292797846


In [33]:
# print('Testset Accuracy: {}'.format(classifier.score(X_test, y_test)))

Testset Accuracy: 0.9121076233183857


## Predicting the Test set results

In [17]:
# Predicted labels (0 = ham, 1 = spam) for the held-out test messages,
# one array per feature representation.
y_BOW_pred = classifier_BOW.predict(X_BOW_test)
y_TF_pred = classifier_TF.predict(X_TF_test)

## Making the Confusion Matrix

In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix for the bag-of-words model: rows are the true labels,
# columns the predicted labels (0 = ham, 1 = spam), so the bottom-left cell
# counts spam messages predicted as ham.
cm_BOW = confusion_matrix(y_BOW_test, y_BOW_pred)
print(cm_BOW)
# Test-set accuracy computed from the predictions.
accuracy_score(y_BOW_test, y_BOW_pred)

[[966   0]
 [ 91  58]]


0.9183856502242153

In [19]:
# Confusion matrix for the TF-IDF model: rows are the true labels, columns the
# predicted labels (0 = ham, 1 = spam).
cm_TF = confusion_matrix(y_TF_test, y_TF_pred)
print(cm_TF)
# Test-set accuracy computed from the predictions.
accuracy_score(y_TF_test, y_TF_pred)

[[965   1]
 [ 98  51]]


0.9112107623318386

In [35]:
# from sklearn.metrics import confusion_matrix, accuracy_score
# cm = confusion_matrix(y_test, y_pred)
# print(cm)
# accuracy_score(y_test, y_pred)

[[949   0]
 [ 98  68]]


0.9121076233183857