# K-Nearest Neighbors (K-NN)

### 參考課程實作並在datasets_483_982_spam.csv的資料集中獲得90% 以上的 accuracy (testset)

## Importing the libraries

In [1]:
# Mount Google Drive at /content/drive (this import is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd './drive/My Drive/NLP/day20'

/content/drive/My Drive/NLP/day20


In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import codecs
import re

## Importing the dataset

In [4]:
# Load the dataset, keeping only the first two columns (label and message text).
dataset = pd.read_csv('datasets_483_982_spam.csv', encoding='latin-1', usecols=[0, 1])

dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Number of rows (messages) in the dataset.
print(len(dataset.index))

5572


In [6]:
# Build an (n_rows, 2) array of [message text, spam flag] using vectorised
# column operations instead of a per-row .iloc loop.
labels = dataset.iloc[:, 0]
messages = dataset.iloc[:, 1].str.strip()

# 'ham' maps to 0; every other label is treated as spam (1).
spam_flags = (labels != 'ham').astype(int)

# Mixing str and int in one array makes NumPy store both columns as strings,
# so the flag column has to be cast back to an integer type when it is used.
all_data = np.array(
    [[text, flag] for text, flag in zip(messages.tolist(), spam_flags.tolist())]
)

### 取出訓練內文與標註

In [7]:
# Column 0 holds the message text; column 1 holds the spam flag, which is
# converted to unsigned 8-bit integers.
X, Y = all_data[:, 0], all_data[:, 1].astype(np.uint8)

In [8]:
# Preview the first five message texts.
text_preview = X[:5]
print('Training Data Examples : \n{}'.format(text_preview))

Training Data Examples : 
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 'U dun say so early hor... U c already then say...'
 "Nah I don't think he goes to usf, he lives around here though"]


In [9]:
# Preview the first five labels (0 = ham, 1 = spam).
label_preview = Y[:5]
print('Labeling Data Examples : \n{}'.format(label_preview))

Labeling Data Examples : 
[0 0 1 0 0]


### 文字預處理

In [10]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix

# Download the NLTK resources requested here: stopwords, punkt,
# averaged_perceptron_tagger and wordnet.
nltk.download(['stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'])

# Lemmatize with POS Tag: one shared lemmatizer instance for the helpers below.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
  """Return the WordNet part-of-speech constant for a single word.

  The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
  letter of the resulting tag selects adjective ("J"), noun ("N"), verb ("V")
  or adverb ("R"). Any other tag falls back to noun.
  """
  _, tag = nltk.pos_tag([word])[0]
  first_letter = tag[0].upper()

  if first_letter == "J":
    return wordnet.ADJ
  if first_letter == "V":
    return wordnet.VERB
  if first_letter == "R":
    return wordnet.ADV
  # "N" and every unrecognised tag both resolve to noun.
  return wordnet.NOUN


def clean_content(X):
  """Normalize raw message texts before vectorization.

  For each text: every character that is not an ASCII letter is replaced by a
  space, the text is lowercased and tokenized, English stopwords are dropped,
  and the remaining tokens are lemmatized with the POS returned by
  ``get_wordnet_pos``.

  Args:
    X: iterable of strings.

  Returns:
    A list with one cleaned, space-joined string per input text, in the same
    order as the input.
  """
  # Loop-invariant: build the stopword set once instead of once per token.
  english_stopwords = set(stopwords.words('english'))
  # get_wordnet_pos tags each token on its own, so a token's lemma depends only
  # on the token and can be reused whenever the same token appears again.
  lemma_cache = {}

  X_output = []
  for x in X:
    # remove non-alphabet characters
    text = re.sub('[^a-zA-Z]', ' ', x).lower()

    content_clean = []
    for word in nltk.word_tokenize(text):
      if word in english_stopwords:
        continue
      if word not in lemma_cache:
        lemma_cache[word] = lemmatizer.lemmatize(word, get_wordnet_pos(word))
      content_clean.append(lemma_cache[word])

    X_output.append(' '.join(content_clean))

  return X_output


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# Replace the raw texts in X with their cleaned versions.
# NOTE(review): X is overwritten, so re-running this cell feeds already-cleaned
# text back into clean_content.
X = clean_content(X)

### Bag of words

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# max_features sets how many columns are built; words are selected by how
# frequently they appear.
# NOTE(review): the vocabulary is fitted on every row before the train/test
# split is made - confirm this is intended.
cv = CountVectorizer(max_features=1500)
bag_of_words = cv.fit_transform(X)
X = bag_of_words.toarray()

In [13]:
# Shape of the bag-of-words matrix: (number of messages, number of features).
X.shape

(5572, 1500)

## Splitting the dataset into the Training set and Test set

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

## Training the K-NN model on the Training set

In [15]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours classifier on the training split.
classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Last expression of the cell, so the fitted estimator is displayed.
classifier

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

## Evaluating accuracy on the training and test sets

In [16]:
# Mean accuracy of the fitted classifier on the training split.
train_accuracy = classifier.score(X_train, y_train)
print('Trainset Accuracy: {}'.format(train_accuracy))

Trainset Accuracy: 0.942562261610949


In [17]:
# Mean accuracy of the fitted classifier on the held-out test split.
test_accuracy = classifier.score(X_test, y_test)
print('Testset Accuracy: {}'.format(test_accuracy))

Testset Accuracy: 0.9147982062780269


## Predicting the Test set results

In [18]:
# Predicted labels for every row of the test split.
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of true test labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Last expression of the cell, so the accuracy is shown as the cell output.
accuracy_score(y_test, y_pred)

[[949   0]
 [ 95  71]]


0.9147982062780269