# NLP : Metode SVM
---
Yogi Afrizah (G1A017012)

Deri Afrianda (G1A017021)



In [7]:
import pandas as pd
import re
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## Dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
dataset = 'drive/MyDrive/Colab Notebooks/Amazon_Unlocked_Mobile.csv'
data = pd.read_csv(dataset)
data.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [5]:
# ambil hanya pada brand samsung
data = data.loc[data['Brand Name'] == 'Samsung', 'Brand Name':'Reviews']
# ambil hanya 1000 data
data = data.iloc[0:1000]
# ambil hanya feature Reviews dan Rating
data = data[['Reviews', 'Rating']]
# hapus data mis
data.dropna(inplace=True)

In [6]:
data.head()

Unnamed: 0,Reviews,Rating
0,I feel so LUCKY to have found this used (phone...,5
1,"nice phone, nice up grade from my pantach revu...",4
2,Very pleased,5
3,It works good but it goes slow sometimes but i...,4
4,Great phone to replace my lost phone. The only...,4


## Preprocessing


In [None]:
# melabelkan data menjadi 3 kategori
def pelabelan(rate):
  if rate < 3:
    return 'negatif'
  else:
    return 'positif'

data['Label'] = data['Rating'].apply(pelabelan)
data.head()

Unnamed: 0,Reviews,Rating,Label
0,I feel so LUCKY to have found this used (phone...,5,positif
1,"nice phone, nice up grade from my pantach revu...",4,positif
2,Very pleased,5,positif
3,It works good but it goes slow sometimes but i...,4,positif
4,Great phone to replace my lost phone. The only...,4,positif


In [8]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
data['Reviews'][1]

'nice phone, nice up grade from my pantach revue. Very clean set up and easy set up. never had an android phone but they are fantastic to say the least. perfect size for surfing and social media. great phone samsung'

In [None]:
lemma = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def CleanReview(txt):
  txt = re.sub(r'http\S+', ' ', txt)                                                 # hapus url
  txt = re.sub('[^a-zA-Z]','  ', txt)                                                 # hapus teks tidak relevan (karakter selain a-z)
  txt = str(txt).lower()                                                                # jadikan huruf kecil semua
  txt = word_tokenize(txt)                                                         # tokenize
  txt = [item for item in txt if item not in stop_words]           # hapus stop words ('saya' 'aku' 'kamu' dll)
  # txt = [stemmer.stem(i) for i in txt]                                         # stemming : pemotongan akhir kata
  txt = [lemma.lemmatize(word=w,pos='v') for w in txt]        # lemmatization : mengembalikan ke bentuk dasar kata
  txt = [i for i in txt if len(i) > 2]                                                # hapus kata yang kurang dari dua huruf/karakter
  txt = ' '.join(txt)                                                                        # penggabungan setiap token menjadi kalimat utuh
  return txt

data['CleanReview'] = data['Reviews'].apply(CleanReview)

In [None]:
data.head()

Unnamed: 0,Reviews,Rating,Label,CleanReview
0,I feel so LUCKY to have found this used (phone...,5,positif,feel lucky find use phone use hard phone line ...
1,"nice phone, nice up grade from my pantach revu...",4,positif,nice phone nice grade pantach revue clean set ...
2,Very pleased,5,positif,please
3,It works good but it goes slow sometimes but i...,4,positif,work good slow sometimes good phone love
4,Great phone to replace my lost phone. The only...,4,positif,great phone replace lose phone thing volume bu...


## Perform SVM

In [None]:
# split x dan y
x = data['CleanReview']
y = data['Label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
x_train

183409    irresponsible send another article one want reach
183310            good phone expect work well straight talk
182838    wonderful phone far best phone ever load crap ...
184772    great phone large display size screen however ...
183421                    complain much mean happy purchase
                                ...                        
183447    time get make call laeft thatbut right even ye...
182826                           would buy phone total junk
184870    samsung galaxy ace phone look cover like bigge...
183159    far good wife phone trouble picture videos als...
183331    specifically buy version galaxy rather newer c...
Name: CleanReview, Length: 799, dtype: object

In [None]:
# perform count vectorizer
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [None]:
# x_train
x_train = vectorizer.transform(x_train)
x_test = vectorizer.transform(x_test)

In [None]:
x_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [None]:
for c in [0.01, 0.05, 0.25, 0.5, 0.75,  1]:
  svm = LinearSVC(C=c)
  svm.fit(x_train, y_train)
  print('Akurasi untuk c = %s: %s' %(c, accuracy_score(y_test, svm.predict(x_test))))


Akurasi untuk c = 0.01: 0.865
Akurasi untuk c = 0.05: 0.87
Akurasi untuk c = 0.25: 0.865
Akurasi untuk c = 0.5: 0.86
Akurasi untuk c = 0.75: 0.86
Akurasi untuk c = 1: 0.865


In [None]:
svm = LinearSVC(C = 0.05)
svm.fit(x_train, y_train)

LinearSVC(C=0.05, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [None]:
print('Accuracy score model final: %s ' %accuracy_score(y_test, svm.predict(x_test)))

Accuracy score model final: 0.87 


## Evaluasi Model

In [None]:
y_pred = svm.predict(x_test)
print('Accuracy of SVM classifier on test set: {:.2f}'.format(svm.score(x_test, y_test)))

confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)
print(classification_report(y_test, y_pred))

Accuracy of SVM classifier on test set: 0.87
[[ 13  17]
 [  9 161]]
              precision    recall  f1-score   support

     negatif       0.59      0.43      0.50        30
     positif       0.90      0.95      0.93       170

    accuracy                           0.87       200
   macro avg       0.75      0.69      0.71       200
weighted avg       0.86      0.87      0.86       200

