In [1]:
# import the necessary packages
import os
import re
import string
import math
import numpy as np
from decimal import Decimal
import pandas as pd
import nltk
from nltk.stem.isri import ISRIStemmer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize,RegexpTokenizer

In [2]:
# import the dataset as xlsx sheet 
dataset = pd.read_excel('dataset.xlsx', encoding ='utf-8-sig')

In [3]:
dataset.shape

(312, 3)

**There are 312 reviews**

In [4]:
dataset.isnull().values.any()

False

**There are no null values in the dataset to drop**

In [5]:
dataset.head()

Unnamed: 0,ID,Reviews,Category
0,1,مسرور جدا من الشراء من موقع فورديل ساشتري مرة ...,Fake
1,2,قصير جدا جدا لن اعيد التعامل معكم فورديل,Fake
2,3,اجمالا سعيد بالمنتج,Fake
3,4,بضاعة جميلة بسعر مناسب,Fake
4,5,عمل رائع جدا,Fake


In [6]:
# Convert catigorical to numerical
dataset['Category'].replace(('Fake', 'Real'),(1, 0),inplace = True)

In [7]:
dataset.head()


Unnamed: 0,ID,Reviews,Category
0,1,مسرور جدا من الشراء من موقع فورديل ساشتري مرة ...,1
1,2,قصير جدا جدا لن اعيد التعامل معكم فورديل,1
2,3,اجمالا سعيد بالمنتج,1
3,4,بضاعة جميلة بسعر مناسب,1
4,5,عمل رائع جدا,1


In [8]:
# Split the data set into x features the and y target
x=dataset.iloc[:,1]
y=dataset.iloc[:,2]

In [9]:
# Creates a dictionary of x 
# A dictionary is a collection which is unordered, changeable and indexed.
X=x.to_dict()

# Preprocessing

In [10]:
# Download the stop word from nl toolkit
nltk.download('stopwords');

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asmam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# import the arabic stop words
stop_words = set(stopwords.words('arabic')) 
stop_words;

### 1. Tokanizing and Remove Stopwords  

In [12]:
# Tokanizing the sentences
# And remove STOPWORDs
filtered_sentence = [] 
tokenizer = RegexpTokenizer(r'\w+')
for sen in x:
    example_sent = tokenizer.tokenize(sen)
    S = [w for w in example_sent if not w in stop_words] 
    filtered_sentence.append(S)    

### Example  

The Review before Tokenize and remove Stopwords <br>

**Original**

In [13]:
x[9]

'صديقي يبدو مذهلا في ذلك و أود أن أوصي هذا بائع'

**After Tokenization** 

In [14]:
tokenizer.tokenize(x[9])

['صديقي',
 'يبدو',
 'مذهلا',
 'في',
 'ذلك',
 'و',
 'أود',
 'أن',
 'أوصي',
 'هذا',
 'بائع']

> **The review is split into words**

**After Removing Stopwords** 

In [15]:
filtered_sentence[9]

['صديقي', 'يبدو', 'مذهلا', 'و', 'أود', 'أوصي', 'بائع']

>**The words في & ذلك & أن & هذا have been removed as they are stopwords** 


## 2. Stemming 

In [16]:
#ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary.
st = ISRIStemmer()
Stemmed = []
for sen in filtered_sentence:
    stemer = []
    for word in sen:
        stemer.append(st.stem(word))
    Stemmed.append(stemer)   

### Example  

The review after removing stopwords <br>

**After Removing Stopwords**

In [17]:
filtered_sentence[9]

['صديقي', 'يبدو', 'مذهلا', 'و', 'أود', 'أوصي', 'بائع']

**After Stemming**

In [18]:
Stemmed[9]

['صدق', 'بدو', 'ذهل', 'و', 'اود', 'اوص', 'بئع']

> **The above example show how ISRIStemmer work** 

In [19]:
# Convert the stemmer to string 
data_string = []
for p in range(len(Stemmed)):
# initialize an empty string 
    str1 = ""  
    # traverse in the string   
    for ele in Stemmed[p]:  
        str1 += ele+ " "
    data_string.append(str1)

In [20]:
# Convert the stemmer to dictionary
dictOfWords = { i : data_string[i] for i in range(0, len(data_string) )}

# Training and Testing

In [21]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing 
# with 30% testing and 80% training 
X_train, X_test, y_train, y_test = train_test_split(dictOfWords, y, test_size = 0.2, random_state = 0)

In [22]:
# Vectorize the dataset 
count_vect=CountVectorizer()
X_train_counts=count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

### The random over sampler because the dataset is imbalanced 

In [23]:
from imblearn.over_sampling import RandomOverSampler
sm=RandomOverSampler()
X_train_res, y_train_res = sm.fit_sample(X_train_tfidf, y_train)

In [24]:
unique, counts = np.unique(y_train_res, return_counts=True)
print(list(zip(unique, counts)))

[(0, 201), (1, 201)]


## SVM Model

In [59]:
from sklearn.svm import SVC

clf= SVC(kernel='rbf', gamma=0.1, C=1)
clf.fit(X_train_res, y_train_res)
print('The Training Accuracy', clf.score(X_train_res, y_train_res))
X_test_tfidf=count_vect.transform(X_test)
y_pred=clf.predict(X_test_tfidf)

from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
Accuracy_Score = accuracy_score(y_test, y_pred)
print('\nThe Testing Accuracy', Accuracy_Score)
from sklearn.metrics import f1_score
print('\nThe F1 Score', f1_score(y_test, y_pred))

The Training Accuracy 0.8930348258706468

The Testing Accuracy 0.8412698412698413

The F1 Score 0.6666666666666667


In [60]:
from sklearn import metrics
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.93      0.86      0.90        50
          1       0.59      0.77      0.67        13

avg / total       0.86      0.84      0.85        63

