# Arabic Sentiment using Tensorflow

Text -> Numeric representation -> Model
# First Approach using Vector Space Model VSM
in two steps:
* document vector and remove stopword
* convert document vector to numbers by:
1. Term Frequency TF
2. Inverse Document Frequency IDF

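$\mathrm{IDF} = \log\left(\frac{N}{\mathrm{DF}}\right) + 1$, where $N$ is the number of documents and $\mathrm{DF}$ is the number of documents that contain the term.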


first approach has some limitations:
1. dictionary size (dimensions) can become huge
2. context not preserved

The solution is to use the Universal Sentence Encoder (USE); it will be the second approach.

    Generating features from Text by: Vector space model (VSM)
    Including Inverse Document frequency (IDF)

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import regularizers
import operator
import math
from functools import reduce
from sklearn.model_selection import train_test_split
import os
import json


In [None]:
# Load the Arabic review dataset; later cells read its 'text' and 'label' columns.
df = pd.read_csv('../input/reviews/5556-ar-reviews.csv')
# Preview the first rows to check that the file parsed as expected.
df.head()

In [None]:
# Relative frequency of each label value (normalize=True gives proportions, not counts).
df['label'].value_counts(normalize=True)

In [None]:
# Shuffle all rows (frac=1 samples the whole frame). No random_state is set,
# so the row order differs between runs.
df = df.sample(frac = 1) 

# Process Data

In [None]:
stopwords = {'فإذا', 'أنى', 'بمن', 'حتى', 'لم', 'أنتما', 'هناك', 'تينك', 'بل', 'إي', 'عن', 'ولكن', 'وإذا', 'دون', 'إنا', 'إذن', 'بكم', 'حين', 'عند', 'هل', 'إلا', 'هاته', 'ذينك', 'اللواتي', 'كذا', 'لستما', 'هي', 'اللتان', 'أكثر', 'كلتا', 'لكن', 'ليستا', 'هكذا', 'عسى', 'إذ', 'إن', 'اللاتي', 'إذا', 'بهم', 'نحن', 'فيما', 'ذاك', 'بكن', 'بيد', 'لهن', 'هذي', 'كأي', 'ذوا', 'أي', 'كلاهما', 'هذين', 'أينما', 'كي', 'إليكن', 'ماذا', 'هيا', 'هنالك', 'بي', 'بما', 'تلكما', 'بعض', 'بهن', 'تين', 'ريث', 'على', 'غير', 'حيثما', 'كأن', 'بخ', 'هاتان', 'هاهنا', 'ما', 'هيهات', 'لدى', 'شتان', 'لسنا', 'كيفما', 'مع', 'ممن', 'كما', 'إنما', 'يا', 'عليه', 'لك', 'ذه', 'ذان', 'لهما', 'ليست', 'لنا', 'مه', 'أنتن', 'في', 'لولا', 'بس', 'لها', 'أقل', 'عليك', 'فلا', 'مهما', 'ليسا', 'ذين', 'ذات', 'كلما', 'ذا', 'ذو', 'فيه', 'تي', 'هنا', 'هاتين', 'ها', 'هم', 'ألا', 'لا', 'سوى', 'وإذ', 'كم', 'لست', 'حيث', 'إليكما', 'لوما', 'الذين', 'كلا', 'التي', 'كأين', 'ذواتي', 'لستم', 'هذا', 'فمن', 'ذلكم', 'وما', 'كيف', 'لكم', 'حاشا', 'بك', 'والذي', 'أن', 'لهم', 'لسن', 'ثمة', 'ذي', 'وإن', 'ومن', 'أيها', 'له', 'متى', 'بلى', 'اللتين', 'لستن', 'بكما', 'قد', 'كليكما', 'لكما', 'هلا', 'آي', 'لكنما', 'اللذين', 'اللائي', 'ذلكن', 'لاسيما', 'ذلك', 'مذ', 'اللتيا', 'هما', 'إليك', 'سوف', 'منها', 'والذين', 'أنتم', 'هاتي', 'لكي', 'اللذان', 'ذواتا', 'عما', 'فيها', 'إلى', 'تلك', 'كل', 'لي', 'هو', 'فيم', 'إليكم', 'بها', 'ذانك', 'إنه', 'هؤلاء', 'أولئك', 'إذما', 'بنا', 'من', 'خلا', 'ليسوا', 'ثم', 'لعل', 'وهو', 'نحو', 'أين', 'لئن', 'عدا', 'آه', 'كأنما', 'كليهما', 'الذي', 'لن', 'نعم', 'هذه', 'بهما', 'ليت', 'تلكم', 'أما', 'منذ', 'أو', 'هاك', 'بماذا', 'كذلك', 'أنا', 'آها', 'فإن', 'عل', 'منه', 'هيت', 'أف', 'أم', 'إيه', 'كيت', 'ته', 'لكيلا', 'ليس', 'مما', 'هذان', 'أنت', 'حبذا', 'ولو', 'أوه', 'إما', 'لو', 'بين', 'به', 'ولا', 'لما', 'بعد', 'هن', 'ذلكما', 'أولاء','و'}

# Upper bound on the vocabulary size: the frequency-sorted dictionary is
# truncated to this many tokens before features are computed.
maxDictionaryLength = 8000

def tokenize(sentence, isCreateDict=False):
    """Split a sentence into lower-cased tokens and drop stopwords.

    Tokens are produced by splitting on whitespace only, so punctuation that
    is attached to a word stays part of that token.

    When ``isCreateDict`` is true the call also updates two module-level
    accumulators that must already exist:

    * ``dictionary_dict`` -- token -> total number of occurrences seen so far.
    * ``documentTokens`` -- one token list per tokenized document.

    Args:
        sentence: Text to tokenize.
        isCreateDict: Whether this call is building the corpus statistics.

    Returns:
        The list of tokens that are not in ``stopwords``.
    """
    # str.split() with no separator never yields empty strings, so no extra
    # length check is needed.
    tokens = [token for token in sentence.lower().split() if token not in stopwords]

    if isCreateDict:
        for token in tokens:
            dictionary_dict[token] = dictionary_dict.get(token, 0) + 1
        # Only corpus-building calls may extend the document list. Recording
        # it on every call would let encoder()/prediction calls silently grow
        # the corpus that the IDF weights are computed from.
        documentTokens.append(tokens)
    return tokens


def getInverseDocumentFrequency(documentTokens, dictionary):
    """Compute the inverse document frequency of every dictionary term.

    The weight of a term is ``1 + log(N / DF)``, where ``N`` is the number of
    documents and ``DF`` is the number of documents containing the term at
    least once.

    Args:
        documentTokens: Sequence of documents, each a list of string tokens.
        dictionary: Ordered sequence of terms to compute weights for.

    Returns:
        A list of floats aligned with ``dictionary``. A term that occurs in
        no document gets ``0.0`` (instead of a division by zero), so it
        contributes nothing to the TF-IDF features.
    """
    documentCount = len(documentTokens)

    # Count document frequencies in a single pass over the corpus instead of
    # rescanning every document for every dictionary term.
    documentFrequency = {}
    for tokens in documentTokens:
        # set() makes a term count once per document, however often it repeats.
        for token in set(tokens):
            documentFrequency[token] = documentFrequency.get(token, 0) + 1

    weights = []
    for word in dictionary:
        containing = documentFrequency.get(word, 0)
        if containing:
            weights.append(1 + math.log(documentCount / containing))
        else:
            weights.append(0.0)
    return weights


  
def encoder(sentence, dictionary, idfs):
    """Turn a raw sentence into its TF-IDF feature vector.

    The sentence is tokenized, its tokens are counted against ``dictionary``,
    and each count is scaled by the matching entry of ``idfs``.

    Args:
        sentence: Text to encode.
        dictionary: Ordered vocabulary that defines the feature positions.
        idfs: IDF weights aligned with ``dictionary``.

    Returns:
        A list of TF-IDF values, one per dictionary term.
    """
    termFrequencies = getTermFrequency(tokenize(sentence), dictionary)
    return getTfIdf(termFrequencies, idfs)


def getTermFrequency(tokens, dictionary):
    """Count how often each dictionary term occurs in ``tokens``.

    Args:
        tokens: Tokens of a single document.
        dictionary: Ordered vocabulary that defines the output positions.

    Returns:
        A list of integer counts aligned with ``dictionary``; terms that do
        not occur in ``tokens`` get 0.
    """
    # Tally the document once, then look each term up, rather than rescanning
    # the whole token list for every dictionary entry.
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    return [counts.get(word, 0) for word in dictionary]



def getTfIdf(tfs, idfs):
    """Multiply term frequencies by their IDF weights, position by position.

    Args:
        tfs: Term frequencies of one document.
        idfs: IDF weights in the same order as ``tfs``.

    Returns:
        A list of products; if the inputs differ in length, the result stops
        at the end of the shorter one.
    """
    return list(map(operator.mul, tfs, idfs))



**Sample Test** Code used in the slides ( Module : preparing data for machine learning model )

In [None]:
# Small worked example: build the vocabulary, IDF weights and TF-IDF vectors
# for two comments and print every intermediate result.
dictionary_dict = {}
documentTokens = []
testComments = ['للراحة عنوان . كل شي. لا شي', 'شيء جميل']

for comment in testComments:
    # tokenize(..., True) already records the document in documentTokens;
    # appending its return value as well would store every document twice.
    tokenize(comment, True)


# Vocabulary ordered from the most to the least frequent token.
dictionary = sorted(dictionary_dict, key=dictionary_dict.get, reverse=True)
idfs = getInverseDocumentFrequency(documentTokens, dictionary)

tfidfs = []

for comment in testComments:
    tfidfs.append(encoder(comment, dictionary, idfs))

print(dictionary_dict)
print(dictionary)
print(idfs)
print(tfidfs)

In [None]:
# Reset the global accumulators, then tokenize every review. Passing True makes
# tokenize() update dictionary_dict and documentTokens as a side effect.
dictionary_dict = {}
documentTokens = []
df['tokens'] = df['text'].apply(lambda x : tokenize(x, True))

In [None]:
# Preview the frame, which now includes the 'tokens' column.
df.head()

In [None]:
# Vocabulary ordered from the most to the least frequent token, then cut to
# the maxDictionaryLength most frequent entries.
dictionary = sorted(dictionary_dict, key=dictionary_dict.get, reverse=True)
dictionary = dictionary[:maxDictionaryLength]
print('Length of dictionary : {0}'.format(len(dictionary)))
print(dictionary[:10])

In [None]:
# One IDF weight per dictionary term, computed over all tokenized reviews.
idfs = getInverseDocumentFrequency(documentTokens, dictionary)
# Displays the number of weights, which equals the dictionary length.
len(idfs)

In [None]:
# Encode every review text as a TF-IDF list aligned with the dictionary.
df['features'] = df['text'].apply(lambda x : encoder(x,dictionary, idfs))
df['features'].head()

In [None]:
# Expand the per-row feature lists into one numeric column per dictionary term.
# Building the frame from the list of rows in a single call avoids creating a
# separate pd.Series for every review, which is very slow at this width.
# Reusing df.index keeps the rows aligned with df['label'] below.
df_new = pd.DataFrame(df['features'].tolist(), index=df.index)
df_new['label'] = df['label']

# Train Test Split

In [None]:
# Hold out 20% of the rows for testing, then 10% of the remainder for
# validation. No random_state is passed, so the split changes between runs.
train, test = train_test_split(df_new, test_size=0.2)
train, val = train_test_split(train, test_size=0.1)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

In [None]:
# (rows, columns) of each split; the column count includes the 'label' column.
print(train.shape, test.shape, val.shape)

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=16):
    """Wrap a feature/label DataFrame in a batched ``tf.data.Dataset``.

    Args:
        dataframe: Frame with a 'label' column; all other columns are used
            as features. The caller's frame is not modified.
        shuffle: Whether to shuffle the examples before batching.
        batch_size: Number of examples per batch.

    Returns:
        A dataset yielding ``(features, labels)`` batches.
    """
    features = dataframe.copy()
    # pop() removes the label column from the copy, leaving only features.
    targets = features.pop('label')
    dataset = tf.data.Dataset.from_tensor_slices((features.values, targets))
    if not shuffle:
        return dataset.batch(batch_size)
    # A buffer as large as the data set shuffles across all examples.
    return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

In [None]:
# Batched datasets for each split; only the training set is shuffled.
batch_size = 100
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [None]:
# Width of the model input: one feature per dictionary term.
numOfFeatures = len(dictionary)

# Build Model

In [None]:
def get_build_model():
    """Build and compile the binary sentiment classifier.

    The network takes ``numOfFeatures`` inputs (a module-level value), passes
    them through two 15-unit ReLU layers, each followed by 30% dropout, and
    ends in a single sigmoid unit.

    Returns:
        A compiled ``tf.keras.Sequential`` model that reports accuracy.
    """
    stack = [
        tf.keras.layers.Dense(15, activation='relu', input_shape=(numOfFeatures,)),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(15, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]
    model = tf.keras.Sequential(stack)

    adam = tf.keras.optimizers.Adam(learning_rate=0.01)
    # The last layer already applies a sigmoid, so the loss receives
    # probabilities rather than logits.
    bce = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    model.compile(optimizer=adam, loss=bce, metrics=['accuracy'])
    return model

In [None]:
# Build the network, print its layer summary, and train for 20 epochs while
# monitoring the validation set.
model = get_build_model()
model.summary()
model.fit(train_ds,epochs=20 ,validation_data=val_ds)

# Evaluate Model

In [None]:
# Loss and accuracy on the held-out test set.
model.evaluate(test_ds)

# Make Predictions

In [None]:
## make predictions
testComments = ['صباح الخيرات', 'للراحة عنوان . كل شي. لا شي']
# Encode each comment with the training vocabulary and IDF weights.
tfidfs = [encoder(comment, dictionary, idfs) for comment in testComments]
# Pass an explicit 2-D array (samples x features): depending on the Keras
# version, a nested Python list can be interpreted as a list of separate
# model inputs instead of a batch of rows.
features = np.array(tfidfs)
# Predict once and reuse the result for both printouts.
probabilities = model.predict(features)
print(f'predicted probabliities : {probabilities}')
print(f'predicted classes : {tf.round(probabilities)}')

# Export Model

In [None]:
# Save the trained model to a single file in the working directory.
model.save('ar_reviews.h5')

In [None]:
# Write the dictionary and IDF weights as JSON files -- presumably so the same
# feature encoding can be rebuilt wherever the saved model is used.
# ensure_ascii=False keeps the Arabic tokens as readable UTF-8 text instead of
# \uXXXX escapes.

with open('dictionary.json', 'w', encoding='utf-8') as outfile:
    json.dump(dictionary, outfile,  ensure_ascii=False, indent=4)

with open('idfs.json', 'w', encoding='utf-8') as outfile:
    json.dump(idfs, outfile, ensure_ascii=False, indent=4)

# Random Forest Classifier (sklearn.ensemble.RandomForestClassifier)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 180 trees trained on the same TF-IDF features;
# n_jobs=-1 lets scikit-learn use all available CPU cores.
rf_model = RandomForestClassifier(n_jobs=-1,n_estimators=180)
# Every column except 'label' is a feature.
rf_model.fit(train.loc[:, train.columns != 'label'], train['label'])

In [None]:
# Mean accuracy on the training split (the data the forest was fitted on).
rf_model.score(train.loc[:, train.columns != 'label'], train['label'])

# Testing

In [None]:
# Mean accuracy on the held-out test split.
rf_model.score(test.loc[:, test.columns != 'label'],test['label'])