In [215]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

In [216]:
final_file = pd.read_csv('cleaned_data - Copy.csv')

In [217]:
final_file

Unnamed: 0,review,polarity,cleaned_text
0,تطبيق جيد، بسيط وفعال لي أنصح بشدة لجميع مالكي...,1,تطبيق جيد بسيط وفعال انصح بشده لجميع مالكي الح...
1,فعال,1,فعال
2,أوصي بشدة جميع مالكي الحساب في هذا البنك الجيد...,1,اوصي بشده مالكي الحساب البنك الجيد
3,بنك جيد جدا!,1,بنك جيد
4,علة سيئة!,-1,عله سيئه
5,سيئ,1,سيئ
6,تطبيق رائع، عملي جدا. أصبح كل شيء بسيطا، ويعمل...,1,تطبيق رائع عملي شيء بسيطا ويعمل جيدا بالنسبه
7,تطبيق رائع,1,تطبيق رائع
8,مريحة للغاية.,1,مريحه للغايه
9,انها تعمل جيدة جدا,1,تعمل جيده


In [218]:
sentiment_map = pd.read_csv('sentiment_dict.csv')
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

In [219]:
sentiment_map

Unnamed: 0,words,sentiment_coeff
0,بنك,0.999916
1,جيد,1.013212
2,تطبيق,1.013912
3,التطبيق,1.010318
4,انا,-1.005417
...,...,...
192,بدون,1.011055
193,مزيانه,-1.019680
194,المعامله,-1.004579
195,اخري,-1.009889


Getting tfidf scores of words in every sentence, and replacing them with their associated tfidf weights:

In [220]:
file_weighting = final_file.copy()

In [221]:
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split())
tfidf.fit(file_weighting.cleaned_text)
features = pd.Series(tfidf.get_feature_names())
transformed = tfidf.transform(file_weighting.cleaned_text)



Replacing words in sentences with their tfidf scores

In [222]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score
    
    inspired  by function from this wonderful article: 
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
    
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    '''
    vector_coo = transformed_file[x.name].tocoo()
    vector_coo.col = features.iloc[vector_coo.col].values
    dict_from_coo = dict(zip(vector_coo.col, vector_coo.data))
    return dict_from_coo

def replace_tfidf_words(x, transformed_file, features):
    '''
    replacing each word with it's calculated tfidf dictionary with scores of each word
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    dictionary = create_tfidf_dictionary(x, transformed_file, features)   
    return list(map(lambda y:dictionary[f'{y}'], x.cleaned_text.split()))

In [223]:
%%time
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)#this step takes around 3-4 minutes minutes to calculate

Wall time: 14.9 ms


Replacing words in sentences with their sentiment score

In [224]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    replacing each word with its associated sentiment score from sentiment dict
    '''
    try:
        out = sentiment_dict[word]
    except KeyError:
        out = 0
    return out

In [225]:
replaced_closeness_scores = file_weighting.cleaned_text.apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

Merging both previous steps and getting the predictions:

In [226]:
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting.cleaned_text, file_weighting.polarity]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'sentiment']
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')
replacement_df['sentiment'] = [1 if i==1 else 0 for i in replacement_df.sentiment]

In [227]:
replacement_df.head(20)

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment,sentiment_rate,prediction
0,"[1.0139124167197244, 1.0132120073662652, -1.01...","[0.2232184255077722, 0.23433302303580217, 0.33...",تطبيق جيد بسيط وفعال انصح بشده لجميع مالكي الح...,1,0.716683,1
1,[-1.0073278883848795],[1.0],فعال,1,-1.007328,0
2,"[1.008691511042577, 1.0092453457030903, -1.009...","[0.4560417459400615, 0.41058781041664155, 0.41...",اوصي بشده مالكي الحساب البنك الجيد,1,0.803564,1
3,"[0.999915678911855, 1.0132120073662652]","[0.7679001940809359, 0.640569505932386]",بنك جيد,1,1.416868,1
4,"[0, 1.014571942989931]","[0.7071067811865476, 0.7071067811865476]",عله سيئه,0,0.717411,1
5,[1.0079834031163],[1.0],سيئ,1,1.007983,1
6,"[1.0139124167197244, 0.9979524416492116, 0.994...","[0.25992663351798156, 0.35499247884115925, 0.3...",تطبيق رائع عملي شيء بسيطا ويعمل جيدا بالنسبه,1,0.254175,1
7,"[1.0139124167197244, 0.9979524416492116]","[0.5907705533091341, 0.8068396081891489]",تطبيق رائع,1,1.404177,1
8,"[1.0126443338861348, -1.0139532660965966]","[0.7905071379756432, 0.6124528265993695]",مريحه للغايه,1,0.179504,1
9,"[1.0006874920522506, -1.0106585928970864]","[0.7071067811865476, 0.7071067811865476]",تعمل جيده,1,-0.007051,0


Reporting model's metrics

In [228]:
predicted_classes = replacement_df.prediction
y_test = replacement_df.sentiment


test_scores = accuracy_score(y_test, predicted_classes),precision_score(y_test, predicted_classes), recall_score(y_test, predicted_classes), f1_score(y_test, predicted_classes)

scores = pd.DataFrame(data=[test_scores])
scores.columns = ['accuracy', 'precision', 'recall', 'f1']
scores = scores.T
scores.columns = ['scores unsupervised method']
display(scores)

Unnamed: 0,scores unsupervised method
accuracy,0.714286
precision,0.818182
recall,0.818182
f1,0.818182


In [229]:
import codecs
def get_stop_words():
    path = r"C:/Users/belfa/sentiment2/stop-words-list.txt"
    stop_words = []
    with codecs.open(path, "r", encoding="utf-8", errors="ignore") as myfile:
        stop_words = myfile.readlines()
    stop_words = [word.strip() for word in stop_words]
    return stop_words
def remove_stp_words(text):
    text_words = []
    words = text.split(" ")
    stop_words = get_stop_words()
    for word in words:
        if word not in stop_words and len(word)>2:
            text_words.append(word)
    return ' '.join(text_words)
#Stemmer_LIGHT : Remove suffixes and affixes 
def light_stemming(text):
    text_words = []
    words = text.split(" ")
    for c in words:
        stem = stemmer.stem(c)
        text_words.append(stem)
    return ' '.join(text_words)
def tokenization(text):
    return set(word_tokenize(text)) 

In [230]:
file = pd.read_csv("Data.csv")
file_cleaned = file.dropna().drop_duplicates().reset_index(drop=True).rename(columns={'banque':'review'})

In [231]:
def clean_text(text):  
    # ref: https://github.com/bakrianoo/aravec
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى",
              "\\",'\n', '\t','&quot;','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا",
               "","","","ي","",' ', ' ',' ',' ? ',' ؟ ', ' ! ']
    
    tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(tashkeel,"", text)
    
    longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(longation, subst, text)
    
    text = re.sub(r"[^\w\s]", '', text)
    text = re.sub(r"[a-zA-Z]", '', text)
    text = re.sub(r"\d+", ' ', text)
    text = re.sub(r"\n+", ' ', text)
    text = re.sub(r"\t+", ' ', text)
    text = re.sub(r"\r+", ' ', text)
    text = re.sub(r"\s+", ' ', text)
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
    
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
    text = text.strip()
    text=remove_stp_words(text)
    #text=light_stemming(text)
    text=tokenization(text)
    return text 

In [232]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict


import pyarabic.araby as araby

from tashaphyne.stemming import ArabicLightStemmer
import unicodedata
from nltk import  word_tokenize
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

In [233]:
for index,entry in enumerate(file_cleaned['review']):

    Final_words = []
    Final_words=clean_text(entry)
    file_cleaned.loc[index,'cleaned_text'] = str(Final_words)
#file_cleaned['cleaned_text']= file_cleaned['review'].apply(clean_text)

In [234]:
file_cleaned=file_cleaned[:50]

In [235]:
from sklearn import model_selection
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(file_cleaned['cleaned_text'],file_cleaned['polarity'],test_size=0.3)

In [236]:
from sklearn.preprocessing import LabelEncoder
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [237]:
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(file_cleaned['cleaned_text'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
#print(Tfidf_vect.vocabulary_)

In [238]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

predicted_classes = replacement_df.prediction
y_test = replacement_df.sentiment


test_scores = accuracy_score(y_test, predicted_classes),precision_score(y_test, predicted_classes), recall_score(y_test, predicted_classes), f1_score(y_test, predicted_classes)

scores = pd.DataFrame(data=[test_scores])
scores.columns = ['accuracy', 'precision', 'recall', 'f1']
scores = scores.T
scores.columns = ['scores unsupervised method']
display(scores)


SVM = SVC()
SVM.fit(Train_X_Tfidf,Train_Y)

predictions_SVM = SVM.predict(Test_X_Tfidf)
#print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y))

test_sc = accuracy_score(predictions_SVM, Test_Y),precision_score(predictions_SVM, Test_Y), recall_score(predictions_SVM, Test_Y), f1_score(predictions_SVM, Test_Y)


scores1 = pd.DataFrame(data=[test_sc])
scores1.columns = ['accuracy', 'precision', 'recall', 'f1']
scores1 = scores1.T
scores1.columns = ['scores supervised method']
display(scores1)

Unnamed: 0,scores unsupervised method
accuracy,0.714286
precision,0.818182
recall,0.818182
f1,0.818182


Unnamed: 0,scores supervised method
accuracy,0.8
precision,1.0
recall,0.8
f1,0.888889
