In [66]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import mysql.connector
from mysql.connector import errorcode, connect
import re
from nltk.tokenize import word_tokenize
from nltk.stem.isri import ISRIStemmer
import itertools
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [67]:
def connectMysql():
    try:
        cnn = connect(
            user='root', 
            password='root', 
            host='localhost', 
            database='test',
            unix_socket='/Applications/MAMP/tmp/mysql/mysql.sock')
        return cnn
    except mysql.connector.Error as e:
        if e.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print ("Access denied!")
        elif e.errno == errorcode.ER_BAD_DB_ERROR:
            print ("Database does not exist.")
        else:
            print (e)

In [50]:
def getData(sql):
    cnn = connectMysql()
    cursor = cnn.cursor()
    cursor.execute(sql)
    reviews = DataFrame(cursor.fetchall())
    reviews.rename(columns={0:'text', 1:'label'}, inplace=True)
    cursor.close()
    cnn.close()
    return reviews

In [51]:
sql ="SELECT * FROM `REVIEWS`"
reviews = getData(sql)
len(reviews)

8340

In [52]:
reviews

Unnamed: 0,text,label
0,اولا: المنيو تغير الشورما اصبحت اعتياديه بأختف...,-1
1,من محلات الشاورما ذات الشعبيه لتميز الصلصات ال...,-1
2,دجاج طازج يحضر امامك على الطلب لا يقوم باعدة ا...,1
3,فكما تعرف أستراليا بالكنغر والكوالا. فإنها تعر...,1
4,إسمحو لي أن أقيم مطعم هاشم بصفتي فلسطيني عشت ف...,-1
5,دعاية المطعم وسمعته أكبر بكثير من مستوى وجباته...,-1
6,انا من 5 سنوات كنت اكل فيه \n كان طعم هاشم الم...,-1
7,مقبول واسعاره جيدة \n بالنسبة للفلافل والحمص ...,1
8,جربت نوع محشي فلفل ماعجبني مررة كان نيئ,-1
9,المطعم. رائع والأكل نظيف وش هالحسد اللهم يا ...,1


# Preprocessing

In [53]:
def noramlize(text):
    text = re.sub(r"[إأٱآا]", "ا", text)
    text = re.sub(r"ى", "ي", text)
    text = re.sub(r"ؤ", "ء", text)
    text = re.sub(r"ئ", "ء", text)
    text = re.sub(r'[^ا-ي ]', "", text)

    noise = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    text = re.sub(noise, '', text)
    return text

In [54]:
# Normalize Arabic Text @ http://maximromanov.github.io/2013/01-02.html

In [55]:
def stopWordRmove(text):
    ar_stop_list = open("ar_stop_word_list.txt", "r")
    stop_words = ar_stop_list.read().split('\n')
    needed_words = []
    words = word_tokenize(text)
    for w in words:
        if w not in (stop_words):
            needed_words.append(w)
    filtered_sentence = " ".join(needed_words)
    return filtered_sentence

In [56]:
# Stemming ISRIStemmer @ http://www.nltk.org/_modules/nltk/stem/isri.html

In [57]:
def stemming(text):
    st = ISRIStemmer()
    stemmed_words = []
    words = word_tokenize(text)
    for w in words:
        stemmed_words.append(st.stem(w))
    stemmed_sentence = " ".join(stemmed_words)
    return stemmed_sentence

In [58]:
def prepareDataSets(reviews):
    sentences = []
    for index, r in reviews.iterrows():
        text = stopWordRmove(r['text'])
        text = noramlize(r['text'])
        text = stemming(r['text'])
        
        if r['label'] == -1:
            sentences.append([text, 'neg'])
        else:
            sentences.append([text, 'pos'])
    df_sentences = DataFrame(sentences, columns=['text', 'label'])
    return df_sentences

In [59]:
preprocessed_reviews = prepareDataSets(reviews)

In [60]:
preprocessed_reviews

Unnamed: 0,text,label
0,اول : نيو تغر شور صبح اعتياديه أختفاء صلص خير ...,neg
1,من محل شور ذات شعب تمز صلص قدم عها يمكن خير شو...,neg
2,دجج طزج حضر ممك على طلب لا يقم بعد تسخ تخذ بشر...,pos
3,فكم عرف رال كنغر كلا . فإن عرف ايض أكل مثل فيج...,pos
4,محو لي ان اقم طعم هشم بصف لسط عشت في ارد 24 عا...,neg
5,دعي طعم سمع كبر كثر من ستى وجب . جرب وكن عقد ا...,neg
6,انا من 5 سنو كنت اكل فيه كان طعم هشم ميز فول ح...,neg
7,قبل سعر جيد نسب لفل حمص فول يجي ابو جبر رتب ول...,pos
8,جرب نوع حشي لفل اعجب ررة كان نيئ,neg
9,طعم . رئع اكل نظف وش هالحسد لهم يا كفي الل قعد...,pos


In [61]:
# Feature Extraction

In [62]:
def featureExtraction(data):
    vectorizer = TfidfVectorizer(min_df=10, max_df=0.75, ngram_range=(1,3))
    tfidf_data = vectorizer.fit_transform(data)
    return tfidf_data

In [85]:
def learning(clf, X, Y):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X,Y, test_size=0.4, random_state=43)
    
    classifer = clf()
    classifer.fit(X_train, Y_train)
    
    predict = cross_validation.cross_val_predict(classifer, X_test, Y_test, cv=10)
    
    scores = cross_validation.cross_val_score(classifer,X_test, Y_test, cv=10)
    
    print (scores)
    
    print ("Accuracy of %s: %0.2f (+/- %0.2f)" % (classifer, scores.mean(), scores.std() *2))
    print (classification_report(Y_test, predict))

In [99]:
def main(clf):
    reviews = getData("SELECT * FROM `REVIEWS` LIMIT 3000")
    preprocessed_reviews = prepareDataSets(reviews)
    data, target = preprocessed_reviews['text'], preprocessed_reviews['label']
    tfidf_data = featureExtraction(data)
    learning(clf, tfidf_data, target)

In [100]:
main(MultinomialNB)

[ 0.75206612  0.71900826  0.7107438   0.7107438   0.73553719  0.72268908
  0.76470588  0.75630252  0.78991597  0.76470588]
Accuracy of MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True): 0.74 (+/- 0.05)
             precision    recall  f1-score   support

        neg       0.84      0.29      0.44       405
        pos       0.73      0.97      0.83       795

avg / total       0.77      0.74      0.70      1200



In [101]:
clfs = [MultinomialNB, BernoulliNB, LogisticRegression, SGDClassifier, SVC, LinearSVC, NuSVC]

In [102]:
for clf in clfs:
    main(clf)

[ 0.75206612  0.71900826  0.7107438   0.7107438   0.73553719  0.72268908
  0.76470588  0.75630252  0.78991597  0.76470588]
Accuracy of MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True): 0.74 (+/- 0.05)
             precision    recall  f1-score   support

        neg       0.84      0.29      0.44       405
        pos       0.73      0.97      0.83       795

avg / total       0.77      0.74      0.70      1200

[ 0.79338843  0.74380165  0.7768595   0.71900826  0.73553719  0.72268908
  0.80672269  0.68907563  0.74789916  0.65546218]
Accuracy of BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True): 0.74 (+/- 0.09)
             precision    recall  f1-score   support

        neg       0.64      0.51      0.57       405
        pos       0.77      0.86      0.81       795

avg / total       0.73      0.74      0.73      1200

[ 0.76859504  0.74380165  0.74380165  0.75206612  0.75206612  0.74789916
  0.77310924  0.7394958   0.79831933  0.75630252]
Accuracy of L

  'precision', 'predicted', average, warn_for)


[ 0.83471074  0.70247934  0.7768595   0.78512397  0.78512397  0.76470588
  0.80672269  0.73109244  0.8487395   0.77310924]
Accuracy of LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0): 0.78 (+/- 0.08)
             precision    recall  f1-score   support

        neg       0.71      0.59      0.65       405
        pos       0.81      0.88      0.84       795

avg / total       0.78      0.78      0.78      1200

[ 0.81818182  0.74380165  0.80165289  0.79338843  0.78512397  0.78991597
  0.82352941  0.7394958   0.83193277  0.76470588]
Accuracy of NuSVC(cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
   max_iter=-1, nu=0.5, probability=False, random_state=None,
   shrinking=True, tol=0.001, verbose=False): 0.79 (+/- 0.06)
             precision    recall  f1