In [1]:
import pandas as pd

In [2]:
dat = pd.read_excel("data.xlsx")

In [3]:
len(dat)
dat['PROBLEM_CAT_ID'] = dat['PROBLEM_CAT_ID'].astype(str)

In [4]:
dat['P_DESCRIPTION'] = dat['P_DESCRIPTION'].astype(str)
dat['P_SOLUTION'] = dat['P_SOLUTION'].astype(str)
dat['P_SOLUTION_Sec'] = dat['P_SOLUTION_Sec'].astype(str)
dat['P_SOLUTION_Fir'] = dat['P_SOLUTION_Fir'].astype(str)


In [5]:
# Clean/Normalize Arabic Text
import regex as re
def clean_str(text):
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n',
              '\t','&quot;','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']
    
    #remove tashkeel
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel,"", text)
    
    #remove longation
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
    
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
    
    #trim    
    text = text.strip()

    return text

In [7]:
from sklearn.cross_validation import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(dat["P_DESCRIPTION"],                   
                                                dat['P_SOLUTION'],
                                                test_size=0.25,
                                                random_state=1)



In [8]:
xtrain = xtrain.apply(clean_str)
xtest = xtest.apply(clean_str)

In [44]:
xtrain = xtrain.astype('str')
ytrain = ytrain.astype('str')

# **Training Model using TFIDF**

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
count_vect = CountVectorizer()
tfidf_vect = TfidfVectorizer()
X_train_counts = count_vect.fit_transform(xtrain)
X_train_tfidf = tfidf_vect.fit_transform(xtrain)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


In [10]:
clf = MultinomialNB().fit(X_train_counts, ytrain)

In [11]:
from sklearn.metrics import accuracy_score,classification_report
predicted = clf.predict(count_vect.transform(xtest))
acc = accuracy_score(ytest,predicted)
print ('test accuracy = '+str(acc*100)+'%')
predicted = clf.predict(count_vect.transform(xtrain))
acc = accuracy_score(ytrain,predicted)
print ('train accuracy = '+str(acc*100)+'%')

test accuracy = 25.318761384335154%
train accuracy = 28.545399331916187%


# **Train Model using Count Vector **

In [13]:
from sklearn.metrics import accuracy_score,classification_report
predicted = clf.predict(tfidf_vect.transform(xtest))
acc = accuracy_score(ytest,predicted)
print ('test accuracy = '+str(acc*100)+'%')
predicted = clf.predict(tfidf_vect.transform(xtrain))
acc = accuracy_score(ytrain,predicted)
print ('train accuracy = '+str(acc*100)+'%')

test accuracy = 25.318761384335154%
train accuracy = 24.56726389310659%


# SVM

In [14]:
from sklearn import svm
svc = svm.SVC()
svc.fit(X_train_counts, ytrain)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
from sklearn.metrics import accuracy_score,classification_report
predicted = svc.predict(tfidf_vect.transform(xtest))
acc = accuracy_score(ytest,predicted)
print ('test accuracy = '+str(acc*100)+'%')
predicted = svc.predict(tfidf_vect.transform(xtrain))
acc = accuracy_score(ytrain,predicted)
print ('train accuracy = '+str(acc*100)+'%')

test accuracy = 25.318761384335154%
train accuracy = 24.17248709383541%


# Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train_counts, ytrain)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [17]:
from sklearn.metrics import accuracy_score,classification_report
predicted = clf.predict(tfidf_vect.transform(xtest))
acc = accuracy_score(ytest,predicted)
print ('test accuracy = '+str(acc*100)+'%')
predicted = clf.predict(tfidf_vect.transform(xtrain))
acc = accuracy_score(ytrain,predicted)
print ('train accuracy = '+str(acc*100)+'%')

test accuracy = 25.318761384335154%
train accuracy = 24.17248709383541%


In [51]:
from sklearn.feature_extraction.text import CountVectorizer
xcount_vect = CountVectorizer()
ycount_vect = CountVectorizer()
X_train_counts = xcount_vect.fit(xtrain)
Y_train_counts = ycount_vect.fit(ytrain)

# NGram Search Engine for QA

In [18]:
import ngram

In [19]:
s1 = dat['P_SOLUTION'].tolist()
s2 = dat['P_SOLUTION_Sec'].tolist()
s3 = dat['P_SOLUTION_Fir'].tolist()
sol = s1 + s2 + s3

In [20]:
G = ngram.NGram(sol)

In [21]:
 G.search('ظهور تبرقش اصفر اللون على اوراق نبات الكوسة صنف شمامىوالتبرقش على سطحى الورقة السفلى والعلوى,وفى بعض الاوراق تصل نسبة التبرقش على الورقة الى 50 %من سطح الورقة,المساحة المنزرعة فدان والاصابة تتركز فى شريحة واحدة فقط حتى الان فما سبب الاصابة ؟وما هو العلاج')

[('   وصف المشكلة ظهور بقع بيضاء باهته على السطح العلوى للاوراق مع ظهور لون بنفسجى محمرعلى السطح السفلى للاوراق فى القطن صنف جيزة 86 مع وجود افراد من العنكبوت على سطح الاوراق السفلى فى مساحات متفرقةمن الحقل فى مساحة فدانين ونصف الفدان 0   ',
  0.1761904761904762),
 ('تم النزول الى الحقل المصاب وجد تقزم فى بعض االنباتات المصابة كما تلاحظ وجود نمو زغبى على السطح السفلى للاوراق ووجود بقع صفراء على السطح العلوى للاوراق ',
  0.1752873563218391),
 ('بعد فحص الاوراق المصابة وجد حشرة العنكبوت الاحمر على السطح السفلى للاوراق ولعلاج هذه الحشرة يستخدم مادة الكوميت 73 % بمعدل 600 سم / الفدان او تيدا فول بمعدل واحد لتر للفدان مع مراعاة الاتى 1- يجب رش البقع المصابة فقط ووصول محلول الرش الى الى الاسطح السفلية 2 - عدم خلط المبيدات مع الاسمدة الورقية 3 - تجنب الرش وقت الظهيرة ',
  0.16901408450704225),
 ('بعد فحص الاوراق  المصابة وجد حشرة العنكبوت الاحمر  على السطح السفلى للاوراق ولعلاج هذه الحشرة يستخدم مادة الكوميت 73 % بمعدل 600 سم / الفدان او تيدا فول بمعدل واحد لتر للفدان مع مراعاة الاتى 1- يجب  