In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
import sys
import os

# Put the parent directory on the import path so that modules living one
# level above this notebook can be imported; skip it if already present.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

import preprocess

In [3]:
# Load the labelled tweet corpus, decoding the file as UTF-8.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# looks like a saved index — confirm whether it should be read with index_col=0.
data = pd.read_csv('../../Data/Sentimental_Analysis/main.csv',encoding='utf-8')

In [4]:
# Display the first row of the loaded frame as the cell output.
data.head(1)

Unnamed: 0,LABEL,TWEET
0,anger,عجز الموازنه وصل ل93.7 % من الناتج المحلي يعني...


In [5]:
# Pull the text column and the label column out of the frame as plain
# Python lists (same row order as the frame).
tweet = data["TWEET"].tolist()
label = data['LABEL'].tolist()

In [6]:
# Load the stop-word list: the whole file is read and split on whitespace,
# giving one list entry per token. The context manager closes the handle
# even when the read raises, which the manual open()/close() pair did not.
with open("../../Data/stopwords.txt", "r", encoding="utf-8") as file:
    stopwords = file.read().split()
def get_tokens(text):
    """Split ``text`` on whitespace, expand abbreviations, drop stop words.

    Every token that is a key of the abbreviation table is replaced by its
    expansion; afterwards each token that appears in the module-level
    ``stopwords`` collection is removed.

    Args:
        text: The string to tokenize.

    Returns:
        list: The remaining tokens, in their original order.
    """
    dictionary = {'د': "دكتور"}  # abbreviation table; to be continued
    expanded = [dictionary.get(token, token) for token in text.split()]
    return [token for token in expanded if token not in stopwords]

In [7]:
import numpy as np
import verb_extraction
import stemming
def clean_tweet(text):
    """Run one text through the notebook's preprocessing pipeline.

    Steps, in order: ``preprocess.pre_process``, ``get_tokens``,
    ``verb_extraction.extract_stem_verb`` (called with a fresh empty dict)
    and ``stemming.stem``. The stemmer output is iterated as pairs and the
    first element of each pair is joined with single spaces.

    Args:
        text: The text to clean.

    Returns:
        str: The space-separated terms for this text.
    """
    tokens = get_tokens(preprocess.pre_process(text))
    tokens_verb_noun = verb_extraction.extract_stem_verb(tokens, {})
    tokens_verb_noun = np.array(stemming.stem(tokens_verb_noun))
    return ' '.join([str(elem) for elem, _ in tokens_verb_noun])


# Rebuild the cleaned corpus from the untouched ``data`` column instead of
# rewriting ``tweet`` element by element. Re-running this cell then always
# starts from the raw text, even after ``tweet`` has been rebound to the
# TF-IDF matrix further down, rather than re-processing already-cleaned text.
tweet = [clean_tweet(str(raw_text)) for raw_text in data["TWEET"]]

In [8]:
# Turn the cleaned tweets into TF-IDF features. With a float max_df, terms
# whose document frequency exceeds that proportion (here 0.6) are ignored.
# NOTE(review): the vectorizer is fitted on the whole corpus here, before the
# train/test split, so its vocabulary and IDF weights are derived from rows
# that later land in the test set.
# NOTE: `tweet` is rebound from a list of strings to the fit_transform result.
vectorizer = TfidfVectorizer (encoding='utf-8',max_df=0.6)
tweet = vectorizer.fit_transform(tweet)

In [9]:
# Hold out 10% of the rows for evaluation (test_size=0.1); the fixed
# random_state makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(tweet, label, test_size=0.1, random_state=0)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
# Alternative model, currently disabled:
#text_classifier = RandomForestClassifier(n_jobs=2, random_state=0)
# Fit a linear SVM on the training split only. fit() is the last expression
# of the cell, so its return value is echoed as the cell output.
text_classifier = LinearSVC(max_iter=10000,random_state=0)
text_classifier.fit(X_train, y_train)

LinearSVC(max_iter=10000, random_state=0)

In [11]:
# Predict a label for every row of the held-out split.
predictions = text_classifier.predict(X_test)

In [12]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Compare the predictions against the true test labels: confusion matrix,
# per-class precision/recall/F1 report, then overall accuracy.
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[1068  117  275  358  171]
 [ 131  865  257  188  160]
 [ 176  107 2550  360  478]
 [ 260  146  585 1712  242]
 [ 172  118  719  251 1062]]
              precision    recall  f1-score   support

       anger       0.59      0.54      0.56      1989
        fear       0.64      0.54      0.59      1601
         joy       0.58      0.69      0.63      3671
     sadness       0.60      0.58      0.59      2945
    surprise       0.50      0.46      0.48      2322

    accuracy                           0.58     12528
   macro avg       0.58      0.56      0.57     12528
weighted avg       0.58      0.58      0.58     12528

0.5792624521072797


In [13]:
# Refit the same classifier on the entire TF-IDF matrix and label list, i.e.
# on every row, including the rows that were split off into X_test.
# NOTE: after this cell X_test is no longer unseen data for the model, so any
# score computed on it measures fit to training data, not generalization.
text_classifier.fit(tweet, label)

LinearSVC(max_iter=10000, random_state=0)

In [14]:
# Score X_test again with the current state of text_classifier.
# NOTE: when this runs after the classifier has been refitted on the full
# matrix, X_test is a subset of the data it was trained on, so these numbers
# are in-sample and must not be compared with, or reported in place of, the
# held-out metrics obtained from the model trained on X_train only.
predictions = text_classifier.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[1606   55   99  162   67]
 [  62 1252  117   96   74]
 [  68   40 3226  146  191]
 [ 110   58  229 2447  101]
 [  69   48  316   92 1797]]
              precision    recall  f1-score   support

       anger       0.84      0.81      0.82      1989
        fear       0.86      0.78      0.82      1601
         joy       0.81      0.88      0.84      3671
     sadness       0.83      0.83      0.83      2945
    surprise       0.81      0.77      0.79      2322

    accuracy                           0.82     12528
   macro avg       0.83      0.81      0.82     12528
weighted avg       0.83      0.82      0.82     12528

0.8243933588761175


In [20]:
# Smoke-test the model on a few hand-written sentences, pushing each one
# through the same preprocessing steps that were applied to the corpus.
text_list = ["انا استمتعت جدا باليوم","انا اكرهك","انا اريد ان انتحر","انا سعيد"]
for index, sentence in enumerate(text_list):
    tokens = get_tokens(preprocess.pre_process(sentence))
    tokens_verb_noun = verb_extraction.extract_stem_verb(tokens,{})
    tokens_verb_noun = np.array(stemming.stem(tokens_verb_noun))
    text_list[index] = ' '.join([str(elem) for elem,_ in tokens_verb_noun])
# Keep these features under their own name. Rebinding `tweet` here would
# replace the corpus matrix with a 4-row matrix, and re-running the cell that
# calls text_classifier.fit(tweet, label) would then fail on mismatched sizes.
sample_features = vectorizer.transform(text_list)
prediction = text_classifier.predict(sample_features)
print(prediction)

['joy' 'anger' 'sadness' 'joy']


In [16]:
import pickle
# Persist the fitted classifier and the fitted TF-IDF vectorizer with pickle.
# Each file is opened in a context manager so the handle is flushed and closed
# as soon as the dump finishes; the bare open() calls were never closed.
# The file names are kept exactly as they were (including the spelling of
# "sentmental") because anything loading these artifacts presumably uses
# these paths — TODO confirm before renaming.
# NOTE: pickle files execute code on load; only unpickle trusted artifacts.
filename = '../../utils/sentmental_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(text_classifier, model_file)
filename = '../../utils/tfidf_model.sav'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)