In [74]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import sys
import os

# Put the parent directory on the import path (once), presumably so the
# local modules imported below (preprocess, verb_extraction, stemming)
# can be found -- verify against the repository layout.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import preprocess
import numpy as np
import verb_extraction
import stemming
import re


In [75]:
# Load the preprocessed dataset (UTF-8 encoded CSV) into a DataFrame.
data = pd.read_csv(
    '../../Data/Sentimental_Analysis/main_preprocessed.csv',
    encoding='utf-8',
)

In [76]:
# Read the whitespace-separated stopword list. The context manager
# guarantees the file handle is closed even if reading fails.
with open("../../Data/stopwords.txt", "r", encoding="utf-8") as stopwords_file:
    stopwords = stopwords_file.read().split()
def get_tokens(text):
    """Tokenize ``text``, expand known abbreviations and drop stopwords.

    Parameters
    ----------
    text : str
        Text to tokenize; it is split on whitespace with ``str.split()``.

    Returns
    -------
    list of str
        The tokens in their original order, with abbreviations replaced by
        their expansion and every token found in the module-level
        ``stopwords`` collection removed. Filtering is applied *after*
        expansion, so an expansion that is itself a stopword is dropped.
    """
    dictionary = {'د':"دكتور"}#to be continued
    # A set gives constant-time membership tests instead of scanning the
    # whole stopword list once per token.
    stopword_set = set(stopwords)
    expanded = (dictionary.get(word, word) for word in text.split())
    return [word for word in expanded if word not in stopword_set]


In [77]:
# Materialise the text column and the label column as plain Python lists.
tweet = [text for text in data['TWEET']]
label = [tag for tag in data['LABEL']]

In [78]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    tweet,
    label,
    test_size=0.2,
    random_state=0,
)

# TF-IDF over word 1- to 3-grams. The vocabulary is learned from the training
# texts only and then reused, unchanged, to encode the test texts.
vectorizer = TfidfVectorizer(
    encoding='utf-8',
    ngram_range=(1, 3),
    max_df=0.8,
    min_df=1,
)
X_train, X_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)


In [79]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# Fit a linear SVM on the vectorized training texts.
# Alternative that was tried and left disabled:
#   RandomForestClassifier(n_jobs=2, random_state=0)
text_classifier = LinearSVC(C=0.9, random_state=0)
text_classifier.fit(X_train, y_train)


LinearSVC(C=0.9, random_state=0)

In [80]:
from sklearn.metrics import accuracy_score

# Predict on the held-out split and print the fraction of labels that match.
predictions = text_classifier.predict(X_test)
print(accuracy_score(y_test, predictions))


0.5664797567002471


In [81]:
def _clean_text(text):
    """Strip links, Latin letters, newlines and digits, then tidy whitespace."""
    # Links are removed first: once the Latin letters have been blanked out
    # the 'http' prefix is gone and the URL pattern could never match.
    text = re.sub(r'http\S+', '', text) # remove links
    text = re.sub("[a-zA-Z]", " ", text) # remove english letters
    text = re.sub('\n', ' ', text) # remove \n from text
    text = re.sub(r'\d+', '', text) #remove number
    text = re.sub(' +', ' ', text) # remove extra space
    return text.strip() #remove whitespaces


# Sample sentences used as a quick sanity check of the trained model.
text_list = ["انا استمتعت جدا باليوم","انا اكرهك","انا اريد ان انتحر","انا سعيد"]
for index, raw_text in enumerate(text_list):
    # Run each sentence through the project pipeline: preprocess ->
    # tokenize / drop stopwords -> verb extraction -> stemming.
    tokens = get_tokens(preprocess.pre_process(raw_text))
    tokens_verb_noun = verb_extraction.extract_stem_verb(tokens,{})
    tokens_verb_noun = np.array(stemming.stem(tokens_verb_noun))
    # Each stemmed entry is unpacked as a pair and only its first element is
    # kept (presumably the stem itself -- confirm against stemming.stem).
    stemmed_text = ' '.join([str(elem) for elem,_ in tokens_verb_noun])
    text_list[index] = _clean_text(stemmed_text)

# Encode with the already-fitted vectorizer. A dedicated name is used so the
# training list `tweet` is not overwritten, which would break a re-run of the
# train/test split cell.
tweet_features = vectorizer.transform(text_list)
prediction = text_classifier.predict(tweet_features)
print(prediction)


['joy' 'anger' 'sadness' 'joy']


In [82]:
import pickle

# Persist the fitted classifier and the fitted TF-IDF vectorizer. The context
# managers make sure each file is flushed and closed once the dump finishes.
# NOTE: pickle files must only ever be loaded from a trusted source, since
# unpickling can execute arbitrary code.
filename = '../../utils/sentmental_all_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(text_classifier, model_file)
filename = '../../utils/tfidf_all_model.sav'
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)