In [25]:
import numpy as np
import pandas as pd
import re
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score

In [2]:
morph_list = []

with open('SU-Movie-Reviews-Morp.txt', encoding='utf-8') as file:
    lines = file.readlines()
    for line in lines:
        idx = line.split(" ")[0]
        normal = line.split(" ")[1]
        morphed = line.split(" ")[2]
        
        morph_list.append((idx, normal, morphed))

In [3]:
review_list = []

with open('SU-Movie-Reviews-Sentences.txt', "r", encoding='utf-8') as file:
    line = file.read()    
    splitted_line = line.split("\n")
    
    for s_line in splitted_line[:-1]:
        idx = s_line.split(" ")[0].split(":")[-1]
        polarity = s_line.split(" ")[1].split(":")[-1]
        sentence = " ".join(s_line.split(" ")[2:])
        
        splitted_sentence = re.findall(r"\w+|[^\w\s]", sentence, re.UNICODE)
        for i in range(len(splitted_sentence)):
            for morph in morph_list:
                if morph[0] == idx and morph[1] == splitted_sentence[i]:
                    splitted_sentence[i] = morph[2]
                    break
        
        morphed_sentence = " ".join(splitted_sentence)
        review_list.append((idx, polarity, sentence, morphed_sentence))

In [4]:
with open('SU-Movie-Reviews-KFolds.txt', "r", encoding='utf-8') as file:
    lines = [line[:-1].strip() for line in file.readlines() if line[:-1] != '']
    fold_1_train = [review_list[eval(idx)] for idx in lines[:110][2:]]
    fold_1_test = [review_list[eval(idx)] for idx in lines[110:139][1:]]
    fold_2_train = [review_list[eval(idx)] for idx in lines[139:249][2:]]
    fold_2_test = [review_list[eval(idx)] for idx in lines[249:278][1:]]
    fold_3_train = [review_list[eval(idx)] for idx in lines[278:388][2:]]
    fold_3_test = [review_list[eval(idx)] for idx in lines[388:417][1:]]
    fold_4_train = [review_list[eval(idx)] for idx in lines[417:527][2:]]
    fold_4_test = [review_list[eval(idx)] for idx in lines[527:556][1:]]
    fold_5_train = [review_list[eval(idx)] for idx in lines[556:666][2:]]
    fold_5_test = [review_list[eval(idx)] for idx in lines[666:][1:]]

In [5]:
def get_split_data(train_fold, test_fold, morp):
    if not morp:
        X_train = [review[2] for review in eval(train_fold)]
        X_test = [review[2] for review in eval(test_fold)]
    else:
        X_train = [" ".join([sp_review_element.split("+")[0] for sp_review_element in review[3].split("\n ")]) \
                  for review in eval(train_fold)]
        X_test = [" ".join([sp_review_element.split("+")[0] for sp_review_element in review[3].split("\n ")]) \
                  for review in eval(test_fold)]
    y_train = [review[1] for review in eval(train_fold)]
    y_test = [review[1] for review in eval(test_fold)]
    
    return X_train, y_train, X_test, y_test

In [6]:
def get_scores(clf, k, morp = False):
    scores = []
    for i in range(0, k):
        train_fold = "fold_" + str(i+1) + "_train"
        test_fold = "fold_" + str(i+1) + "_test"

        X_train, y_train, X_test, y_test = get_split_data(train_fold, test_fold, morp)

        clf_ = clf()
        vec = TfidfVectorizer(analyzer='word', min_df=0.1, max_df=1.0, sublinear_tf=True, use_idf=True, ngram_range=(1, 2))
        vec_clf = Pipeline([('vectorizer', vec), ('clf', clf_)])
        vec_clf.fit(X_train, y_train)
        scores.append(vec_clf.score(X_test, y_test))

    return scores

In [7]:
scores = get_scores(BernoulliNB, 5)
print("5-Fold Cross Validation Scores for normal sentences: \n"+ str(scores))
scores_morp = get_scores(BernoulliNB, 5, morp=True)
print("5-Fold Cross Validation Scores for morphed roots in sentences: \n"+ str(scores_morp))

5-Fold Cross Validation Scores for normal sentences: 
[0.35714285714285715, 0.35714285714285715, 0.2857142857142857, 0.42857142857142855, 0.4642857142857143]
5-Fold Cross Validation Scores for morphed roots in sentences: 
[0.5, 0.32142857142857145, 0.42857142857142855, 0.35714285714285715, 0.4642857142857143]


In [115]:
print("Accuracy for raw data: %0.3f (+/- %0.3f)" % (np.mean(scores), np.std(scores) * 2))
print("Accuracy for morphological analyzed data: %0.3f (+/- %0.3f)" % (np.mean(scores_morp), np.std(scores_morp) * 2))

Accuracy for raw data: 0.379 (+/- 0.125)
Accuracy for morphological analyzed data: 0.414 (+/- 0.132)


In [9]:
STN = pd.read_excel("./Polarity Resources/SentiTurkNet/STN.xlsx")

In [10]:
STN.tail()

Unnamed: 0,synonyms,Turkish Gloss,Polarity Label,POS tag,neg value,obj value,pos value,Eng Synonyms,English Gloss,SWNpos,SWNneg
14790,kritik,"Tehlikeli, endişe veren",o,a,0.06,0.872,0.068,critical#6,being in or verging on a state of crisis or em...,0.0,0.125
14791,"sönük , loş",ışıksız,o,a,0.125,0.818,0.057,subdued#5 dim#1,"lacking in light; not bright or harsh; ""a dim ...",0.125,0.75
14792,"fok , fok balığı",bir tür deniz balığı,o,n,0.06,0.872,0.068,pinniped_mammal#1 pinniped#1 pinnatiped#1,aquatic carnivorous mammal having a streamline...,0.0,0.0
14793,"örtü , battaniye",Örtmek için kullanılan şey,o,n,0.06,0.872,0.068,bedding#1 bedclothes#1 bed_clothing#1,coverings that are used on a bed,0.0,0.0
14794,"litre , desimetreküp , desimetre küp , l , l. ...",sıvı maddeleri ölçmek için kullnılan birim,o,n,0.06,0.872,0.068,metric_capacity_unit#1,a capacity unit defined in metric terms,0.0,0.0


In [11]:
STN_dict = dict()

for _, syn in STN.iterrows():
    STN_dict[syn[0]] = (syn[6], syn[4])

In [12]:
pos_scores_for_STN = []
neg_scores_for_STN = []

for review in review_list:
    words = review[3].split(" ")
    count = 0
    pos_score = 0
    neg_score = 0
    
    for word in words:
        root = word.split("+")[0]
        if root in STN_dict.keys():
            count = count + 1
            pos_score = pos_score + STN_dict[root][0]
            neg_score = neg_score + STN_dict[root][1]
   
    if count != 0:
        avg_pos_score = pos_score / count
        avg_neg_score = neg_score / count
    else: 
        avg_pos_score = 0
        avg_neg_score = 0
    
    pos_scores_for_STN.append(avg_pos_score)
    neg_scores_for_STN.append(avg_neg_score)

In [13]:
SN = pd.read_csv('./Polarity Resources/TurkishSenticNet2.txt', sep="\t", header=None)

In [14]:
SN.tail()

Unnamed: 0,0,1,2
14222,yore,-0.023,geçmiş zamanlar
14223,cry child,-0.132,bağırma çocuk
14224,baby need,-0.029,bebek gereksinim
14225,paper punch,-0.035,kâğıt zımbası
14226,circular route,0.062,


In [15]:
SN_dict = dict()

for _, syn in SN.iterrows():
    SN_dict[syn[2]] = syn[1]

In [16]:
pos_scores_for_SN = []
neg_scores_for_SN = []

for review in review_list:
    words = review[3].split(" ")
    count = 0
    pos_score = 0
    neg_score = 0
    
    for word in words:
        if word in SN_dict.keys():
            count = count + 1
            if SN_dict[word] >= 0:
                pos_score = pos_score + syn[1]
            else:
                neg_score = neg_score + syn[1]
                
    if count != 0:
        avg_pos_score = pos_score / count
        avg_neg_score = neg_score / count
    else: 
        avg_pos_score = 0
        avg_neg_score = 0
    
    pos_scores_for_SN.append(avg_pos_score)
    neg_scores_for_SN.append(avg_neg_score)

In [17]:
PWS_NEG = pd.read_csv('./Polarity Resources/allNeg_clean2.txt', header=None)
PWS_POS = pd.read_csv('./Polarity Resources/allPos_clean2.txt', header=None)

In [18]:
PWS_NEG.tail()

Unnamed: 0,0
1263,zoraki
1264,zorba
1265,zorlu
1266,zorunlu
1267,züğürt


In [19]:
PWS_POS.tail()

Unnamed: 0,0
916,gönlü zengin
917,gönül
918,hâkim
919,sakin
920,som


In [20]:
counts_indicative_words = []
counts_total_words = []
counts_for_capital = []

indicative_words_lst = ["izleyin", "izle", "iyi seyirler", "kaçırma", "kaçırmayın", "izlenmeli", "izlemeli", "izlemek gerek", "izlemek lazım", "izlenebilir"]

for review in review_list:
    words = review[2].split(" ")
    indic_count = 0
    total_count = 0
    cap_count = 0
    
    for word in words:
        if word in indicative_words_lst:
            indic_count = indic_count + 1
            
        total_count = total_count + 1
        
        if len(word) > 0:
            if word[0].isupper():
                cap_count = cap_count + 1
                
    counts_indicative_words.append(indic_count)
    counts_total_words.append(total_count)
    counts_for_capital.append(cap_count)

In [21]:
PWS_dict = dict()

for pws in PWS_POS.iterrows():
    PWS_dict[pws[1].item()] = 1
    
for pws in PWS_NEG.iterrows():
    PWS_dict[pws[1].item()] = 0

In [22]:
pos_counts_for_PWS = []
neg_counts_for_PWS = []
counts_for_adj_and_adv = []
is_negated_sentence = []

for review in review_list:
    words = review[3].split(" ")
    pos_count = 0
    neg_count = 0
    adj = 0
    adv = 0
    is_negated = False
    
    for word in words:
        root = word.split("+")[0]
        if root in PWS_dict.keys():
            if PWS_dict[root] == 1:
                pos_count = pos_count + 1
            else:
                neg_count = neg_count + 1
                
        if "Adj" in word:
            adj = adj + 1
        if "Adverb" in word:
            adv = adv + 1
        if "Neg" in word:
            is_negated = True
    
    pos_counts_for_PWS.append(pos_count)
    neg_counts_for_PWS.append(neg_count)
    counts_for_adj_and_adv.append(adj+adv)
    is_negated_sentence.append(1 if is_negated else 0)

In [23]:
features = np.vstack((pos_scores_for_STN, neg_scores_for_STN, pos_scores_for_SN, neg_scores_for_SN, pos_counts_for_PWS, neg_counts_for_PWS, counts_for_adj_and_adv, counts_indicative_words, counts_for_capital, counts_total_words, is_negated_sentence))

In [26]:
f_df = pd.DataFrame(normalize(features.T, axis=0, norm='max'))

In [27]:
f_df.columns = ["POS_STN", "NEG_STN", "POS_SN", "NEG_SN", "POS_PWS", "NEG_PWS", "ADJ_ADV", "INDIC_WORDS", "INIT_CAPITAL", "TOTAL_TOKENS", "NEGATED"]

In [29]:
f_df.tail()

Unnamed: 0,POS_STN,NEG_STN,POS_SN,NEG_SN,POS_PWS,NEG_PWS,ADJ_ADV,INDIC_WORDS,INIT_CAPITAL,TOTAL_TOKENS,NEGATED
2666,0.05375,0.074791,0.0,0.0,0.0,0.0,0.027027,0.0,0.028571,0.091667,1.0
2667,0.068,0.062762,0.0,0.0,0.0,0.2,0.054054,0.0,0.0,0.083333,0.0
2668,0.3735,0.062762,0.0,0.0,0.0,0.0,0.243243,0.0,0.028571,0.283333,1.0
2669,0.068,0.062762,0.0,0.0,0.0,0.2,0.189189,0.0,0.085714,0.258333,0.0
2670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.025,0.0


In [30]:
reviews = []
labels = []

for review in review_list:
    reviews.append(review[2])
    labels.append(review[1])

In [31]:
vectorizer = TfidfVectorizer(analyzer='word', min_df=0.1, max_df=1.0, sublinear_tf=True, use_idf=True, ngram_range=(1, 2))
reviews = vectorizer.fit_transform(reviews)

In [141]:
#data = np.concatenate((reviews.toarray(), f_df.iloc[:,:2].values), axis=1) # Accuracy: 0.457 (+/- 0.095)
#data = np.concatenate((reviews.toarray(), f_df.iloc[:,2:4].values), axis=1) # Accuracy: 0.414 (+/- 0.160)
#data = np.concatenate((reviews.toarray(), f_df.iloc[:,4:6].values), axis=1) # Accuracy: 0.414 (+/- 0.184)
#data = np.concatenate((reviews.toarray(), f_df.iloc[:,:6].values), axis=1) # Accuracy: 0.464 (+/- 0.090)
#data = np.concatenate((reviews.toarray(), f_df.iloc[:,6:7].values), axis=1) # Accuracy: 0.407 (+/- 0.140)
#data = np.concatenate((reviews.toarray(), f_df.iloc[:,:7].values), axis=1) # Accuracy: 0.464 (+/- 0.090)
#data = np.concatenate((reviews.toarray(), f_df.iloc[:,7:8].values), axis=1) # Accuracy: 0.414 (+/- 0.160)
#data = np.concatenate((reviews.toarray(), f_df.iloc[:,:8].values), axis=1) # Accuracy: 0.464 (+/- 0.090)
#data = np.concatenate((reviews.toarray(), f_df.iloc[:,8:9].values), axis=1) # Accuracy: 0.414 (+/- 0.160)
#data = np.concatenate((reviews.toarray(), f_df.iloc[:,:9].values), axis=1) # Accuracy: 0.464 (+/- 0.090)
#data = np.concatenate((reviews.toarray(), f_df.iloc[:,9:10].values), axis=1) # Accuracy: 0.407 (+/- 0.107)
#data = np.concatenate((reviews.toarray(), f_df.iloc[:,:10].values), axis=1) # Accuracy: 0.464 (+/- 0.090)
#data = np.concatenate((reviews.toarray(), f_df.iloc[:,10:11].values), axis=1) # Accuracy: 0.407 (+/- 0.107)
data = np.concatenate((reviews.toarray(), f_df.iloc[:,:].values), axis=1)

In [142]:
data.shape

(2671, 19)

In [143]:
with open('SU-Movie-Reviews-KFolds.txt', "r", encoding='utf-8') as file:
    lines = [line[:-1].strip() for line in file.readlines() if line[:-1] != '']
    fold_1_train_lr = [(data[eval(idx)],labels[eval(idx)]) for idx in lines[:110][2:]]
    fold_1_test_lr = [(data[eval(idx)],labels[eval(idx)]) for idx in lines[110:139][1:]]
    fold_2_train_lr = [(data[eval(idx)],labels[eval(idx)]) for idx in lines[139:249][2:]]
    fold_2_test_lr = [(data[eval(idx)],labels[eval(idx)]) for idx in lines[249:278][1:]]
    fold_3_train_lr = [(data[eval(idx)],labels[eval(idx)]) for idx in lines[278:388][2:]]
    fold_3_test_lr = [(data[eval(idx)],labels[eval(idx)]) for idx in lines[388:417][1:]]
    fold_4_train_lr = [(data[eval(idx)],labels[eval(idx)]) for idx in lines[417:527][2:]]
    fold_4_test_lr = [(data[eval(idx)],labels[eval(idx)]) for idx in lines[527:556][1:]]
    fold_5_train_lr = [(data[eval(idx)],labels[eval(idx)]) for idx in lines[556:666][2:]]
    fold_5_test_lr = [(data[eval(idx)],labels[eval(idx)]) for idx in lines[666:][1:]]

In [144]:
def get_split_data_lr(train_fold, test_fold):
    X_train = [review for review, label in eval(train_fold)]
    X_test = [review for review, label in eval(test_fold)]
    y_train = [label for review, label in eval(train_fold)]
    y_test = [label for review, label in eval(test_fold)]
    
    return X_train, y_train, X_test, y_test

In [145]:
scores_lr = [] 
for i in range(0, 5):
    train_fold = "fold_" + str(i+1) + "_train_lr"
    test_fold = "fold_" + str(i+1) + "_test_lr"

    X_train, y_train, X_test, y_test = get_split_data_lr(train_fold, test_fold)
    
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    
    scores_lr.append(lr.score(X_test, y_test))

In [146]:
scores_lr

[0.5, 0.39285714285714285, 0.5, 0.5, 0.5357142857142857]

In [147]:
print("Accuracy: %0.3f (+/- %0.3f)" % (np.mean(scores_lr), np.std(scores_lr) * 2))

Accuracy: 0.486 (+/- 0.097)


In [148]:
# For random cross validation folded with all data 
random_cv_scores = cross_val_score(BernoulliNB(), data, labels, cv=5)

In [149]:
random_cv_scores

array([ 0.62242991,  0.54766355,  0.57865169,  0.58801498,  0.57786116])

In [150]:
print("Accuracy: %0.3f (+/- %0.3f)" % (np.mean(random_cv_scores), np.std(random_cv_scores) * 2))

Accuracy: 0.583 (+/- 0.048)
