# In [13]:
"""Automatically label sentiment of Tigrinya comment
    0: negative
    1: positive
"""

import re
import nltk
import pandas as pd
import os

# In [3]:
def find_all_occurance(row,rowdata):
    """Return the offsets of every non-overlapping occurrence of a substring.

    Args:
        row: the string to search in
        rowdata: the substring to look for
    Returns:
        list of 0-based offsets into ``row`` in ascending order; empty when
        ``rowdata`` is empty or does not occur in ``row``
    """
    if not rowdata:
        # str.find("") succeeds at every position, so the scan would never end.
        return []
    td_indexes = []
    td = row.find(rowdata)
    while td >= 0:
        td_indexes.append(td)
        # Searching from a start offset (instead of slicing the string) keeps
        # every reported index relative to the original ``row``; skipping the
        # whole match keeps occurrences non-overlapping.
        td = row.find(rowdata, td + len(rowdata))
    return td_indexes

def _tag_offsets(text, tag):
    """Return the offsets in ``text`` of every non-overlapping ``tag``."""
    offsets = []
    at = text.find(tag)
    while at >= 0:
        offsets.append(at)
        at = text.find(tag, at + len(tag))
    return offsets


def extract_emojis_and_polarity(path):
    """Extract two cells from every 12-cell row of an HTML table file.

    The file is scanned for ``<tr>...</tr>`` rows. Only rows that contain
    exactly 12 ``<td>`` and 12 ``</td>`` tags are used; any other row
    is skipped.

    Args:
        path: path to a UTF-8 encoded HTML file
    Returns:
        list of ``[unicode, polarity]`` pairs, where ``unicode`` is the text of
        the 3rd cell and ``polarity`` the text of the 9th cell of each row
    """
    with open(path, 'r', encoding= 'utf-8') as f:
        data = f.read()
    emojis = []
    cursor = 0
    while True:
        row_start = data.find("<tr>", cursor)
        if row_start < 0:
            break
        row_end = data.find("</tr>", row_start)
        if row_end < 0:
            # Unterminated last row: nothing reliable left to parse.
            break
        row = data[row_start:row_end]
        start_td_indexes = _tag_offsets(row, "<td>")
        end_td_indexes = _tag_offsets(row, "</td>")
        # Make sure the structure matches the expected table row, which has 12 cells.
        if len(start_td_indexes) == 12 and len(end_td_indexes) == 12:
            # Offsets are relative to ``row``, so slice ``row`` (not ``data``).
            unicode = row[start_td_indexes[2] + len("<td>"):end_td_indexes[2]]
            polarity = row[start_td_indexes[8] + len("<td>"):end_td_indexes[8]]
            emojis.append([unicode, polarity])
        cursor = row_end + len("</tr>")
    return emojis

def load_local_emojis(path):
    """Load emoji records from a comma-separated text file.

    Each non-blank line must hold at least three comma-separated fields; only
    the first three are kept. Blank lines are skipped.

    Args:
        path: path to a UTF-8 encoded file with one record per line
    Returns:
        list of ``[field0, field1, field2]`` lists, in file order
        (presumably emoji, code point and polarity label - verify against the
        data file)
    Raises:
        IndexError: when a non-blank line has fewer than three fields
    """
    emojis = []
    with open(path, 'r', encoding ='utf-8') as f:
        for line in f:
            line = line.strip("\t\n")
            if not line.strip():
                # A blank line has no fields and would otherwise raise IndexError.
                continue
            line_data = line.split(',')
            emojis.append([line_data[0], line_data[1], line_data[2]])
    return emojis

# In [4]:
def download_extract_unicode_official(url):
    """Read the HTML tables found at ``url`` with ``pandas.read_html``.

    Args:
        url: a URL or a local path of an HTML document
    Returns:
        the value returned by ``pd.read_html(url, header=0)`` (first table row
        used as header), or None when reading or parsing fails
    """
    try:
        data = pd.read_html(url, header = 0)
    except Exception:
        # Best-effort load: report and return None. ``Exception`` (rather than
        # a bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        print("Check the url given... Don't forget to connect to the interent if the url is not local")
        return None
    return data
    
    
def store(comments, path):
    """Write comments to a UTF-8 text file, one per line.

    Args:
        comments: sequence of strings to persist
        path: destination file; it is created or overwritten
    """
    with open(path, mode='w', encoding='utf-8') as sink:
        # Entries are separated by a newline; no newline follows the last one.
        payload = "\n".join(comments)
        sink.write(payload)
        
def it_contains_Tig_words(comment):
    """Tell whether a comment holds enough Ethiopic-script words.

    A "word" here is a maximal run of characters in the range
    U+1200-U+137F.

    Args:
        comment: a single comment
    Returns:
        True when the comment contains more than two such runs,
        False otherwise
    """
    # NOTE(review): an earlier inline comment said "at least 2 words", but the
    # threshold has always been "more than 2"; the behaviour is kept as is.
    tig_flag = re.findall(r'[\u1200-\u137F]+', comment)
    return len(tig_flag) > 2

#Loads comments from a given path
def load_comments(path):
    """Read a UTF-8 text file and return its lines without newlines.

    Args:
        path: path to a file holding one comment per line
    Returns:
        list of comments in file order (blank lines become empty strings)
    """
    comments = []
    with open(path, "r", encoding="utf-8") as source:
        for raw_line in source.readlines():
            comments.append(raw_line.replace('\n', ''))
    return comments

#Loads emojis from a given url [Note: the code of this function is tailored to the given default url]
def load_emojis(url = "http://kt.ijs.si/data/Emoji_sentiment_ranking/"):
    """Load emojis from an HTML sentiment table and bucket them by score.

    The first table found at ``url`` is read and each row is labelled from its
    'Sentiment score[-1...+1]' column:
        score > 0.4          -> 'pos_high'
        0 < score <= 0.4     -> 'pos_low'
        -0.1 < score <= 0    -> 'neg_low'
        score <= -0.1        -> 'neg_high'
    Rows whose score is missing (NaN) are skipped.

    Args:
        url: a url or path to emoji html file
    Returns:
        list of ``[char, unicode_codepoint, label]`` lists, or None when the
        table cannot be read
    """
    try:
        data = pd.read_html(url, header = 0)
    except Exception:
        # Best-effort load: report and return None. ``Exception`` (rather than
        # a bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        print("Check the url given... Don't forget to connect to the interent if the url is not local")
        return None
    emojis = []
    pdframe = pd.DataFrame(data[0], columns = ['Char', 'Image[twemoji]','Unicodecodepoint', 'Occurrences[5...max]','Position[0...1]', 'Neg[0...1]', 'Neut[0...1]', 'Pos[0...1]', 'Sentiment score[-1...+1]','Sentiment bar(c.i. 95%)','Unicode name','Unicode block'])
    for index, row in pdframe.iterrows():
        polarity = float(row['Sentiment score[-1...+1]'])
        if pd.isna(polarity):
            # NaN fails every comparison below and would be mislabelled as
            # strongly negative; a row without a score carries no signal.
            continue
        if polarity > 0.4:
            label = 'pos_high'
        elif polarity > 0:
            label = 'pos_low'
        elif polarity > -0.1:
            label = 'neg_low'
        else:
            label = 'neg_high'
        emojis.append([row["Char"], row['Unicodecodepoint'], label])
    return emojis

#Returns True if emoji is in already saved emojis else False
def check_if_emoji_exist_already(emoji, emojis):
    """Report whether an emoji is already present in a list of records.

    Args:
        emoji: the value to look for
        emojis: list of records; a record matches when its first or its
            second element equals ``emoji``
    Returns:
        True when some record matches, False otherwise
    """
    return any(
        known[0] == emoji or known[1] == emoji
        for known in emojis
    )

#
def append_emojis(emojis,data, sentiment):
    """Add the emojis listed in ``data`` to ``emojis`` with a given label.

    Each line is split on single spaces: the first token is taken as the code
    and every following token as an emoji sharing that code. An emoji is added
    only when it does not already equal the first or second element of an
    existing record.

    Args:
        emojis: list of ``[emoji, code, label]`` records; extended in place
        data: iterable of text lines
        sentiment: label stored with every emoji added from ``data``
    Returns:
        the same ``emojis`` list, after extension
    """
    # Everything that counts as "already known": emoji and code of each record.
    known = set()
    for record in emojis:
        known.add(record[0])
        known.add(record[1])
    for line in data:
        tokens = line.split(' ')
        code = tokens[0].strip(' \t\n\r')  # At position 0 we have the code, not an emoji
        for token in tokens[1:]:
            new_emoji = token.strip(' \t\n\r')
            if not new_emoji:
                # Doubled or trailing spaces produce empty tokens; an empty
                # "emoji" would later match inside every comment.
                continue
            if new_emoji in known:
                continue
            emojis.append([new_emoji, code, sentiment])
            known.add(new_emoji)
            known.add(code)
    return emojis

def add_extra_emojis(emojis,path):
    """Extend an emoji list with the emoji files found in a directory.

    Every directory entry whose name contains ".txt" is read; only the four
    recognised file names contribute emojis, each with a fixed label.

    Args:
        emojis: already existing emojis
        path: directory that holds the emoji files
    Returns:
        the emoji list including the emojis of the recognised files
    """
    label_for_file = {
        "neg.txt": 'neg_high',
        "pos.txt": 'pos_high',
        "slightly_pos.txt": 'pos_low',
        "slightly_neg.txt": 'neg_low',
    }
    for entry in os.listdir(path):
        if ".txt" not in entry:
            continue
        with open(os.path.join(path, entry), 'r', encoding = 'utf-8') as handle:
            lines = handle.readlines()
        label = label_for_file.get(entry)
        if label is not None:
            emojis = append_emojis(emojis, lines, label)
    return emojis
    
def simple_pos_neg_counter(comment,emojis):
    """Count positive and negative emoji occurrences in a comment.

    Args:
        comment: a single comment
        emojis: list of records whose first element is the emoji text and whose
            third element is its label; 'pos_high' and 'pos_low' count as
            positive, every other label as negative
    Returns:
        tuple ``(pos, neg)`` with the number of occurrences of each polarity
    """
    pos = 0
    neg = 0
    for emoji in emojis:
        symbol = emoji[0]
        if not symbol:
            # An empty string "occurs" between every character; ignore it.
            continue
        # Only emojis that actually appear in the comment may contribute.
        occurrences = comment.count(symbol)
        if occurrences == 0:
            continue
        if emoji[2] == "pos_high" or emoji[2] == "pos_low":
            pos += occurrences
        else:
            neg += occurrences
    return pos,neg

#Remove amharic comments from tigrinya
def remove_amharic(myarr):
    """Drop every comment that contains one of the Amharic marker letters.

    Args:
        myarr: array of comments (both Amharic and Tigrinya)
    Returns:
        list of the comments that contain none of the marker letters, in the
        original order
    """
    # Letters treated as evidence that a comment is Amharic.
    # NOTE(review): three of these entries used to carry a trailing TAB inside
    # the literal, so they could only match when followed by a tab.
    amharic_letters = ['ሇ','ሗ','ሧ','ሷ','ቇ','ቧ','ቷ','ኇ','ኗ','ኧ','ዧ','ዷ','ጇ','ጧ','ጷ','ፇ','ፗ','ፘ','ፙ','ፚ',
                       'ፏ','ጿ','ጯ','ጟ','ጏ','ዿ','ዯ','ዟ','ዏ','ኯ','ኟ','ቿ','ቯ','ሿ','ሯ','ሟ','ሏ','ⶇ','ⶶ']
    tig_exc_amh = []
    for comment in myarr:
        if not any(ltr in comment for ltr in amharic_letters):
            tig_exc_amh.append(comment)
    return tig_exc_amh
        

def pos_neg_counter(comment,emojis):
    """Count emoji occurrences in a comment, split by polarity label.

    Args:
        comment: a single comment
        emojis: list of records whose first element is the emoji text and whose
            third element is one of 'pos_high', 'pos_low', 'neg_high',
            'neg_low'; records with any other label are ignored
    Returns:
        tuple ``(pos_high, pos_low, neg_high, neg_low)`` of occurrence counts
    """
    tally = {"pos_high": 0, "pos_low": 0, "neg_high": 0, "neg_low": 0}
    for emoji in emojis:
        symbol = emoji[0]
        if not symbol:
            # An empty string is "found" len(comment) + 1 times; ignore it.
            continue
        counter = comment.count(symbol)
        if counter and emoji[2] in tally:
            tally[emoji[2]] += counter
    return tally["pos_high"],tally["pos_low"],tally["neg_high"],tally["neg_low"]


# In [39]:
#Main function
def main(UNLABELLED_COMMENTS_PATH,POS_PATH,NEG_PATH):
    """Label comments as positive or negative from the emojis they contain.

    Procedure:
        Load the emoji records from the module-level LOCAL_EMOJI_PATH.
        For every ".txt" file in UNLABELLED_COMMENTS_PATH, for every comment:
            if it does not pass it_contains_Tig_words, set it aside and skip it
            reduce the comment to its word tokens (this drops the emojis)
            if the reduced text is 2 characters or shorter, skip it
            if it contains no known emoji at all, leave it unlabelled
            if it contains only positive emojis, label it positive
            if it contains only negative emojis, label it negative
            otherwise compare the weighted scores
                positive: 7*pos_high + 2*pos_low
                negative: 8*neg_high + 2*neg_low
              the higher score wins; on a tie the comment is negative when
              neg_high > pos_high and positive otherwise
        Drop labelled comments that remove_amharic flags as Amharic.
        Write the positive comments to POS_PATH and the negative to NEG_PATH.

    Stored lines have the form "1;<text>" (positive) or "0;<text>" (negative).

    Args:
        UNLABELLED_COMMENTS_PATH: directory with ".txt" files, one comment per line
        POS_PATH: output file for the positive comments
        NEG_PATH: output file for the negative comments
    Returns:
        tuple ``(NEG, POS)`` of the labelled negative and positive lines
    """
    POS = []
    NEG = []
    # Comments rejected by it_contains_Tig_words; collected but not persisted.
    ONLY_EMOJI = []
    # NOTE: LOCAL_EMOJI_PATH is a module-level name set by the script entry
    # point; it must be defined before main() is called.
    emojis = load_local_emojis(LOCAL_EMOJI_PATH)
    ALPHA_HIGH = 7
    ALPHA_LOW = 2
    regex_tokenizer = nltk.RegexpTokenizer(r"\w+")
    for filename in os.listdir(UNLABELLED_COMMENTS_PATH):
        if not filename.endswith(".txt"):
            continue
        comment_path = os.path.join(UNLABELLED_COMMENTS_PATH, filename)
        for comment in load_comments(comment_path):
            if not it_contains_Tig_words(comment):
                ONLY_EMOJI.append(comment)
                continue
            pos_high,pos_low,neg_high,neg_low = pos_neg_counter(comment,emojis)
            pure_Tig_comment = " ".join(regex_tokenizer.tokenize(comment))
            if len(pure_Tig_comment) <= 2:
                continue
            positives = pos_high + pos_low
            negatives = neg_high + neg_low
            if positives == 0 and negatives == 0:
                # No emoji means no evidence: leave the comment unlabelled
                # instead of letting it fall through to the tie-break below.
                continue
            if negatives == 0:  # Comment contains only positive emojis
                POS.append("1;"+pure_Tig_comment)
            elif positives == 0:  # Comment contains only negative emojis
                NEG.append("0;"+pure_Tig_comment)
            else:  # Comment contains both negative and positive emojis
                pos_score = (ALPHA_HIGH * pos_high) + (ALPHA_LOW * pos_low)
                # Strongly negative emojis weigh one point more than strongly positive ones.
                neg_score = ((ALPHA_HIGH + 1) * neg_high) + (ALPHA_LOW * neg_low)
                if pos_score > neg_score:
                    POS.append("1;"+pure_Tig_comment)
                elif pos_score < neg_score:
                    NEG.append("0;"+pure_Tig_comment)
                elif pos_high < neg_high:
                    NEG.append("0;"+pure_Tig_comment)
                else:
                    POS.append("1;"+pure_Tig_comment)

    #remove Amharic comments
    POS = remove_amharic(POS)
    NEG = remove_amharic(NEG)

    #Store Tigrinya automatically labelled comments
    store(POS,POS_PATH)
    store(NEG,NEG_PATH)

    print("Length of positive comments : ",len(POS))
    print("Length of negative comments : ", len(NEG))
    print(len(POS) + len(NEG))
    return NEG,POS
            

# In [12]:
#run main functions
if __name__ == "__main__":

    # Input directory with the raw comments (placeholder - replace before running).
    UNLABELLED_COMMENTS_PATH = "YOUR PATH TO UNLABLED COMMENTS: THE PATH SOULD CONTAIN 1 OR MORE .txt FILES INSIDE WITH 1 COMMENT PER LINE"

    # Emoji resources. main() reads LOCAL_EMOJI_PATH as a module-level name;
    # EMOJI_PATH is not referenced by the code visible in this file.
    LOCAL_EMOJI_PATH = "data/all_emojis.txt"
    EMOJI_PATH = "data/polarized emojis.txt"

    # Output files for the labelled comments.
    NEG_PATH = "data/labeled_neg.txt"
    POS_PATH = "data/labeled_pos.txt"

    labelled = main(UNLABELLED_COMMENTS_PATH, POS_PATH, NEG_PATH)
    NEG, POS = labelled

    