In [None]:
# Switch the kernel's working directory to the project's "data normalization"
# folder so that relative file names opened in later cells resolve against it.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — no
# mount call is visible in this part of the notebook; confirm before running.
%cd "/content/drive/MyDrive/Sentiment Analysis of Code-Mixed Telugu-English Text using Sequence models. (SACMTET)/data normalization"

/content/drive/MyDrive/Sentiment Analysis of Code-Mixed Telugu-English Text using Sequence models. (SACMTET)/data normalization


# Data Normalization

In [None]:
!pip install symspellpy
import pkg_resources
from symspellpy import SymSpell, Verbosity
import re



In [None]:
# 5.1 Elongation Normalization
def elongation_normalization(text):
    """Lower-case ``text`` and shorten elongated character runs.

    Any character (other than a newline) that occurs three or more times in a
    row is reduced to exactly two occurrences; runs of one or two characters
    are left untouched.

    Args:
        text: The string to normalize.

    Returns:
        The lower-cased string with long character runs trimmed to length two.
    """
    long_run = re.compile(r'(.)\1{2,}')
    return long_run.sub(r"\1\1", text.lower())
example = "hellooo bagundhiii, gooood"
elongation_normalization(example)


'helloo bagundhii, good'

In [None]:
# 5.2 Normalizing English Words
# Shared SymSpell instance used by the English spelling-correction step.
sym_spell = SymSpell(max_dictionary_edit_distance=3)
# English frequency dictionary located inside the installed symspellpy package.
# NOTE(review): pkg_resources is deprecated by setuptools; consider migrating
# to importlib.resources once the installed symspellpy version is confirmed.
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
# load_dictionary signals a missing file through its boolean return value
# instead of raising, so check it: otherwise every later correction would run
# against an empty dictionary without any visible error.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(
        f"SymSpell frequency dictionary could not be loaded from {dictionary_path}"
    )

def eng_spelling_correction(text):
    """Return the spelling-corrected form of English ``text``.

    The text is handed to the module-level ``sym_spell`` instance via
    ``word_segmentation`` and the ``corrected_string`` of the result is
    returned.

    Args:
        text: English word or phrase to correct.

    Returns:
        The corrected string produced by SymSpell.
    """
    return sym_spell.word_segmentation(text).corrected_string

example = "thsi isa a lovely movei"
eng_spelling_correction(example)

'this is a lovely movie'

In [None]:
# 5.3.1 Normalizing Transliterations
def transliteration_normalization(text):
    """Collapse every run of a repeated character down to a single character.

    Applies to any character except a newline, so doubled vowels and doubled
    consonants in a transliterated word are both reduced
    (e.g. ``"tinnaavaa"`` -> ``"tinava"``).

    Args:
        text: The string to normalize.

    Returns:
        The string with no two identical adjacent characters (newlines aside).
    """
    repeated_char = re.compile(r'(.)\1{1,}')
    return repeated_char.sub(r"\1", text)

example = "tinnaavaa sarigga"
transliteration_normalization(example)

'tinava sariga'

In [None]:
# Normalizing Aspirated Consonants
def consonant_normalization(text):
    """Replace aspirated consonant spellings with their unaspirated form.

    The substitutions are applied one after another, in the listed order, each
    over the result of the previous one: kh->k, chh->ch, gh->g, th->t, jh->j,
    dh->d, bh->b.

    Args:
        text: The string to normalize.

    Returns:
        The string after all substitutions have been applied.
    """
    # Order matters: each replacement sees the output of the previous one.
    substitutions = (
        ('kh', 'k'),
        ('chh', 'ch'),
        ('gh', 'g'),
        ('th', 't'),
        ('jh', 'j'),
        ('dh', 'd'),
        ('bh', 'b'),
    )
    for aspirated, plain in substitutions:
        text = text.replace(aspirated, plain)
    return text

example = "thinnava jharkand"
consonant_normalization(example)

'tinnava jarkand'

In [None]:
def data_normalization(text_tuple):
    """Normalize a language-tagged sentence and return it as one string.

    Every token is first passed through ``elongation_normalization``. What
    happens next depends on its tag:

    * ``'en'``   -- ``eng_spelling_correction`` is applied.
    * ``'te'``   -- ``transliteration_normalization`` followed by
      ``consonant_normalization`` is applied.
    * ``'univ'`` -- the token is kept as is.

    Tokens carrying any other tag are left out of the result.

    Args:
        text_tuple: Iterable of ``(word, lang)`` pairs.

    Returns:
        The normalized tokens, each followed by a single space (so a non-empty
        result ends with a trailing space).
    """
    pieces = []
    for pair in text_tuple:
        token, tag = pair[0], pair[1]

        # Elongation handling is shared by all languages.
        token = elongation_normalization(token)

        if tag == 'en':  # english
            pieces.append(eng_spelling_correction(token))
        elif tag == 'te':  # telugu
            collapsed = transliteration_normalization(token)
            pieces.append(consonant_normalization(collapsed))
        elif tag == 'univ':  # universal
            pieces.append(token)
    return "".join(piece + " " for piece in pieces)

example = [('worstttt', 'en'), ('government', 'en'), ('.', 'univ'), ('#YSRCP', 'univ'), ('chala', 'te'), ('chethha', 'te'), ('ga', 'te'), ('paripalana', 'te'), ('chesthumdhi', 'te'), ('.', 'univ')]
data_normalization(example)

'worst government . #ysrcp chala cheta ga paripalana chestumdi . '

In [None]:
# Read the raw annotated corpus as a list of lines (despite its name, `df` is
# a plain list of strings, not a DataFrame). The encoding is given explicitly
# because the text contains emoji and typographic quotes, and the platform
# default encoding is not guaranteed to be UTF-8.
with open('codemix_sentiment_data.txt', encoding='utf-8') as f:
    df = f.readlines()

In [None]:
# Walk the raw lines in steps of four: the first line of each group holds the
# label and the sentence, the second holds the per-token language tags. The
# remaining two lines of a group are not read.
labels, sents = [], []
for start in range(0, len(df), 4):
    labelled_text = df[start].replace('\n','')
    tag_line = df[start + 1].replace('\n','')

    # The first three characters are the label; the sentence starts at index 4.
    labels.append(labelled_text[:3])

    # Pair each whitespace-separated token with the tag at the same position.
    tokens = labelled_text[4:].split()
    sents.append(list(zip(tokens, tag_line.split())))

In [None]:
# Show the first two parsed sentences next to their labels as a sanity check.
for sample_idx in (0, 1):
    print(sents[sample_idx], labels[sample_idx])

[('We', 'en'), ('need', 'en'), ('Mr', 'univ'), ('chari', 'univ'), ("'s", 'univ'), ('review', 'en'), ('on', 'en'), ('master', 'en')] NTL
[('worst', 'en'), ('government', 'en'), ('.', 'univ'), ('#YSRCP', 'univ'), ('chala', 'te'), ('chethha', 'te'), ('ga', 'te'), ('paripalana', 'te'), ('chesthumdhi', 'te'), ('.', 'univ')] NEG


In [None]:
import csv

# Normalize every sentence and write it with its label to a CSV file.
# newline='' is what the csv module asks for on files it writes (otherwise
# extra blank rows can appear on platforms that translate line endings), and
# the explicit UTF-8 encoding keeps emoji in the comments writable regardless
# of the platform default.
with open('normalized_data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['comment', 'label'])

    for i, (tupl_text, label) in enumerate(zip(sents, labels)):
        # Lightweight progress indicator: print the index every 2500 sentences.
        if i % 2500 == 0:
            print(i)
        text = data_normalization(tupl_text)
        writer.writerow([text, label])

0
2500
5000
7500
10000
12500
15000
17500


In [None]:
import pandas as pd
# Load the exported CSV back in and preview its first ten rows.
normalized_data = pd.read_csv(filepath_or_buffer='normalized_data.csv')
normalized_data.head(n=10)

Unnamed: 0,comment,label
0,we need mr chari 's review on master,NTL
1,worst government . #ysrcp chala cheta ga parip...,NEG
2,baya nuvu emina chepu kani bagoledu ani chepak...,NEG
3,gadini vadilesi manchi pani chesaru @rcbtweets 👍,POS
4,i came to watch thyview 's review crying after...,POS
5,enti baya review ela ichav chala anukuna gurin...,NTL
6,@mechanicmastr ne basha cheptundi ra ne batuku...,NEG
7,great bro single day 3 movies chusi reviews ch...,NTL
8,@puremass ante apudu online lo yavaru leru kab...,POS
9,what works and what does ’ to in mesam ! .,NTL


In [None]:
# Display the full, untruncated normalized comment stored at row position 8.
normalized_data['comment'].iloc[8]

'@puremass ante apudu online lo yavaru leru kabati reply ichav lekapote nv reply ivavu ga ana 😌 😌 '