In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import os, os.path

path = os.getcwd() 

df = pd.read_csv( '/content/drive/MyDrive/MCI/data for armankade.csv', sep='\t')
df.head()

Unnamed: 0,line,text,average tag
0,1,با تشکر از شما آقای عباسی,POSITIVE
1,2,برنامه خندوانه به زندگی ما انرژی داده مرسی,POSITIVE
2,3,مواظب کلیدت باش تو برفا گم نشه ،شب خونه راهت ن...,NEGATIVE
3,4,مایه افتخار ما شیرازی هاست که آقا اومدن اونجا ...,POSITIVE
4,5,@_zahra_str_75 بیخیار سالاد نمیشه,NATURAL


In [6]:
!pip install demoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 KB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: demoji
Successfully installed demoji-1.1.0


In [7]:
import emoji
import re

def clean_text(text):
    text=  emoji.demojize(text)
    text= re.sub(r'(:[!_\-\w]+:)', '', text)
    text= re.sub(r'@\w+', '', text)
    text =  text.replace("#", " ")
    return text

df['emoji'] = df['text'].apply(lambda x : emoji.distinct_emoji_list(x))
df['text'] = df['text'].apply(lambda x : clean_text(x))

df.loc[:12,:]

Unnamed: 0,line,text,average tag,emoji
0,1,با تشکر از شما آقای عباسی,POSITIVE,[]
1,2,برنامه خندوانه به زندگی ما انرژی داده مرسی,POSITIVE,[]
2,3,مواظب کلیدت باش تو برفا گم نشه ،شب خونه راهت ن...,NEGATIVE,[]
3,4,مایه افتخار ما شیرازی هاست که آقا اومدن اونجا ...,POSITIVE,"[🌸, 🌼, 🌷]"
4,5,بیخیار سالاد نمیشه,NATURAL,[]
5,6,بعضی ها به خدا افاقه میکنند بعضی ها به کدخدا ب...,NATURAL,[]
6,7,عشششقم,POSITIVE,[😍]
7,8,عاشقه این اهنگمبی نظیره,POSITIVE,"[👌, ❤, 💪]"
8,9,على بابا باشين اوستى شهيدلره مدفندى/مسجدوه گلي...,BiMani,[]
9,10,مردی تو عمل پای حرف امام خمینی باش,NEGATIVE,[]


In [8]:
import demoji

def get_emojis_text(List):
   
    text_list = []
    for item in List:
        text_list.append(list(demoji.findall(item).values())[0])
    return text_list

df['emoji_text'] = df['emoji'].apply(lambda row:get_emojis_text(row))

In [None]:
df.head()

Unnamed: 0,line,text,average tag,emoji,emoji_text
0,1,با تشکر از شما آقای عباسی,POSITIVE,[],[]
1,2,برنامه خندوانه به زندگی ما انرژی داده مرسی,POSITIVE,[],[]
2,3,مواظب کلیدت باش تو برفا گم نشه ،شب خونه راهت ن...,NEGATIVE,[],[]
3,4,مایه افتخار ما شیرازی هاست که آقا اومدن اونجا ...,POSITIVE,"[🌼, 🌷, 🌸]","[blossom, tulip, cherry blossom]"
4,5,بیخیار سالاد نمیشه,NATURAL,[],[]


In [9]:
import itertools as it

filename='/content/drive/MyDrive/MCI/NRC-emotion-lexicon-wordlevel-persian-v0.92.txt'

positive_lexicon_dict = {}
negative_lexicon_dict = {}

with  open(filename) as fp:
    for x in fp:
        if(x.split('\t')[1] == 'negative'):
            negative_lexicon_dict[x.split('\t')[0]] = x.split('\t')[2].split('\n')[0]
    
        if(x.split('\t')[1] == 'positive'):
            positive_lexicon_dict[x.split('\t')[0]] = x.split('\t')[2].split('\n')[0]

In [10]:
def lexicon_count(text, lexicon_list):
    
    text_split = text.split(' ')
    count = 0
    
    for word in text_split:
        if(word in lexicon_list):
            count += int(lexicon_list[word])
    return count

In [11]:
df['positive_lexicon_count']= df['text'].apply(lambda row: lexicon_count(row, positive_lexicon_dict ))

In [13]:
df['negative_lexicon_count']= df['text'].apply(lambda row: lexicon_count(row, negative_lexicon_dict ))

In [14]:
import numpy as np

df['word_count'] = df['text'].apply(lambda row: np.log(len(row.split(' '))))

In [15]:
df['label'] = df['average tag'].apply(lambda row: 1 if row == 'POSITIVE' else(-1 if row == 'NEGATIVE' else 0))

In [16]:
from sklearn.model_selection import train_test_split

df = df[df['label'] != 0]

train , test = train_test_split(df,
                                test_size=0.2,
                                random_state=123,
                                stratify=df['average tag'].values)


In [17]:
import torch

def sigmoid(X, weight):
    z = torch.matmul(X, weight)
    return 1 / (1 + torch.exp(-z))

In [26]:
def loss_fn(h, y):
    return (-y * torch.log(h) - (1 - y) * torch.log(1 - h)).mean()

In [28]:
import torch.optim as optim

num_iter = 100

X=  torch.tensor(train.loc[:,['positive_lexicon_count','negative_lexicon_count','word_count']].values,dtype=torch.float64)
intercept = torch.ones((X.shape[0], 1),dtype=torch.float64) 
X = torch.column_stack((intercept, X))
Y= torch.tensor(train.loc[:,'label'].values,dtype=torch.float64)

W = torch.zeros(X.shape[1],dtype=torch.float64,requires_grad=True)
learning_rate = 0.01
optimizer = optim.SGD([W], lr=learning_rate)
n_epochs = 1000

def training_loop(n_epochs, optimizer, W, X, Y):
  for i in range(n_epochs):
      if W.grad is not None:
        W.grad.zero_()

      output = sigmoid(X, W)
      loss = loss_fn(output,Y)
      loss.backward()
      optimizer.step()
  return W

W = training_loop(n_epochs, optimizer, W, X, Y)    

In [20]:
def predict(W,x):
    return sigmoid(x, W) > 0.5

In [23]:
from sklearn.metrics import accuracy_score

x= torch.tensor(test.loc[:,['positive_lexicon_count','negative_lexicon_count','word_count']].values
                ,dtype=torch.float64)

intercept = torch.ones((x.shape[0], 1),dtype=torch.float64) 
x = torch.column_stack((intercept, x))
y= torch.tensor(test.loc[:,'label'].values,dtype=torch.float64)

print("Accuracy of prediction on test set : ", accuracy_score(predict(W,x),y))


Accuracy of prediction on test set :  0.09361135758651287
