In [1]:
import torch
import torch.nn.functional as F
import torchtext
import time
import random
import pandas as pd
import re
import tqdm
import string
import torch.nn as nn
import torch.optim as optim
torch.backends.cudnn.deterministic = True
print("\ntorch    :", torch.__version__)
print("torchtext:", torchtext.__version__)


torch    : 2.1.1+cpu
torchtext: 0.5.0


In [2]:
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

VOCABULARY_SIZE = 20000
LEARNING_RATE = 0.0001
BATCH_SIZE = 150
NUM_EPOCHS = 20
DEVICE = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_CLASSES = 3

In [3]:
df = pd.read_excel('C:/Users/Asus/Desktop/New folder/run 1/train2.1.xlsx')
df.head()

Unnamed: 0,review_description,rating
0,ممتاز لكن مفروض زام مطاعم باضاف خدم الك نت,1
1,افاشل تطبيق تاخير وقت والاكل ناشف صراح اول مره...,-1
2,,-1
3,برنامج مهكر,-1
4,يج لكنرع,1


In [4]:
df2 = pd.read_csv('C:/Users/Asus/Desktop/New folder/run 1/test2.01.csv')
df2.head()

Unnamed: 0,ID,review_description
0,1,اهن خدم عملاء محادث مباشر ما قصرو اله يوفق يعط...
1,2,ممتاز جدا لكن اتم ان تكون مسابق والجواز طلب سع...
2,3,محمل يقول تم ايقاف حط عشان تسون خطاء
3,4,شغل طيب
4,5,ماجرب


In [5]:
ALEF_MADDA = u'\u0622'
ALEF_HAMZA_ABOVE = u'\u0623'
ALEF_HAMZA_BELOW = u'\u0625'
HAMZA_ABOVE = u'\u0654'
HAMZA_BELOW = u'\u0655'
ALEF = u'\u0627'
ALEFAT_PAT = re.compile(u"["+u"".join([ALEF_MADDA, ALEF_HAMZA_ABOVE,
                                       ALEF_HAMZA_BELOW, HAMZA_ABOVE,
                                       HAMZA_BELOW])+u"]")

ALEF_TANWEEN = u"\u0627\u064B"
TASHKEEL = u"[\u064B-\u0652]"
ALEFAT = u"["+u"".join([ALEF_MADDA, ALEF_HAMZA_ABOVE,
                                       ALEF_HAMZA_BELOW, HAMZA_ABOVE,
                                       HAMZA_BELOW])+u"]"

WAW_HAMZA = u'\u0624'
YEH_HAMZA = u'\u0626'
HAMZA = u'\u0621'

HAMZAT_PAT = re.compile(u"["+u"".join([WAW_HAMZA, YEH_HAMZA])+u"]")

TEH_MARBUTA = u'\u0629'
HEH = u'\u0647'
ALEF_MAKSURA = u'\u0649'
YEH = u'\u064a'


def tokenize_text(text):
    from nltk import word_tokenize
    tokens = word_tokenize(text)
    return tokens


def remove_stop_words(tokens, mode=0):
    if mode == 0:
        from nltk.corpus import stopwords
        stop_words = stopwords.words('arabic')
        stop_words.append('ال')
        filtered_text = [word for word in tokens if word not in stop_words]
        return filtered_text
    elif mode == 2:
        from gensim.parsing.preprocessing import STOPWORDS
        STOPWORDS = STOPWORDS.union(set(['ال']))
        filtered_tokens = [token for token in tokens if token not in STOPWORDS]
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text.split()
    

def snowball_stemmer(tokens):
    from snowballstemmer import stemmer
    arabic_stemmer = stemmer('arabic')
    stemmed_words = arabic_stemmer.stemWords(tokens)
    return stemmed_words


def isris_stemmer(tokens):
    from nltk.stem.isri import ISRIStemmer
    stemmer = ISRIStemmer()
    stemmed_list = [stemmer.stem(word) for word in tokens]
    return stemmed_list


def tashaphyne_stemmer(text):
    from tashaphyne.stemming import ArabicLightStemmer
    stem = ArabicLightStemmer()
    return stem.light_stem(text)


def farasa_stemmer(tokens):
    from farasa.stemmer import FarasaStemmer
    stemmer = FarasaStemmer()
    stemmed_list = [stemmer.stem(word) for word in tokens]
    return stemmed_list


def farasa_segmenter(text):
    from farasa.segmentation import FarasaSegmenter
    segmenter = FarasaSegmenter()
    segments = segmenter.segment(text)
    # print(segments)
    return re.split(r'\s\w{1,2}\+|\+\w{1,2}\s|\s|\+', segments)
    # return re.split(r'\+|\s', segments)
    

def remove_non_arabic(text):
    return ' '.join(re.sub(u"[^\u0621-\u063A\u0640-\u0652 ]", " ", text,  flags=re.UNICODE).split())

def remove_punctuation(text):
    
    cleaned_text = ''.join(char for char in text if char not in string.punctuation)
    return cleaned_text


def normalize_text(text):
    #text = text.strip() #Remove end spaces
    text = re.sub(u'%s' % ALEFAT, ALEF, text)
    text = re.sub(u'%s' % ALEF_TANWEEN, "", text)
    text = re.sub(u'%s' % TASHKEEL, "", text)
    text = re.sub(u'[%s]' % ALEF_MAKSURA, YEH, text)
    text = re.sub(u'[%s]' % TEH_MARBUTA, HEH, text)
    text = re.sub(u'[%s]' % WAW_HAMZA, HAMZA, text)
    text = re.sub(u'[%s]' % YEH_HAMZA, HAMZA, text)
    return text


def preprocess(text, token_mode=0, stem_mode=0):
    # print("Original Text:", text)
    
    #removed_non_arabic = remove_non_arabic(text)
    
    
    #normalized_text = normalize_text(removed_non_arabic)
    remove_arabic=remove_punctuation(text)
    
    if token_mode == 0:        
        stemmed_tokens = farasa_segmenter(remove_arabic)
    
    elif token_mode == 1:
        tokens = tokenize_text(remove_arabic)
        if stem_mode == 0:
            stemmed_tokens = snowball_stemmer(tokens)
            # print("Snowball Stemmed:", stemmed_tokens)
        
        elif stem_mode == 1:
            stemmed_tokens = isris_stemmer(tokens)
        
        elif stem_mode == 2:
            stemmed_tokens = farasa_stemmer(tokens)

    
    removed_stop_words = remove_stop_words(stemmed_tokens, mode=0)
    return removed_stop_words


In [6]:
df['processed_text'] = df['review_description']
column_to_move = 'processed_text'
first_column = df.pop(column_to_move)
df.insert(0, column_to_move, first_column)


In [7]:

null_indices_train_texts = df2[df2['review_description'].isnull()].index
print("Indices of nulls in train_texts:", len(null_indices_train_texts))
df2['review_description'].fillna("", inplace=True)
print(len(df2))

# Remove rows with null values in 'review_description' column
df = df.dropna(subset=['review_description'])

df.drop('review_description', axis=1, inplace=True)



df.to_csv('C:/Users/Asus/Desktop/New folder (4)/processed_data.csv', index=False)

df.head()

df2['processed_text'] = df2['review_description']
df2.drop('review_description', axis=1, inplace=True)
df2.drop('ID', axis=1, inplace=True)



df2.to_csv('C:/Users/Asus/Desktop/New folder (4)/processed_data2.csv', index=False)

df2.head()

Indices of nulls in train_texts: 17
1000


Unnamed: 0,processed_text
0,اهن خدم عملاء محادث مباشر ما قصرو اله يوفق يعط...
1,ممتاز جدا لكن اتم ان تكون مسابق والجواز طلب سع...
2,محمل يقول تم ايقاف حط عشان تسون خطاء
3,شغل طيب
4,ماجرب


In [8]:

TEXT = torchtext.data.Field(
    tokenize=lambda text: text.split(),  
    include_lengths=True
)
LABEL = torchtext.data.LabelField()


fields = [('processed_text', TEXT), ('rating', LABEL)]


dataset = torchtext.data.TabularDataset(
    path='C:/Users/Asus/Desktop/New folder (4)/processed_data.csv', format='csv',
    skip_header=True, fields=fields
)


dataset

<torchtext.data.dataset.TabularDataset at 0x1512b91dd90>

In [9]:

fields2 = [('processed_text', TEXT)]


dataset2 = torchtext.data.TabularDataset(
    path='C:/Users/Asus/Desktop/New folder (4)/processed_data2.csv', format='csv',
    skip_header=True, fields=fields2
)

print(torchtext.data)
dataset2

<module 'torchtext.data' from 'C:\\Users\\Asus\\anaconda3\\lib\\site-packages\\torchtext\\data\\__init__.py'>


<torchtext.data.dataset.TabularDataset at 0x1512b91d4c0>

In [10]:
#Split Dataset into Train/Validation/Test

train_data= dataset
test_data = dataset2

print(f'Num Train: {len(train_data)}')
print(f'Num Test: {len(test_data)}')

Num Train: 36834
Num Test: 1000


In [11]:
train_data, valid_data = train_data.split(
    split_ratio=[0.85, 0.15],
    random_state=random.seed(RANDOM_SEED))

print(f'Num Train: {len(train_data)}')
print(f'Num Validation: {len(valid_data)}')

Num Train: 31309
Num Validation: 5525


In [12]:
print(vars(train_data.examples[0]))

{'processed_text': ['ممتاز', 'ممتاز', 'ممتاز'], 'rating': '0'}


In [13]:
print(vars(test_data.examples[0]))

{'processed_text': ['اهن', 'خدم', 'عملاء', 'محادث', 'مباشر', 'ما', 'قصرو', 'اله', 'يوفق', 'يعط', 'الف', 'عاف']}


In [14]:
#Build Vocabulary
TEXT.build_vocab(train_data, max_size=VOCABULARY_SIZE)
LABEL.build_vocab(train_data)

print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')

Vocabulary size: 17757
Number of classes: 3


In [15]:
print(TEXT.vocab.freqs.most_common(20))

[('ممتاز', 12604), ('تطبيق', 5665), ('جدا', 5063), ('طلب', 4345), ('لكن', 3944), ('فاشل', 3670), ('توصيل', 2460), ('ما', 2437), ('لا', 2319), ('مطاعم', 2248), ('انا', 1863), ('برنامج', 1790), ('خدم', 1763), ('رنامج', 1761), ('جميل', 1747), ('مش', 1718), ('ان', 1515), ('مطعم', 1386), ('مره', 1328), ('طلبا', 1264)]


In [16]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'ممتاز', 'تطبيق', 'جدا', 'طلب', 'لكن', 'فاشل', 'توصيل', 'ما']


In [17]:
print(LABEL.vocab.stoi)

defaultdict(None, {'1': 0, '-1': 1, '0': 2})


In [18]:
#Define Data Loaders
train_loader, valid_loader, test_loader = \
    torchtext.data.BucketIterator.splits(
        (train_data, valid_data, test_data), 
        batch_size=BATCH_SIZE,
        sort_within_batch=True, 
             sort_key=lambda x: len(x.processed_text),
        shuffle=False,
        device=DEVICE
)

In [19]:
# i = 1
# for batch in train_loader:
#     # Get the padded sequences from the batch
#     padded_sequences = batch.processed_text
    
#     # Extract the tensor from the tuple (assuming it's at index 0)
#     padded_sequences = padded_sequences[0]
    
#     # Transpose the tensor to iterate through sentences
#     padded_sequences = padded_sequences.transpose(0, 1)  # Assuming sentences are along dimension 1
    
#     # Convert numeric tokens to words using the vocabulary from TEXT
#     word_sequences = [
#         [TEXT.vocab.itos[token] for token in sentence]
#         for sentence in padded_sequences
#     ]
    
#     # Get the actual lengths of each sequence in the batch
#     sequence_lengths = [len(seq) for seq in word_sequences]
    
#     # Get targets from the batch (assuming they are named 'targets')
#     targets = batch.rating  # Replace 'targets' with the actual name of your target attribute
#     #print(targets)
#     # Print the lengths of sentences in the current batch
#     #print("Lengths of sentences in the current batch:", sequence_lengths)
    
#     # Print all sentences and their targets in the current batch
#     print("Sentences in Batch {}:".format(i))
#     for idx, (sentence, target) in enumerate(zip(word_sequences, targets)):
#         print(f"Sentence {idx + 1}:", sentence)
#         print(f"Target {idx + 1}:", target)
    
#     i += 1
#     print("Batch", i)


In [20]:
# i = 1
# for batch in train_loader:
#     # Get the padded sequences from the batch
#     padded_sequences = batch.processed_text
   
#     # Extract the tensor from the tuple (assuming it's at index 0)
#     padded_sequences = padded_sequences[0]
#     # Transpose the tensor to iterate through sentences
#     padded_sequences = padded_sequences.transpose(0, 1)  # Assuming sentences are along dimension 1
#     # Get the actual lengths of each sequence in the batch
#     sequence_lengths = [len(seq) for seq in padded_sequences]
#     print(padded_sequences)
#     print(sequence_lengths[0])
    
#     i += 1
#     print("Batch", i)

In [21]:
# i = 1
# for batch in test_loader:
#     # Get the padded sequences from the batch
#     padded_sequences = batch.processed_text
    
#     # Extract the tensor from the tuple (assuming it's at index 0)
#     padded_sequences = padded_sequences[0]
#     # Transpose the tensor to iterate through sentences
#     padded_sequences = padded_sequences.transpose(0, 1)  # Assuming sentences are along dimension 1
#     # Get the actual lengths of each sequence in the batch
#     sequence_lengths = [len(seq) for seq in padded_sequences]
#     print(padded_sequences)
#     print(sequence_lengths[0])
    
#     i += 1
#     print("Batch", i)

In [22]:
#esting the iterators (note that the number of rows depends on the longest document in the respective batch):
print('Train')
for batch in train_loader:
    print(f'Text matrix size: {batch.processed_text[0].size()}')
    print(f'Target vector size: {batch.rating.size()}')
    break
    
print('\nValid:')
for batch in valid_loader:
    print(f'Text matrix size: {batch.processed_text[0].size()}')
    print(f'Target vector size: {batch.rating.size()}')
    break
    
print('\nTest:')
for batch in test_loader:
    print(f'Text matrix size: {batch.processed_text[0].size()}')
   
    break

Train
Text matrix size: torch.Size([1, 150])
Target vector size: torch.Size([150])

Valid:
Text matrix size: torch.Size([1, 150])
Target vector size: torch.Size([150])

Test:
Text matrix size: torch.Size([2, 150])


In [23]:
class LSTMModel(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim)        
        self.fc = torch.nn.Linear(hidden_dim, output_dim)
        

    def forward(self, text, text_length):
        embedded = self.embedding(text)
        
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, text_length, batch_first=True)
        packed_output, (hidden, cell) = self.rnn(packed)
        
        # Unpack the sequence (assuming batch_first=True)
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        
        # Take the last time-step's output
        last_output = output[:, -1, :]
        
        output = self.fc(last_output)
        return output


In [24]:
# Initialize the model, criterion, and optimizer
model = LSTMModel(len(TEXT.vocab), EMBEDDING_DIM, HIDDEN_DIM, NUM_CLASSES)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Move model to device
model.to(DEVICE)

LSTMModel(
  (embedding): Embedding(17757, 128)
  (rnn): LSTM(128, 256)
  (fc): Linear(in_features=256, out_features=3, bias=True)
)

In [25]:
def compute_accuracy(model, data_loader, device):

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for batch_idx, batch_data in enumerate(data_loader):

            # Get the padded sequences from the batch
            padded_sequences3 = batch_data.processed_text

            # Extract the tensor from the tuple (assuming it's at index 0)
            padded_sequences3 = padded_sequences3[0]

            # Transpose the tensor to iterate through sentences
            padded_sequences3 = padded_sequences3.transpose(0, 1)  # Assuming sentences are along dimension 1


            sequence_lengths3 = [len(seq) for seq in padded_sequences3]

            # Get targets from the batch (assuming they are named 'targets')
            targets3 = batch_data.rating  # Replace 'targets' with the actual name of your target attribute
            
            logits = model(padded_sequences3, sequence_lengths3)
            _, predicted_labels = torch.max(logits, 1)

            num_examples += targets3.size(0)

            correct_pred += (predicted_labels == targets3).sum()
    return correct_pred.float()/num_examples * 100

In [26]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model.train()
    i=1
    for batch_idx, batch_data in enumerate(train_loader):
        # Get the padded sequences from the batch
        padded_sequences2 = batch_data.processed_text

        # Extract the tensor from the tuple (assuming it's at index 0)
        padded_sequences2 = padded_sequences2[0]

        # Transpose the tensor to iterate through sentences
        padded_sequences2 = padded_sequences2.transpose(0, 1)  # Assuming sentences are along dimension 1


        sequence_lengths2 = [len(seq) for seq in padded_sequences2]

        # Get targets from the batch (assuming they are named 'targets')
        targets2 = batch_data.rating  # Replace 'targets' with the actual name of your target attribute
        #print(targets2)
        # Print the lengths of sentences in the current batch
        #print("Lengths of sentences in the current batch:", sequence_lengths2)

        i += 1
        #print("Batch", i)


        ### FORWARD AND BACK PROP
        logits = model(padded_sequences2,sequence_lengths2)
        #print(logits)
        loss = F.cross_entropy(logits, targets2)
        optimizer.zero_grad()
        
        loss.backward()
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        ### LOGGING
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                   f'Loss: {loss:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')
        
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
#print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 001/020 | Batch 000/209 | Loss: 1.0980
Epoch: 001/020 | Batch 050/209 | Loss: 0.9503
Epoch: 001/020 | Batch 100/209 | Loss: 0.9671
Epoch: 001/020 | Batch 150/209 | Loss: 0.9521
Epoch: 001/020 | Batch 200/209 | Loss: 1.0150
training accuracy: 64.39%
valid accuracy: 64.27%
Time elapsed: 0.68 min
Epoch: 002/020 | Batch 000/209 | Loss: 0.9314
Epoch: 002/020 | Batch 050/209 | Loss: 0.7552
Epoch: 002/020 | Batch 100/209 | Loss: 0.9224
Epoch: 002/020 | Batch 150/209 | Loss: 0.8637
Epoch: 002/020 | Batch 200/209 | Loss: 0.9083
training accuracy: 67.56%
valid accuracy: 67.71%
Time elapsed: 1.33 min
Epoch: 003/020 | Batch 000/209 | Loss: 0.9043
Epoch: 003/020 | Batch 050/209 | Loss: 0.7181
Epoch: 003/020 | Batch 100/209 | Loss: 0.8960
Epoch: 003/020 | Batch 150/209 | Loss: 0.8350
Epoch: 003/020 | Batch 200/209 | Loss: 0.8575
training accuracy: 69.12%
valid accuracy: 68.71%
Time elapsed: 2.09 min
Epoch: 004/020 | Batch 000/209 | Loss: 0.8805
Epoch: 004/020 | Batch 050/209 | Loss: 0.6851
Ep

In [27]:
def compute_accuracy2(model, data_loader, device):

    with torch.no_grad():

        correct_pred, num_examples = 0, 0
        predicted=[]
        for batch_idx, batch_data in enumerate(data_loader):
            # Get the padded sequences from the batch
            padded_sequences3 = batch_data.processed_text

            # Extract the tensor from the tuple (assuming it's at index 0)
            padded_sequences3 = padded_sequences3[0]

            # Transpose the tensor to iterate through sentences
            padded_sequences3 = padded_sequences3.transpose(0, 1)  # Assuming sentences are along dimension 1


            sequence_lengths3 = [len(seq) for seq in padded_sequences3]

            

            logits = model(padded_sequences3, sequence_lengths3)
            _, predicted_labels = torch.max(logits, 1)

            predicted+=predicted_labels.tolist()
            
    return predicted

In [28]:
predicted_out=compute_accuracy2(model, test_loader, DEVICE)

In [29]:
def replace_values(lst):
    for i in range(len(lst)):
        if lst[i] == 0:
            lst[i] = 1
        elif lst[i] == 1:
            lst[i] = -1
        elif lst[i] == 2:
            lst[i] = 0
    return lst


In [30]:
predicted_out2=replace_values(predicted_out)


In [31]:
import csv


# File name for the CSV file
file_name = 'C:/Users/Asus/Desktop/New folder (4)/predicted_ratings4.csv'

# Writing data to a CSV file
with open(file_name, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # Write header
    writer.writerow(['ID', 'rating'])

    # Write data rows
    for idx, rating in enumerate(predicted_out2, start=1):
        writer.writerow([idx, rating])

print(f"CSV file '{file_name}' has been created.")

CSV file 'C:/Users/Asus/Desktop/New folder (4)/predicted_ratings4.csv' has been created.
