# Baktash Ansari 

Advantages and disadvantages of each model are written at the end of the notebook.

In [10]:
import torch
import re
import pandas as pd 
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import math
from gensim.models import Word2Vec

In [11]:
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

In [12]:
def find_k_nearest_neighbors(word, embedding_dict, k):
  words_cosine_similarity = dict()
  for token in embedding_dict.keys():
    words_cosine_similarity[token] = cos(embedding_dict[word], embedding_dict[token]).item()
  words_cosine_similarity = dict(sorted(words_cosine_similarity.items(), key=lambda item: item[1]))
  return list(words_cosine_similarity.keys())[-k:][::-1]

def delete_hashtag_usernames(text):
  try:
    result = []
    for word in text.split():
      if word[0] not in ['@', '#']:
        result.append(word)
    return ' '.join(result)
  except:
    return ''

def delete_url(text):
  text = re.sub(r'http\S+', '', text)
  return text

def delete_trash(text) :
  text = re.sub(r'\u200c','',text)
  return text

In [13]:
word = 'زندگی'
k = 10

# 0. Data preprocessing

In [14]:
!pip install json-lines

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
import json_lines

In [16]:
# 1. extract all tweets from files and save them in memory base on each year.
# 2. remove urls, hashtags and usernames.

frame =  pd.read_csv("/content/mahsa_amini_data.csv")

frame.Text

result = []

for text in frame.Text :
  new_text = delete_hashtag_usernames(text)
  new_text = delete_url(new_text)
  new_text = delete_trash(new_text)
  result.append(new_text)

result 


['بنشین تا شود نقش فال ما نقش هم فردا شدن',
 'این گوزو رو کی گردن میگیره؟؟ دچار زوال عقل شده از بس پای منبر دستمال کشی کرده.',
 'برای ایران، برای مهسا.',
 'مرگ بر دیکتاتور',
 'نذاریم خونشون پایمال شه...',
 'مابهت افتخار میکنیم نبات باعث شدی کل دنیا مارو ببینه',
 'برای انسانای خوشگلمون',
 'فارغ از هر باوری متحد شویم.',
 'اینها عجب موجودات پستی هستن🥺🥺🥺الهی بگردم، من خودم باردارم و حتی توتظاهرات مسالمت امیز خارج ایران استرس داشتم ادم ها نا خود اگاه بهم ضربه بزنن،بمیرم برای دل اون زن که چه کشیده...مرگ بر دیکتاتور',
 'کصخلا چرا ۴ تاوفحشش نمیدن؟',
 'و از روی پول ملی....',
 'برای آزادی ایران تا آخرین قطره خونم میجنگم 🇮🇷✌🏻 (7) ',
 'داریم چهل تا میشیم',
 'سی و سه',
 'سلام بچه ها اگر من رو میبینید حتما هشتگ بزنید.',
 'امروز برای کاری رفتم سراغ وسایل قدیمی و اینارو دیدم : برای تمام روزهایی که رفتم دکه تا کارت دیال آپ بخرم و یه چیزایی تو وبلاگم بنویسم و 88 زدید آرشیوم و حذف کردید برای اون روزها خیلی بغض دارم. ',
 'فالو کنید بک بدم',
 'زن،زندگی،آزادی.',
 'دستاشو مشت کرده✊',
 'بچه ها را تا سرنگونی ر

# 1. One hot encoding

In [17]:
# 1. find one hot encoding of each word for each year
# 2. find 10 nearest words from "ولنتاین"


new_result = []
for text in result :
  new_result.append(text.split(" "))

category = set()

for li in new_result :
  for text in li :
    category.add(text)
print(len(category))

category = list(category)

oneHot = np.zeros((len(category),len(category)))

oneHotdic = {}

for i in range(len(category)) :
      oneHot[i,i] = 1
      oneHotdic[category[i]] = torch.tensor(oneHot[i]).float()



find_k_nearest_neighbors(word,oneHotdic,k)

32116


['زندگی',
 'کوچیکی',
 'آخر',
 'خانوادشون',
 'ابنجا',
 'بجای',
 'کرد:)..#OpIran',
 'ترینن',
 'حرامزادهگان',
 'ادامه']

# 2. TF-IDF

In [18]:
# 1. find the TF-IDF of all tweets.
# 2. choose one tweets randomly.
# 3. find 10 nearest tweets from chosen tweet.


tf = np.zeros((len(new_result),len(category)))


count = {}

all = 0

for tweet in new_result :
  for word in tweet :
    all += 1
    count[word] = 0

for tweet in new_result :
  for word in tweet :
    count[word] += 1

# saving index of each word in category for reducing Time order
indexWord = {}
for index,word in enumerate(category) :
  indexWord[word] = index 

# fill numpy array :
for i,tweet in enumerate(new_result) :
  
  for word in tweet :
    
    cnt = 0
    for new_word in tweet :
      if new_word == word :
        cnt +=1
    tf[i,indexWord[word]] = float(cnt)/len(tweet) * math.log(float(all)/count[word])
  

resultDict = {}

for index,tweet in enumerate(result) :
  resultDict[tweet] = torch.tensor(tf[index])
resultDict

find_k_nearest_neighbors(result[11],resultDict,k)

['برای آزادی ایران تا آخرین قطره خونم میجنگم 🇮🇷✌🏻 (7) ',
 'تا آخرین قطره اینترنت',
 'تا آخرین قطره ی خون',
 'برای مردمم میجنگم',
 'برای آزادی ایران',
 'من امشب خونم رو برای وطنم خواهم داد هر قطره خون من فدای آزادی میهنم',
 'برای آزادی ',
 'برای ایران ',
 'برای ایران برای آزادی',
 'تا آزادی']

# 3. Word2Vec

In [19]:
# 1. train a word2vec model base on all tweets for each year.
# 2. find 10 nearest words from "ولنتاین"
#category

model = Word2Vec(sentences = new_result)

model.wv.most_similar("آزادی")

[('ازادی', 0.9874179363250732),
 ('زن', 0.9833468794822693),
 ('زندگی،', 0.9817682504653931),
 ('امید', 0.9769082069396973),
 ('ایران', 0.9757297039031982),
 ('خواهرم', 0.9737299680709839),
 ('زندگی', 0.9733548164367676),
 ('زن،', 0.9687106609344482),
 ('،زندگی', 0.96824711561203),
 ('برای', 0.9667270183563232)]

# 4. Contextualized embedding

In [None]:
# 1. fine tune a bert model base on all tweets for each year.
# 2. find 10 nearest words from "آزادی"

# Pros and Cons :

## One-hot vector :

advantages:

 One-hot encoding easily processed by machine learning models because it is easy to implement and have simple structure.


 One-hot encoding preserves the original meaning of categorical features and makes them more interpretable for humans.

disadvantages:

One-hot encoding can result in a significant increase in the number of features, which can lead to computational challenges and overfitting.

If embedding of one-hot vector done in a large data set, result matrix can be very large and use lot of memory.


## TF-IDF:

advantages:

can help capture the semantic meaning of words.

TF-IDF reduce the number of common words such as "the" and "a" ,which can help reduce their impact on the model.


TF-IDF is widely used in information retrieval systems such as search engines to rank documents based on their relevance to a query.

disadvantages:

TF-IDF does not capture the order or context of words in a document, which can lead to reduce undrestanding of context.

TF-IDF operates at the word-level and does not capture the meaning of phrases or sentences,


## Word2Vec:

advantages:

word2vec can capture the relationships between words, such as synonyms, antonyms, which can improve the performance of many NLP tasks.

Word2Vec generates vector representations of words, which can be used as features in machine learning models for text classification, sentiment analysis, and other tasks.

disadvantages:

Training a Word2Vec model can be computationally expensive, especially with large datasets and high-dimensional vector representations.

Word2Vec operates at the word-level and does not capture the meaning of phrases or sentences, which can limit its effectiveness in tasks such as sentiment analysis or text generation.

## Contextualized embeddings:


advantages:

Contextualized embeddings are designed to capture the meaning of a word based on the context in which it appears, which can improve the performance of many natural language processing tasks.

Contextualized embeddings have been shown to improve the performance of many natural language processing tasks, such as sentiment analysis, named entity recognition, and machine translation.

disadvantages:

Training and using contextualized embeddings can be computationally expensive, especially with large datasets and high-dimensional vector representations.

As contextualized embedding designed for complex relations between words in context,they hard to examine and interpret.