# **Word2vec in NLP**

In [259]:
import numpy as np
import pandas as pd
import gensim
import os
import re
import emoji
import string
from nltk import sent_tokenize, word_tokenize
from gensim.utils import simple_preprocess
from spellchecker import SpellChecker
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec

In [260]:
df = pd.read_csv('twitter_training.csv', usecols=['Tweet_Content'])
df.head()

Unnamed: 0,Tweet_Content
0,im getting on borderlands and i will murder yo...
1,I am coming to the borders and I will kill you...
2,im getting on borderlands and i will kill you ...
3,im coming on borderlands and i will murder you...
4,im getting on borderlands 2 and i will murder ...


## **Preprocessing**

### **Lowercasing**

In [261]:
df['Tweet_Content'] = df['Tweet_Content'].str.lower()

In [262]:
df.head()

Unnamed: 0,Tweet_Content
0,im getting on borderlands and i will murder yo...
1,i am coming to the borders and i will kill you...
2,im getting on borderlands and i will kill you ...
3,im coming on borderlands and i will murder you...
4,im getting on borderlands 2 and i will murder ...


### **Removing html tags**

In [263]:
def remove_html_tags(text):
    if not isinstance(text, str):
        return text
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [264]:
df['Tweet_Content'] = df['Tweet_Content'].apply(remove_html_tags)

In [265]:
df.head()

Unnamed: 0,Tweet_Content
0,im getting on borderlands and i will murder yo...
1,i am coming to the borders and i will kill you...
2,im getting on borderlands and i will kill you ...
3,im coming on borderlands and i will murder you...
4,im getting on borderlands 2 and i will murder ...


### **Removing URL's**

In [266]:
def remove_urls(text):
    if not isinstance(text, str):
        return text
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    clean_text = re.sub(url_pattern, '', text)
    return clean_text

In [267]:
df['Tweet_Content'] = df['Tweet_Content'].apply(remove_urls)

In [268]:
df.head()

Unnamed: 0,Tweet_Content
0,im getting on borderlands and i will murder yo...
1,i am coming to the borders and i will kill you...
2,im getting on borderlands and i will kill you ...
3,im coming on borderlands and i will murder you...
4,im getting on borderlands 2 and i will murder ...


### **Removing Punctuations**

In [269]:
def remove_punctuation(text):
    if not isinstance(text, str):
        return text
    punctuation = string.punctuation
    clean_text = text.translate(str.maketrans('', '', punctuation))
    return clean_text

In [270]:
df['Tweet_Content'] = df['Tweet_Content'].apply(remove_punctuation)

In [271]:
df.head()

Unnamed: 0,Tweet_Content
0,im getting on borderlands and i will murder yo...
1,i am coming to the borders and i will kill you...
2,im getting on borderlands and i will kill you all
3,im coming on borderlands and i will murder you...
4,im getting on borderlands 2 and i will murder ...


### **ChatWord Treatment**

In [272]:
chat_words_mapping = {
    "lol": "laughing out loud",
    "brb": "be right back",
    "btw": "by the way",
    "afk": "away from keyboard",
    "rofl": "rolling on the floor laughing",
    "ttyl": "talk to you later",
    "np": "no problem",
    "thx": "thanks",
    "omg": "oh my god",
    "idk": "I don't know",
    "np": "no problem",
    "gg": "good game",
    "g2g": "got to go",
    "b4": "before",
    "cu": "see you",
    "yw": "you're welcome",
    "wtf": "what the f*ck",
    "imho": "in my humble opinion",
    "jk": "just kidding",
    "gf": "girlfriend",
    "bf": "boyfriend",
    "u": "you",
    "r": "are",
    "2": "to",
    "4": "for",
    "b": "be",
    "c": "see",
    "y": "why",
    "tho": "though",
    "smh": "shaking my head",
    "lolz": "laughing out loud",
    "h8": "hate",
    "luv": "love",
    "pls": "please",
    "sry": "sorry",
    "tbh": "to be honest",
    "omw": "on my way",
    "omw2syg": "on my way to see your girlfriend",
    "im" : "I am"
}

def expand_chat_words(text):
    if not isinstance(text, str):
        return text
    words = text.split()
    expanded_words = [chat_words_mapping.get(word.lower(), word) for word in words]
    return ' '.join(expanded_words)

In [273]:
df['Tweet_Content'] = df['Tweet_Content'].apply(expand_chat_words)

In [274]:
df.head()

Unnamed: 0,Tweet_Content
0,I am getting on borderlands and i will murder ...
1,i am coming to the borders and i will kill you...
2,I am getting on borderlands and i will kill yo...
3,I am coming on borderlands and i will murder y...
4,I am getting on borderlands to and i will murd...


### **Removing Stop Words**

In [275]:
def remove_stop_words(text):
    if not isinstance(text, str):
        return text
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    preprocessed_text = ' '.join(filtered_tokens)
    return preprocessed_text

In [276]:
df['Tweet_Content'] = df['Tweet_Content'].apply(remove_stop_words)

In [277]:
df.head()

Unnamed: 0,Tweet_Content
0,I getting borderlands murder
1,coming borders kill
2,I getting borderlands kill
3,I coming borderlands murder
4,I getting borderlands murder


### **Word Tokenization**

In [278]:
def word_tokenization(text):
    if not isinstance(text, str):
        return text
    return nltk.word_tokenize(text)

In [279]:
df['Tweet_Content'] = df['Tweet_Content'].apply(word_tokenization)

In [280]:
df.head()

Unnamed: 0,Tweet_Content
0,"[I, getting, borderlands, murder]"
1,"[coming, borders, kill]"
2,"[I, getting, borderlands, kill]"
3,"[I, coming, borderlands, murder]"
4,"[I, getting, borderlands, murder]"


## **Training Word2vec model**

In [281]:
df['Tweet_Content'] = df['Tweet_Content'].astype(str)

In [282]:
tweet_content = df['Tweet_Content'].tolist()

In [283]:
model = Word2Vec(sentences=tweet_content, vector_size=100, window=10, min_count=1, workers=4)

model.save("word2vec.model")

In [284]:
model = Word2Vec.load("word2vec.model")

vocab = model.wv.index_to_key

In [285]:
word = "murder"
if word in model.wv:
    similar_words = model.wv.most_similar(word)
    print("Words most similar to", word + ":", similar_words)
else:
    print("Word", word, "not found in the model vocabulary.")

Word murder not found in the model vocabulary.
