In [1]:
import gensim
import pandas as pd
import os
import re
import nltk
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')



## Labelled Words

In [2]:
rootdir = "data"
positive_words = []
negative_words = []
neutral_words = []

for directories, subdirs, files in os.walk(rootdir):
    if (os.path.split(directories)[1] == '1' or os.path.split(directories)[1] == '2' or os.path.split(directories)[1] == '3' or os.path.split(directories)[1] == '4'):
        for filename in files:
            if (filename == 'positive.txt'):
                with open(os.path.join(directories, filename)) as f:
                    for line in f:
                        positive_words.append(line.strip())
            if (filename == 'negative.txt'):
                with open(os.path.join(directories, filename)) as f:
                    for line in f:
                        negative_words.append(line.strip())
            if (filename == 'neutral.txt'):
                with open(os.path.join(directories, filename), encoding='latin-1') as f:
                    for line in f:
                        neutral_words.append(line.strip())

## Word2Vec

In [3]:
df = pd.read_csv('data/posts_data_gab.csv')
data = df['Body']
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

In [4]:
df.head()

Unnamed: 0,Title,Body,Submitted
0,18294,there’s this guy who’s been making too much ma...,"December 9, 2018 2:44:26 AM +08"
1,18293,You call yourself my best friend pero you cant...,"December 9, 2018 2:44:05 AM +08"
2,18292,#Daddy Jett 🙌,"December 9, 2018 2:43:23 AM +08"
3,18291,https://www.facebook.com/braveyawtz/videos/174...,"December 9, 2018 2:16:15 AM +08"
4,18290,I saw this girl last thursday. She was hanging...,"December 9, 2018 2:15:33 AM +08"


In [5]:
# Stop words 
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))
fil_stop = []
with open('assets/stop_words_ph.txt') as f:
    for line in f:
        fil_stop.append(line.rstrip('\n')) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Carlo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
nltk.download('wordnet')

def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Carlo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
text_data = []
counter = 0
index = 0

for line in data:
    tokens = gensim.utils.simple_preprocess(line)
    tokens = [re.sub('\#ADMUFreedomWall\ \d+', '', sent) for sent in tokens]
    tokens = [re.sub('\#ADMUFreedomWall\d+', '', sent) for sent in tokens]
    tokens = [word for word in tokens if word not in fil_stop]
    tokens = [word for word in tokens if word not in en_stop]
    tokens = [word for word in tokens if len(word) > 2]
    tokens = [get_lemma(token) for token in tokens]
    
    text_data.append(tokens)
    counter += len(tokens)
    index+=1
    
print('Words in text data: ', counter)
numbers_set = set(i for j in text_data for i in j)
print('Unique words:', len(numbers_set))
print()

for tokens in text_data:
    print(tokens)
    print()

Words in text data:  24608
Unique words: 5630

['guy', 'making', 'much', 'malaswa', 'dagger', 'stare', 'even', 'never', 'do', 'anything', 'wrong', 'way', 'look', 'like', 'want', 'either', 'beat', 'physically', 'even', 'sexually', 'old', 'act', 'irrational', 'ateneo', 'pls', 'help', 'make', 'school', 'better', 'place', 'rather', 'bring', 'malaswa', 'playground']

['call', 'best', 'friend', 'cant', 'even', 'give', 'min', 'time', 'need', 'someone', 'talk', 'tapos', 'tangina', 'say', 'stuff', 'make', 'feel', 'bad', 'get', 'head', 'fucking', 'ass', 'stop', 'thinking', 'everyone', 'want', 'make']

['daddy', 'jett']

['http', 'www', 'facebook', 'com', 'braveyawtz', 'video', 'crazy', 'realize', 'much', 'attention', 'war', 'drug', 'die', 'seeing', 'video', 'reminiscent', 'drug', 'cartel', 'would', 'summarily', 'execute', 'people', 'street', 'people', 'dying', 'left', 'right', 'notion', 'label', 'death', 'police', 'self', 'defence', 'victim', 'fight', 'back', 'leaving', 'one', 'safe', 'kind', 'c

['guy', 'looking', 'team', 'ccsgo', 'sali', 'http', 'www', 'facebook', 'com', 'com', 'post', 'notif_id', 'notif_t', 'notify_me_page', 'ref', 'notif', 'see', 'translation']

['know', 'hell', 'thought', 'dash', 'line', 'non', 'insane', 'person', 'realize', 'dumb', 'really', 'glad', 'typical', 'anakbayan', 'type', 'group', 'focusing', 'nowadays', 'rather', 'intervention', 'thousand', 'kilometer', 'away', 'get', 'wrong', 'important', 'things', 'like', 'iraq', 'wrong', 'pretty', 'sure', 'country', 'aim', 'say', 'entire', 'sea', 'exclusive', 'property', 'include', 'area', 'mile', 'away', 'palawan', 'important', 'sure', 'type', 'people', 'post', 'china', 'duterte', 'triumvirate', 'regime', 'poster', 'graffiti', 'thinking', 'jinping', 'secretly', 'team', 'trump', 'flat', 'earth', 'tier', 'belief', 'atleast', 'finally', 'focusing', 'joke', 'map', 'seriously', 'everyone', 'like', 'straight', 'imperialism', 'happening', 'continent', 'happening', 'dozen', 'mile', 'away', 'palawan', 'okay', 'millio

['going', 'ateneo', 'mistake', 'majors', 'easy', 'core', 'subject', 'hard']

['impeachment', 'sereno', 'hope', 'duterte', 'would', 'declare', 'martial', 'law', 'next', 'week', 'final', 'would', 'postpone', 'hehe']

['sometimes', 'wish', 'thanos', 'snap', 'certain', 'head', 'state']

['people', 'need', 'stop', 'saying', 'poor', 'lazy', 'need', 'work', 'many', 'hours', 'successful', 'life', 'case', 'people', 'work', 'job', 'without', 'weekend', 'filthy', 'rich', 'musician', 'industry', 'years', 'rich', 'popular', 'one', 'simply', 'go', 'viral', 'etc', 'unfortunately', 'reality', 'farther', 'world', 'owe', 'success', 'riches', 'people', 'simply', 'work', 'hard', 'people', 'need', 'realize', 'need', 'creative', 'wisdom', 'know', 'working', 'hard', 'spending', 'time', 'believe', 'practice', 'make', 'perfect', 'success', 'need', 'lot', 'simple', 'hard', 'work']

['president', 'renege', 'promise', 'end', 'illegal', 'two', 'week', 'ago', 'take', 'social', 'medium', 'ridicule', 'sign', 'today',

In [8]:
model = gensim.models.Word2Vec(text_data, size=50, window=5, min_count=5, workers=4)
model.train(text_data, total_examples=len(text_data) , epochs=100)

# For words that dont occur as much
model_sparse = gensim.models.Word2Vec(text_data, size=50, window=5, min_count=1, workers=4)
model_sparse.train(text_data, total_examples=len(text_data) , epochs=100)

(2307132, 2460800)

In [9]:
word = 'crush'
print('Word:', word)
model.wv.most_similar(positive=word)

Word: crush


[('kailangan', 0.6317132711410522),
 ('landi', 0.6247240304946899),
 ('sakin', 0.5961827039718628),
 ('kita', 0.5806883573532104),
 ('gago', 0.5607095956802368),
 ('pota', 0.5521352887153625),
 ('gusto', 0.5502328872680664),
 ('dating', 0.5408319234848022),
 ('fit', 0.5347322225570679),
 ('nung', 0.5287871956825256)]

## TF-IDF

In [10]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data)
feature_names = vectorizer.get_feature_names()

In [11]:
tf_idf_list = []
for i in range(len(data)):
    sentence_dict = {}
    feature_index = tfidf_matrix[i,:].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])
    for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
        sentence_dict[w] = s
    tf_idf_list.append(sentence_dict)

## Sentiment Analysis

In [12]:
def get_sentiment(sentence):
    tokens = sentence.split(' ')
    sum_score = 0
    relevant_words = 0
    
    for token in tokens:
        if token in positive_words:
            print("Word: " + token + ", score: 1")
            sum_score += 1
            relevant_words += 1
        elif token in negative_words:
            print("Word: " + token + ", score: -1")
            sum_score -= 1
            relevant_words += 1
        elif token in neutral_words:
            print("Word: " + token)
            try:
                neighbors = model.wv.most_similar(positive=token, topn=100)
            except:
                neighbors = model_sparse.wv.most_similar(positive=token,topn=100)
            for word in neighbors:
                if word[0] in positive_words:
                    print("Nearest labelled word: " + word[0] + ", score: " + str(word[1]))
                    sum_score += word[1]
                    relevant_words += 1
                    break
                elif word[0] in negative_words:
                    print("Nearest labelled word: " + word[0] + ", score: -" + str(word[1]))
                    sum_score -= word[1]
                    relevant_words += 1
                    break
        else:
            print('"' + token + '" not in model')
#     val = sum_score/len(tokens)
    val = sum_score/relevant_words
    print(val)
#     print("%.2f" % val)


# for tokens in text_data:
#     print(tokens)
#     sum_score = 0
#     if not tokens:
#         continue
#     for token in tokens:
#         if token in positive_words:
#             print("Word: " + token + ", score: 1")
#             sum_score += 1
#         elif token in negative_words:
#             print("Word: " + token + ", score: -1")
#             sum_score -= 1
#         elif token in neutral_words:
#             print("Word: " + token)
#             try:
#                 neighbors = model.wv.most_similar(positive=token, topn=100)
#             except:
#                 neighbors = model_sparse.wv.most_similar(positive=token,topn=100)
#             for word in neighbors:
#                 if word[0] in positive_words:
#                     print("Nearest labelled word: " + word[0] + ", score: " + str(word[1]))
#                     sum_score += word[1]
#                     break
#                 elif word[0] in negative_words:
#                     print("Nearest labelled word: " + word[0] + ", score: -" + str(word[1]))
#                     sum_score -= word[1]
#                     break
#     print("\nAggregate score:", sum_score/len(tokens))
#     print()

In [15]:
sentence = "How beautiful it is to find someone who asks for nothing but for you company"
get_sentiment(sentence)

"How" not in model
Word: beautiful, score: 1
"it" not in model
"is" not in model
"to" not in model
Word: find
Nearest labelled word: beautiful, score: 0.5448257923126221
Word: someone
Nearest labelled word: trust, score: 0.44044801592826843
"who" not in model
"asks" not in model
"for" not in model
Word: nothing
Nearest labelled word: worse, score: -0.38945525884628296
"but" not in model
"for" not in model
"you" not in model
Word: company
Nearest labelled word: enjoy, score: 0.46590885519981384
0.4123454809188843


https://stackoverflow.com/questions/47812930/scientific-explanation-why-word2vec-models-perform-poorly-on-small-data

helpful reference