In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
from nltk.tokenize import word_tokenize

In [2]:
# Load both splits and discard every row that has at least one missing value.
train_ds = pd.read_csv("train.csv").dropna(axis=0, how="any")
test_ds = pd.read_csv("test.csv").dropna(axis=0, how="any")

In [3]:
# Preview the first rows of the training split after the missing-value rows were dropped.
train_ds.head()

Unnamed: 0,id,text,emotions
0,27383,i feel awful about it too because it s my job ...,sadness
1,110083,im alone i feel awful,sadness
2,140764,ive probably mentioned this before but i reall...,joy
3,100071,i was feeling a little low few days back,sadness
4,2837,i beleive that i am much more sensitive to oth...,love


In [4]:
# Raw text column of the training split.
train_words = train_ds.loc[:, "text"]

In [5]:
# Raw text column of the test split.
test_words = test_ds.loc[:, "text"]

In [6]:
# Vocabulary of each split: lower-case the text, tokenize on runs of word
# characters, flatten the per-row token lists and de-duplicate.
# The pattern is a raw string so the backslash is not parsed as a (invalid)
# string escape, and explode() flattens in linear time, whereas summing the
# lists re-copies the growing accumulator on every row.
train_uniquewords = set(train_words.str.lower().str.findall(r"\w+").explode().dropna())
test_uniquewords = set(test_words.str.lower().str.findall(r"\w+").explode().dropna())

In [7]:
# Words that occur in the test split but never in the training split.
test_uniquewords.difference(train_uniquewords)

{'continued',
 'potential',
 'wellbeing',
 'medication',
 'married',
 'recipient',
 'remove',
 'sustains',
 'galleries',
 'jacob',
 'skating',
 'term',
 'phd',
 'calls',
 'award',
 'someday',
 'laughs',
 'yellow',
 'absurd',
 'observer',
 'disorders',
 'bennett',
 'stigmatized',
 'sceptical',
 'passage',
 'recipricated',
 'mountie',
 'streets',
 'depressing',
 'nearly',
 'insercurity',
 'edgar',
 'alt',
 'approval',
 'tips',
 'testosterone',
 'applications',
 'illustrator',
 'fresh',
 'loose',
 'exclude',
 'spice',
 'bitterness',
 'services',
 'clearboth',
 'allow',
 'prophetically',
 'randomly',
 'grumbled',
 'career',
 'prolific',
 'furry',
 'wrench',
 'encourage',
 'insincere',
 'drunk',
 'december',
 'discuss',
 'moves',
 'prayer',
 'fascinating',
 'acquisition',
 'exemptions',
 'readers',
 'niya',
 'filed',
 'pleasantly',
 'reconsider',
 'proverbsverse',
 'confidence',
 'brat',
 'remarks',
 'indecisive',
 'kinabuhi',
 'ear',
 'completism',
 'edward',
 'innermost',
 'pew',
 'track'

In [8]:
# Words that occur in the training split but never in the test split.
train_uniquewords.difference(test_uniquewords)

{'isolating',
 'hairdresser',
 'cost',
 'distant',
 'fault',
 'bodyworks',
 'beneath',
 'resurrection',
 'grub',
 'dead',
 'slepted',
 'sincerity',
 'pairing',
 'designer',
 'petted',
 'sunlight',
 'handful',
 'shitting',
 'stillness',
 'immensely',
 'shouting',
 'handle',
 'apparently',
 'challenges',
 'alarmed',
 'revenue',
 'ties',
 'acts',
 'abolutely',
 'cop',
 'maternity',
 'challenging',
 'shivering',
 'collection',
 'commandments',
 'wishlist',
 'planning',
 'fence',
 'righting',
 'greatness',
 'hiccups',
 'traffic',
 'shown',
 'swings',
 'reversed',
 'covering',
 'species',
 'resistant',
 'belief',
 'intended',
 'beginnings',
 'cons',
 'nowhere',
 'sports',
 'wore',
 'tactical',
 'gifted',
 'storm',
 'arrange',
 'idiots',
 'sometime',
 'homeschool',
 'beast',
 'cassiopeia',
 'amy',
 'option',
 'drawn',
 'bob',
 'wholeness',
 'monthly',
 'occur',
 'unlovable',
 'discount',
 'healthier',
 'connection',
 'consultation',
 'lawn',
 'bw',
 'buoied',
 'heaven',
 'inconsiderate',
 'in

In [9]:
# Display the training frame (pandas truncates the middle rows in the rendered output).
train_ds

Unnamed: 0,id,text,emotions
0,27383,i feel awful about it too because it s my job ...,sadness
1,110083,im alone i feel awful,sadness
2,140764,ive probably mentioned this before but i reall...,joy
3,100071,i was feeling a little low few days back,sadness
4,2837,i beleive that i am much more sensitive to oth...,love
...,...,...,...
1196,40054,i was feeling terrified and anxious about ever...,fear
1197,104110,i was tempted to feel a little depressed about...,sadness
1198,106240,i wish i had done things differently miss the ...,sadness
1199,5483,i feel more and more curious anxious to see me...,surprise


In [10]:
# Show only the identifier and text columns of the training frame.
train_ds.loc[:, ["id", "text"]]

Unnamed: 0,id,text
0,27383,i feel awful about it too because it s my job ...
1,110083,im alone i feel awful
2,140764,ive probably mentioned this before but i reall...
3,100071,i was feeling a little low few days back
4,2837,i beleive that i am much more sensitive to oth...
...,...,...
1196,40054,i was feeling terrified and anxious about ever...
1197,104110,i was tempted to feel a little depressed about...
1198,106240,i wish i had done things differently miss the ...
1199,5483,i feel more and more curious anxious to see me...


In [11]:
# Tokenize every row into a list of lower-cased word tokens.
# The pattern is a raw string so the backslash reaches the regex engine
# untouched instead of being parsed as an invalid string escape.
train_sentences = train_words.str.lower().str.findall(r"\w+")
test_sentences = test_words.str.lower().str.findall(r"\w+")

In [12]:
# Demo: mask selected tokens of the first test sentence with a placeholder.
replacement = ["ive", "optimistic"]
# .iloc[0] picks the first remaining row by position; the label 0 is not
# guaranteed to exist once dropna() has removed rows from the frame.
test_sentence2 = ["unknown" if val in replacement else val for val in test_sentences.iloc[0]]
test_sentence2

['unknown',
 'been',
 'feeling',
 'more',
 'unknown',
 'this',
 'week',
 'than',
 'i',
 'have',
 'in',
 'months']

In [13]:
# Three-document toy corpus used to sanity-check the TF-IDF code below.
sample_text = pd.Series(
    [
        "Topic sentences are similar to mini thesis statements. Like a thesis statement",
        "a topic sentence has a specific main point. Whereas the thesis is the main point of the essay, the topic sentence is the main point of the paragraph. Like the thesis statement, a topic sentence has a unifying function. But a thesis statement or topic sentence alone doesn’t guarantee unity.",
        "An essay is unified if all the paragraphs relate to the thesis, whereas a paragraph is unified if all the sentences relate to the topic sentence.",
    ]
)

In [14]:
# Display the toy corpus (one document per row).
sample_text

0    Topic sentences are similar to mini thesis sta...
1    a topic sentence has a specific main point. Wh...
2    An essay is unified if all the paragraphs rela...
dtype: object

In [81]:
class TFIDF_PROCESSOR():
    """TF-IDF style vectorizer fitted on a pandas text column.

    Usage: call ``configure`` once on the training column, then ``apply`` on
    each sentence to obtain a dense vector.

    Attributes (all ``None`` until ``configure`` has run):
        vocabulary: dict mapping each kept word (plus ``"_UNKNOWN"``) to its
            position in the output vector; positions follow sorted word order.
        doc_count: number of rows in the column passed to ``configure``.
        doc_frequencies: dict mapping each kept word to its total number of
            occurrences in the fitted column; ``"_UNKNOWN"`` accumulates the
            occurrences of all words below the threshold.

    NOTE(review): despite its name, ``doc_frequencies`` stores corpus-wide
    occurrence counts rather than the number of documents containing the
    word, so the "idf" term can be negative for frequent words. This is kept
    as-is because it matches the reference computation elsewhere in the
    notebook - confirm whether true document frequency is wanted.
    """

    def __init__(self):
        self.vocabulary = None
        self.doc_count = None
        self.doc_frequencies = None

    def configure(self, data_col, threshold=1):
        """Fit the vocabulary and word counts on ``data_col``.

        Args:
            data_col: pandas Series of strings (one document per row).
            threshold: a word must occur AT LEAST this many times in the
                whole column to get its own vector slot; rarer words are
                folded into ``"_UNKNOWN"``.

        Side effects:
            Sets ``doc_count``, ``doc_frequencies`` and ``vocabulary`` and
            prints how many distinct words fell below the threshold.
        """
        # Lower-case and split into word tokens. Raw string: the backslash
        # must reach the regex engine rather than be parsed as a str escape.
        sentences = data_col.str.lower().str.findall(r"\w+")
        self.doc_count = len(sentences)

        # Corpus-wide occurrence count of every distinct word. explode()
        # flattens the token lists in linear time; rows without tokens
        # become NaN, which value_counts() ignores.
        word_counter = sentences.explode().value_counts()

        # Summing a word's per-sentence counts over all sentences equals its
        # corpus-wide count, so the totals are taken from word_counter directly.
        word_doc_frequency = {"_UNKNOWN": 0}  # Unknown vector
        unknown_word_count = 0
        for word, count in word_counter.items():
            if count < threshold:
                unknown_word_count += 1
                word_doc_frequency["_UNKNOWN"] += int(count)
            else:
                word_doc_frequency[word] = int(count)

        print("Number of words with less than " + str(threshold) + " occurences: " + str(unknown_word_count))

        self.doc_frequencies = word_doc_frequency
        self.vocabulary = {
            word: position for position, word in enumerate(sorted(word_doc_frequency))
        }

    def apply(self, orig_sentence):
        """Vectorize one sentence with the fitted model.

        Args:
            orig_sentence: the sentence as a string.

        Returns:
            1-D numpy array with one slot per vocabulary entry. Each slot
            holds ``tf * log(doc_count / (count + 1))`` where ``tf`` is the
            word's share of the sentence's tokens and ``count`` is its
            ``doc_frequencies`` entry. Words not in the vocabulary are
            pooled into the ``"_UNKNOWN"`` slot. A sentence without any
            token yields an all-zero vector.

        Raises:
            RuntimeError: if ``configure`` has not been called yet.
            TypeError: if ``orig_sentence`` cannot be tokenized as text.
        """
        if self.vocabulary is None or self.doc_frequencies is None or self.doc_count is None:
            raise RuntimeError("TFIDF_PROCESSOR.configure() must be called before apply()")

        sentence = pd.Series(orig_sentence).str.lower().str.findall(r"\w+").to_list()[0]
        if not isinstance(sentence, list):
            # Non-string input comes back from the .str accessor as NaN.
            raise TypeError("orig_sentence must be a string")

        vectors = np.zeros(len(self.vocabulary))
        if not sentence:
            return vectors

        # Count tokens, replacing out-of-vocabulary words with the unknown slot.
        token_counts = {}
        for token in sentence:
            key = token if token in self.vocabulary else "_UNKNOWN"
            token_counts[key] = token_counts.get(key, 0) + 1

        # Implementation of tf_idf
        total_tokens = len(sentence)
        for word, occurrences in token_counts.items():
            tf = occurrences / total_tokens
            count = self.doc_frequencies[word] + 1  # +1 smoothing
            idf = np.log(self.doc_count / count)
            vectors[self.vocabulary[word]] = tf * idf
        return vectors

In [82]:
# Fit the processor on the training text; words seen fewer than 2 times share the "_UNKNOWN" slot.
tf_idf_class = TFIDF_PROCESSOR()
tf_idf_class.configure(train_ds["text"], threshold=2)

Number of words with less than 2 occurences: 2127


In [89]:
# One-hot encode the emotion labels as integer indicator columns.
_Y_train = pd.get_dummies(train_ds.loc[:, "emotions"], dtype="int")

In [84]:
# Vectorize every training sentence with the fitted TF-IDF processor.
results = [tf_idf_class.apply(sentence) for sentence in train_ds["text"]]
# Attach the vectors using the frame's own index: dropna() may have left gaps
# in it, and a freshly numbered Series would be aligned by label, shifting
# vectors onto the wrong rows and leaving NaN in others.
train_ds["vectorized"] = pd.Series(results, index=train_ds.index)

In [92]:
# Convert the one-hot label frame to a numpy array.
# NOTE(review): the recorded output below is a shape tuple, which this expression
# would not display - the cell was presumably edited after it last ran; confirm.
np.asarray(_Y_train)

(1200, 6)

In [87]:
# Convert the list of per-sentence vectors into a single numpy array.
res = np.asarray(results)

In [88]:
# Dimensions of the stacked feature array.
res.shape

(1200, 1532)

In [15]:
def setup_tf_idf(data_col, threshold=1):
    """Compute the TF-IDF model data to be used on new sentences.

    Args:
        data_col: pandas Series of strings (one document per row).
        threshold: a word must occur AT LEAST this many times in the whole
            column to be kept; rarer words are pooled under ``"_UNKNOWN"``.

    Returns:
        Tuple ``(num_docs, frequencies)``: the number of rows in ``data_col``
        and a dict mapping each kept word to its total occurrence count in
        the column, with ``"_UNKNOWN"`` holding the pooled count of the
        words below the threshold.

    Side effects:
        Prints the set of words that fell below the threshold.

    NOTE(review): the counts are corpus-wide occurrences, not the number of
    documents containing the word - confirm this is the intended statistic.
    """
    # Lower-case and split into word tokens. Raw string: the backslash must
    # reach the regex engine rather than be parsed as a str escape.
    sentences = data_col.str.lower().str.findall(r"\w+")

    # Corpus-wide occurrence count of every distinct word. explode() flattens
    # the token lists in linear time; rows without tokens become NaN, which
    # value_counts() ignores.
    word_counter = sentences.explode().value_counts()

    # Summing a word's per-sentence counts over all sentences equals its
    # corpus-wide count, so the totals are taken from word_counter directly.
    unknown_words = set()
    word_doc_frequency = {"_UNKNOWN": 0}  # Unknown vector
    for word, count in word_counter.items():
        if count < threshold:
            unknown_words.add(word)
            word_doc_frequency["_UNKNOWN"] += int(count)
        else:
            word_doc_frequency[word] = int(count)

    print("Words with less than " + str(threshold) + " occurences: " + str(unknown_words))

    return (len(sentences), word_doc_frequency)



In [40]:
def apply_tf_idf(orig_sentence, tf_idf_data):
    """Vectorize one sentence with the model data from ``setup_tf_idf``.

    Args:
        orig_sentence: the sentence as a string.
        tf_idf_data: tuple ``(num_docs, frequencies)`` where ``frequencies``
            maps each known word (including ``"_UNKNOWN"``) to its count.

    Returns:
        Tuple ``(vectors, vocabulary)``. ``vocabulary`` maps each word of
        ``frequencies`` to its position (sorted word order). ``vectors`` is a
        1-D numpy array holding ``tf * log(num_docs / (count + 1))`` per
        word present in the sentence, where ``tf`` is the word's share of
        the sentence's tokens. Words missing from ``frequencies`` are pooled
        into the ``"_UNKNOWN"`` slot. A sentence without any token yields an
        all-zero vector.

    Raises:
        TypeError: if ``orig_sentence`` cannot be tokenized as text.
    """
    num_docs, frequencies = tf_idf_data

    # Vocabulary indexing: sorted words -> consecutive vector positions.
    vocabulary = {word: position for position, word in enumerate(sorted(frequencies))}

    # Raw string: the backslash must reach the regex engine rather than be
    # parsed as a str escape.
    sentence = pd.Series(orig_sentence).str.lower().str.findall(r"\w+").to_list()[0]
    if not isinstance(sentence, list):
        # Non-string input comes back from the .str accessor as NaN.
        raise TypeError("orig_sentence must be a string")

    vectors = np.zeros(len(vocabulary))
    if not sentence:
        return vectors, vocabulary

    # Count tokens, replacing out-of-vocabulary words with the unknown slot.
    token_counts = {}
    for token in sentence:
        key = token if token in frequencies else "_UNKNOWN"
        token_counts[key] = token_counts.get(key, 0) + 1

    # Implementation of tf_idf
    total_tokens = len(sentence)
    for word, occurrences in token_counts.items():
        tf = occurrences / total_tokens
        doc_count = frequencies[word] + 1  # +1 smoothing
        idf = np.log(num_docs / doc_count)
        vectors[vocabulary[word]] = tf * idf

    return vectors, vocabulary

In [49]:
# Fit on the toy corpus (threshold=1 keeps every word) and vectorize each document.
tf_idf_model = setup_tf_idf(sample_text, threshold=1)
results = []
for sentence in sample_text.to_list():
    print(sentence)
    result, vocabulary = apply_tf_idf(sentence, tf_idf_model)
    # Position 0 is the "_UNKNOWN" slot (it sorts first in the vocabulary); it is
    # dropped from the printout only - the full vector is what gets stored.
    print(result[1:])
    results.append(result)

Words with less than 1 occurences: set()
Topic sentences are similar to mini thesis statements. Like a thesis statement
[-0.08173577  0.          0.          0.          0.03378876  0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.03378876  0.          0.
  0.          0.          0.          0.          0.          0.
  0.03378876  0.         -0.02397351  0.03378876  0.          0.
 -0.14121631 -0.02397351 -0.07060816  0.          0.          0.
  0.        ]
a topic sentence has a specific main point. Whereas the thesis is the main point of the essay, the topic sentence is the main point of the paragraph. Like the thesis statement, a topic sentence has a unifying function. But a thesis statement or topic sentence alone doesn’t guarantee unity.
[-0.09615973  0.          0.0079503   0.          0.          0.0079503
  0.0079503   0.          0.0079503   0.0079503   0.          0.
 -0.02003238  0.         -0.01692247  0.        

In [42]:
# Full vectors for the toy corpus, including the leading "_UNKNOWN" slot.
results

[array([ 0.        , -0.08173577,  0.        ,  0.        ,  0.        ,
         0.03378876,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.03378876,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.03378876,  0.        , -0.02397351,  0.03378876,  0.        ,
         0.        , -0.14121631, -0.02397351, -0.07060816,  0.        ,
         0.        ,  0.        ,  0.        ]),
 array([ 0.        , -0.09615973,  0.        ,  0.0079503 ,  0.        ,
         0.        ,  0.0079503 ,  0.0079503 ,  0.        ,  0.0079503 ,
         0.0079503 ,  0.        ,  0.        , -0.02003238,  0.        ,
        -0.01692247,  0.        ,  0.        ,  0.0079503 ,  0.        ,
         0.        , -0.01692247,  0.        , -0.05436448,  0.        ,
         0.        ,  0.0079503 , -0.01128165,  0.        ,  0.0079503 ,
  

In [43]:
# Word -> vector position mapping returned by the last apply_tf_idf call.
vocabulary

{'_UNKNOWN': 0,
 'a': 1,
 'all': 2,
 'alone': 3,
 'an': 4,
 'are': 5,
 'but': 6,
 'doesn': 7,
 'essay': 8,
 'function': 9,
 'guarantee': 10,
 'has': 11,
 'if': 12,
 'is': 13,
 'like': 14,
 'main': 15,
 'mini': 16,
 'of': 17,
 'or': 18,
 'paragraph': 19,
 'paragraphs': 20,
 'point': 21,
 'relate': 22,
 'sentence': 23,
 'sentences': 24,
 'similar': 25,
 'specific': 26,
 'statement': 27,
 'statements': 28,
 't': 29,
 'the': 30,
 'thesis': 31,
 'to': 32,
 'topic': 33,
 'unified': 34,
 'unifying': 35,
 'unity': 36,
 'whereas': 37}

In [45]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Reference tokenization with NLTK: keep purely alphabetic tokens, lower-cased.
sentences = []
word_set = set()  # Set of words

for sent in sample_text:
    words = [word.lower() for word in word_tokenize(sent) if word.isalpha()]
    sentences.append(words)
    # Adding to a set de-duplicates in constant time per word, unlike a
    # membership scan over a growing list.
    word_set.update(words)
# total documents in our corpus
total_docs = len(sample_text)
print('Total documents: ', total_docs)
print('Total words: ', len(word_set))

# Position of every word in the vector, following sorted word order.
word_index = {word: position for position, word in enumerate(sorted(word_set))}

def count_dict(sentences):
    """Count how often each word of the global ``word_set`` occurs in ``sentences``.

    ``sentences`` is an iterable of token lists. Every word of ``word_set``
    starts at zero; a token that is not in ``word_set`` raises ``KeyError``.
    """
    totals = dict.fromkeys(word_set, 0)
    for tokens in sentences:
        for token in tokens:
            totals[token] += 1
    return totals
    
# Corpus-wide occurrence count per word, used by inverse_document_frequency below.
word_count = count_dict(sentences)
print(word_count)

def term_frequency(document, word):
    """Return the share of tokens in ``document`` that are equal to ``word``."""
    total_tokens = len(document)
    matches = sum(1 for token in document if token == word)
    return matches / total_tokens

def inverse_document_frequency(word):
    """Return ``log(total_docs / (count + 1))`` for ``word``.

    ``count`` is the word's entry in the global ``word_count``; a word that
    has no entry is treated as count 0. The explicit default replaces a bare
    ``except:`` that would also have hidden unrelated errors.
    """
    word_occurance = word_count.get(word, 0) + 1
    return np.log(total_docs / word_occurance)

def tf_idf(sentence):
    """Return the reference TF-IDF vector for a tokenized sentence.

    The vector has one slot per word of the global ``word_set``, placed
    according to ``word_index``; words absent from the sentence stay at zero.
    """
    weights = np.zeros((len(word_set),))
    for token in sentence:
        weight = term_frequency(sentence, token) * inverse_document_frequency(token)
        weights[word_index[token]] = weight
    return weights

Total documents:  3
Total words:  37
{'to': 3, 'unifying': 1, 'unity': 1, 'doesn': 1, 'statement': 3, 'paragraph': 2, 'a': 7, 'if': 2, 'but': 1, 'specific': 1, 'relate': 2, 'has': 2, 'or': 1, 'all': 2, 'sentence': 5, 'similar': 1, 'topic': 6, 'like': 2, 'essay': 2, 'unified': 2, 'are': 1, 'sentences': 2, 'mini': 1, 'main': 3, 't': 1, 'the': 11, 'function': 1, 'is': 4, 'guarantee': 1, 'point': 3, 'an': 1, 'whereas': 2, 'statements': 1, 'paragraphs': 1, 'of': 2, 'thesis': 6, 'alone': 1}


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Godonan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [50]:
# Print the reference TF-IDF vector of every tokenized toy document.
# NOTE(review): `vectors` is initialised but never filled in this cell - confirm
# whether the results were meant to be collected.
vectors = []

for sentence in sentences:
    print(sentence)
    print(tf_idf(sentence))

['topic', 'sentences', 'are', 'similar', 'to', 'mini', 'thesis', 'statements', 'like', 'a', 'thesis', 'statement']
[-0.08173577  0.          0.          0.          0.03378876  0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.03378876  0.          0.
  0.          0.          0.          0.          0.          0.
  0.03378876  0.         -0.02397351  0.03378876  0.          0.
 -0.14121631 -0.02397351 -0.07060816  0.          0.          0.
  0.        ]
['a', 'topic', 'sentence', 'has', 'a', 'specific', 'main', 'point', 'whereas', 'the', 'thesis', 'is', 'the', 'main', 'point', 'of', 'the', 'essay', 'the', 'topic', 'sentence', 'is', 'the', 'main', 'point', 'of', 'the', 'paragraph', 'like', 'the', 'thesis', 'statement', 'a', 'topic', 'sentence', 'has', 'a', 'unifying', 'function', 'but', 'a', 'thesis', 'statement', 'or', 'topic', 'sentence', 'alone', 'doesn', 't', 'guarantee', 'unity']
[-0.09615973  0.          0.0079503   0.  

In [46]:
# Word -> vector position mapping of the reference implementation (no "_UNKNOWN" slot).
word_index

{'a': 0,
 'all': 1,
 'alone': 2,
 'an': 3,
 'are': 4,
 'but': 5,
 'doesn': 6,
 'essay': 7,
 'function': 8,
 'guarantee': 9,
 'has': 10,
 'if': 11,
 'is': 12,
 'like': 13,
 'main': 14,
 'mini': 15,
 'of': 16,
 'or': 17,
 'paragraph': 18,
 'paragraphs': 19,
 'point': 20,
 'relate': 21,
 'sentence': 22,
 'sentences': 23,
 'similar': 24,
 'specific': 25,
 'statement': 26,
 'statements': 27,
 't': 28,
 'the': 29,
 'thesis': 30,
 'to': 31,
 'topic': 32,
 'unified': 33,
 'unifying': 34,
 'unity': 35,
 'whereas': 36}