# Basic Fit - Predict Pipeline With Bag of Words and Frequency Of Occurrence As Features

In [None]:
import pandas as pd
import re
from typing import List
import numpy as np
import nltk
from sklearn.naive_bayes import MultinomialNB
from functools import partial
import numpy as np
from collections import defaultdict
from nltk.corpus import webtext, brown, words # different nltk corpora which can be used as dictionary of words 
from scipy import sparse

## Load the train and test datasets into Pandas dataframes

In [2]:
!ls ../data/sentiment_analysis

test.csv
testdata.manual.2009.06.14.csv
train.csv
training.1600000.processed.noemoticon.csv


In [3]:
# Read the train and test splits of the tweet sentiment dataset into dataframes.
# NOTE(review): 'unicode_escape' also interprets backslash sequences that occur
# inside the tweet text itself — confirm this is intended rather than a plain
# single-byte codec such as 'latin-1'.
train_df = pd.read_csv('../data/sentiment_analysis/train.csv', encoding='unicode_escape')
test_df = pd.read_csv('../data/sentiment_analysis/test.csv', encoding='unicode_escape')

In [4]:
train_df.columns

Index(['textID', 'text', 'selected_text', 'sentiment', 'Time of Tweet',
       'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)',
       'Density (P/Km²)'],
      dtype='object')

In [5]:
test_df.columns

Index(['textID', 'text', 'sentiment', 'Time of Tweet', 'Age of User',
       'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)'],
      dtype='object')

In [6]:
train_df.iloc[0]['text']

' I`d have responded, if I were going'

In [7]:
def basic_tokenizer(text: str)->List[str]:
    """Split text into word and punctuation tokens.

    The text is split on whitespace, on the double hyphen ``--`` and on the
    punctuation characters ``, . : ; ? _ ! " ( ) '``. Punctuation delimiters
    are kept as tokens of their own; whitespace and empty pieces are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        The non-empty, whitespace-stripped tokens in their original order.
    """
    tokens = []
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
        stripped = piece.strip()
        # Whitespace delimiters and empty split artefacts strip down to ''.
        if stripped:
            tokens.append(stripped)
    return tokens

In [8]:
text = train_df.iloc[0]['text']
basic_tokenizer(text)

['I`d', 'have', 'responded', ',', 'if', 'I', 'were', 'going']

In [9]:
# !pip install nltk==3.8.1

In [10]:
nltk.tokenize.word_tokenize(text, language='english', preserve_line=False)

['I', '`', 'd', 'have', 'responded', ',', 'if', 'I', 'were', 'going']

In [35]:
# uncomment for various downloads
# nltk.download('popular')
# nltk.download('webtext')
# nltk.download('brown')

[nltk_data] Downloading package brown to /Users/bodesule/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

## Create Train and Test Dataset

In [17]:
def create_X_Y(df, tokenizer_fn, words):
    """Build bag-of-words count features and labels from a dataframe.

    Every row of ``df`` whose ``'text'`` value can be tokenized becomes one
    sample. Each token is mapped to the column id of its vocabulary entry;
    tokens that are not in the vocabulary are all counted in one extra
    "unknown" column whose id equals the vocabulary size.

    Args:
        df: DataFrame with a ``'text'`` column and a ``'sentiment'`` column.
        tokenizer_fn: Callable turning one text value into an iterable of
            tokens. Rows for which it raises ``TypeError`` (e.g. a NaN
            instead of a string) are skipped entirely.
        words: Iterable of vocabulary words; duplicates are ignored.

    Returns:
        A tuple ``(X, Y, num_words)`` where ``X`` maps a sample's row id to a
        ``{column_id: count}`` mapping, ``Y`` is the list of sentiment labels
        in row-id order, and ``num_words`` is the number of distinct
        vocabulary words (so the feature width is ``num_words + 1``).
    """
    # De-duplicate while keeping first-occurrence order so that column ids are
    # reproducible across calls and kernel sessions (plain set order is not).
    vocabulary = list(dict.fromkeys(words))
    words_dict = {word: idx for idx, word in enumerate(vocabulary)}
    unknown_id = len(vocabulary)

    X = defaultdict(lambda: defaultdict(int))
    Y = []
    row = -1
    for text, sentiment in zip(df['text'], df['sentiment']):
        try:
            tokens = tokenizer_fn(text)
        except TypeError:
            # Non-string text: drop the sample, leaving row ids contiguous.
            continue
        row += 1
        # Touch the row so samples with zero tokens are still present in X;
        # otherwise len(X) would fall out of step with len(Y).
        counts = X[row]
        for t in tokens:
            # Dictionary lookup instead of scanning the vocabulary list.
            counts[words_dict.get(t, unknown_id)] += 1
        Y.append(sentiment)

    return X, Y, len(vocabulary)


# Build the tokenizer and the vocabulary once so that the train and test
# features are guaranteed to come from the same tokenizer and word list.
tokenize = partial(nltk.tokenize.word_tokenize, language='english', preserve_line=False)
vocabulary = words.words()

X_train, Y_train, num_words = create_X_Y(train_df, tokenize, vocabulary)
X_test, Y_test, num_words_test = create_X_Y(test_df, tokenize, vocabulary)

# Both splits must share one feature width, otherwise the fitted model cannot
# be applied to the test matrix.
assert num_words_test == num_words

# Alternative: simple regex tokenizer with the webtext corpus as vocabulary.
# X_train, Y_train, num_words = create_X_Y(train_df, basic_tokenizer, webtext.words())

## Load Train and Test Data into Sparse Matrices

In [15]:
# One matrix row per label: a sample whose text produced no tokens may be
# missing from X_train, so the label list is the reliable sample count.
# The extra column (num_words+1) holds the count of out-of-vocabulary tokens.
# int32 is used because int8 cannot represent counts above 127.
X_train_s = sparse.dok_matrix((len(Y_train), num_words+1), dtype=np.int32)

for row_id, word_freq_dict in X_train.items():
    for word_id, word_count in word_freq_dict.items():
        X_train_s[row_id, word_id] = word_count

# CSR is the row-oriented format handed to the classifier.
X_train_sparse = X_train_s.tocsr()

In [20]:
# One matrix row per label: a sample whose text produced no tokens may be
# missing from X_test, so the label list is the reliable sample count.
# The extra column (num_words+1) holds the count of out-of-vocabulary tokens.
# int32 is used because int8 cannot represent counts above 127.
X_test_s = sparse.dok_matrix((len(Y_test), num_words+1), dtype=np.int32)

for row_id, word_freq_dict in X_test.items():
    for word_id, word_count in word_freq_dict.items():
        X_test_s[row_id, word_id] = word_count

# CSR is the row-oriented format handed to the classifier.
X_test_sparse = X_test_s.tocsr()

## Fit, Predict and Basic Evaluation

In [16]:
# Fit a multinomial Naive Bayes classifier (default hyperparameters) on the
# sparse word-count matrix, using the sentiment labels as targets.
model = MultinomialNB()
model.fit(X_train_sparse, Y_train)

In [25]:
Y_pred = model.predict(X_test_sparse)

In [26]:
# Y_test is a plain list while Y_pred is an ndarray; convert explicitly so the
# element-wise comparison does not depend on numpy's reflected operator.
num_total = len(Y_test)
num_mislabeled = (np.asarray(Y_test) != Y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d"
      % (num_total, num_mislabeled))

Number of mislabeled points out of a total 3534 points : 1598
