# Basic Fit-Predict Pipeline Using Bag-of-Words Word-Frequency Features

In [8]:
import pandas as pd
import re
from typing import List
import numpy as np
import nltk
from sklearn.naive_bayes import MultinomialNB
from functools import partial
import numpy as np
from collections import defaultdict
from nltk.corpus import webtext, brown, words # different nltk corpora which can be used as dictionary of words
from scipy import sparse

# Load the train and test datasets into pandas

In [15]:
#!ls ..sentiment-analysis/data/archive-2/
!ls ../data/archive-2

test.csv
testdata.manual.2009.06.14.csv
train.csv
training.1600000.processed.noemoticon.csv


In [16]:
# Load the train and test splits as DataFrames from paths relative to the notebook.
# NOTE(review): the 'unicode_escape' codec presumably also interprets backslash
# sequences found in the tweet text - confirm this is intended rather than a
# plain single-byte codec such as 'latin-1'.
train_df = pd.read_csv('../data/archive-2/train.csv', encoding='unicode_escape')
test_df = pd.read_csv('../data/archive-2/test.csv', encoding='unicode_escape')

In [17]:
train_df.columns

Index(['textID', 'text', 'selected_text', 'sentiment', 'Time of Tweet',
       'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)',
       'Density (P/Km²)'],
      dtype='object')

In [19]:
test_df.columns

Index(['textID', 'text', 'sentiment', 'Time of Tweet', 'Age of User',
       'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)'],
      dtype='object')

In [22]:
train_df.iloc[1]['text']

' Sooo SAD I will miss you here in San Diego!!!'

In [23]:
def basic_tokenizer(text: str) -> List[str]:
    """Split ``text`` into word and punctuation tokens with a single regex.

    The pattern splits on whitespace, on ``--`` and on each of the characters
    ``, . : ; ? _ ! " ( ) '``. Because the pattern is a capturing group, the
    separators themselves are kept as tokens; pieces that are empty or
    whitespace-only are discarded.

    Returns the remaining tokens in their original order.
    """
    pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    tokens: List[str] = []
    for piece in pieces:
        cleaned = piece.strip()
        if cleaned:
            tokens.append(cleaned)
    return tokens

In [24]:
# Sanity-check the regex tokenizer on one training tweet (positional row 1).
text = train_df.iloc[1]['text']
basic_tokenizer(text)

['Sooo',
 'SAD',
 'I',
 'will',
 'miss',
 'you',
 'here',
 'in',
 'San',
 'Diego',
 '!',
 '!',
 '!']

In [33]:
# Used to find preferred data paths for nltk
# print(nltk.data.path)

['/Users/leonardo/nltk_data', '/Users/leonardo/sentiment_analysis/sentiment-analysis/.venv/nltk_data', '/Users/leonardo/sentiment_analysis/sentiment-analysis/.venv/share/nltk_data', '/Users/leonardo/sentiment_analysis/sentiment-analysis/.venv/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


In [43]:
# Tokenize the same sample tweet (`text`) with NLTK's word tokenizer for comparison.
# NOTE(review): presumably needs NLTK tokenizer data to be downloaded first - confirm
# this cell still runs on a fresh kernel before the download cells.
nltk.tokenize.word_tokenize(text, language='english', preserve_line=False)

['Sooo',
 'SAD',
 'I',
 'will',
 'miss',
 'you',
 'here',
 'in',
 'San',
 'Diego',
 '!',
 '!',
 '!']

In [45]:
!pip install --upgrade nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.8.1
    Uninstalling nltk-3.8.1:
      Successfully uninstalled nltk-3.8.1
Successfully installed nltk-3.9.1


In [46]:
import ssl

# Replace ssl's default HTTPS context factory with the unverified one before
# downloading NLTK data (presumably a workaround for local certificate errors).
# NOTE(review): security risk - this appears to turn off certificate verification
# for everything in this kernel that relies on ssl's default HTTPS context, not
# just the download below. Prefer fixing the local certificate store.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl module has no _create_unverified_context; keep the default factory.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download the 'popular' NLTK data collection.
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/leonardo/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/leonardo/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/leonardo/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/leonardo/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/leonardo/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/leonardo/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downlo

True

In [47]:
# Download the individual corpora imported from nltk.corpus (webtext, brown).
#nltk.download('popular')
nltk.download('webtext')
nltk.download('brown')

[nltk_data] Downloading package webtext to
[nltk_data]     /Users/leonardo/nltk_data...
[nltk_data]   Unzipping corpora/webtext.zip.
[nltk_data] Downloading package brown to /Users/leonardo/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

# Create Train and Test Dataset

In [48]:
def create_X_Y(df, tokenizer_fn, words):
    """Build bag-of-words count features and labels from a DataFrame.

    Parameters
    ----------
    df : DataFrame with a 'text' column and a 'sentiment' column.
    tokenizer_fn : callable mapping one 'text' value to an iterable of tokens.
        Rows for which it raises TypeError (e.g. a missing/NaN text) are
        skipped entirely: they produce neither a feature row nor a label.
    words : iterable of vocabulary words; duplicates are ignored.

    Returns
    -------
    X : defaultdict mapping output row index -> defaultdict(int) mapping
        column index -> token count. A vocabulary word uses its vocabulary
        index; every out-of-vocabulary token is counted in the extra column
        whose index equals the vocabulary size.
    Y : list of 'sentiment' values, one per kept row, aligned with X.
    num_words : number of distinct vocabulary words (X spans num_words + 1
        columns because of the out-of-vocabulary column).
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # word -> column mapping is identical on every call and every kernel run.
    words_dict = {word: idx for idx, word in enumerate(dict.fromkeys(words))}
    oov_index = len(words_dict)

    X = defaultdict(lambda: defaultdict(int))
    Y = []
    row = 0
    for text, sentiment in zip(df['text'], df['sentiment']):
        try:
            tokens = tokenizer_fn(text)
        except TypeError:
            continue
        # Touch the row up front so a text that yields no tokens still gets an
        # (empty) feature row and len(X) stays equal to len(Y).
        counts = X[row]
        for t in tokens:
            # Hash lookup instead of scanning the vocabulary list for every token.
            counts[words_dict.get(t, oov_index)] += 1
        Y.append(sentiment)
        row += 1

    return X, Y, oov_index


# Build the tokenizer and the vocabulary once and reuse them for both splits.
word_tokenizer = partial(nltk.tokenize.word_tokenize, language='english', preserve_line=False)
vocabulary = words.words()

X_train, Y_train, num_words = create_X_Y(train_df, word_tokenizer, vocabulary)
X_test, Y_test, num_words = create_X_Y(test_df, word_tokenizer, vocabulary)

# X_train, Y_train = create_X_Y(train_df, basic_tokenizer, webtext.words())

# Convert the train and test features into sparse matrices

In [49]:
# One row per labelled example (len(Y_train)), so an example that produced no
# tokens still gets an all-zero row and the matrix stays aligned with Y_train.
# The extra column (index num_words) holds the out-of-vocabulary count.
# int32 instead of int8: a per-tweet count above 127 does not fit in int8.
X_train_s = sparse.dok_matrix((len(Y_train), num_words+1), dtype=np.int32)

for row_id, word_freq_dict in X_train.items():
    for word_id, word_count in word_freq_dict.items():
        X_train_s[row_id, word_id] = word_count

# CSR is the row-oriented format handed to the model.
X_train_sparse = X_train_s.tocsr()

In [50]:
# One row per labelled example (len(Y_test)), so an example that produced no
# tokens still gets an all-zero row and the matrix stays aligned with Y_test.
# The extra column (index num_words) holds the out-of-vocabulary count.
# int32 instead of int8: a per-tweet count above 127 does not fit in int8.
X_test_s = sparse.dok_matrix((len(Y_test), num_words+1), dtype=np.int32)

for row_id, word_freq_dict in X_test.items():
    for word_id, word_count in word_freq_dict.items():
        X_test_s[row_id, word_id] = word_count

# CSR is the row-oriented format handed to the model.
X_test_sparse = X_test_s.tocsr()

# Fit, Predict and Basic Evaluation

In [51]:
# Multinomial Naive Bayes with default hyperparameters, fitted on the sparse
# word-count matrix and the training sentiment labels.
model = MultinomialNB()
model.fit(X_train_sparse, Y_train)

In [52]:
# Predict a sentiment label for every row of the test matrix.
Y_pred = model.predict(X_test_sparse)

In [53]:
# Report how many test predictions differ from the true labels:
# Y_test != Y_pred is compared element-wise and .sum() counts the mismatches.
print("Number of mislabeled points out of a total %d points : %d"
      % (len(Y_test), (Y_test != Y_pred).sum()))

Number of mislabeled points out of a total 3534 points : 1598
