# This version scores the basic models:
- baseline (top ten)
- all words
- stemmed words
- lemmatized words

import files

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize
from operator import itemgetter
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import ConfusionMatrixDisplay

load

In [2]:
df = pd.read_csv('../data/df.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

could do more here to explore the neither and both values

move on to language processing

train-test split

In [3]:
X, y = df['text'].to_frame(), df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

add label to X_train for research purposes .. obviously don't include this in the model

reset index to anticipate future problems ... or not reset the index???

In [4]:
X_train.loc[:, 'label'] = [y_train.loc[val] for val in X_train.index]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:, 'label'] = [y_train.loc[val] for val in X_train.index]


perfunctory exploring should happen here

top ten visualizations for pos. and non-pos.

size of vocabulary

more?

In [5]:
basic_token_pattern = r"(?u)\b\w\w+\b"
tokenizer = RegexpTokenizer(basic_token_pattern)

In [6]:
X_train.loc[:, 'text_tokenized'] = X_train['text'].apply(tokenizer.tokenize)
vocab_raw = set(X_train['text_tokenized'].explode())
print('Size of raw vocabulary:', len(vocab_raw))   

Size of raw vocabulary: 8876


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:, 'text_tokenized'] = X_train['text'].apply(tokenizer.tokenize)


gonna need naive bayes, might not do any other models (markov, etc.)

In [7]:
baseline_model = MultinomialNB()

look at plurality winner to see score to beat

In [8]:
plurality_cv = round(y_train.value_counts(normalize=True)[0],4)
y_train.value_counts(normalize=True)

sentiment
0    0.672366
1    0.327634
Name: proportion, dtype: float64

first model, just ten features

In [9]:
tfidf = TfidfVectorizer(
    max_features = 10
)

X_train_vectorized = tfidf.fit_transform(X_train['text'])

baseline_cv = round(cross_val_score(baseline_model, X_train_vectorized, y_train).mean(),4)

print('Plurality:', plurality_cv,
      '\nBaseline: ',baseline_cv)

Plurality: 0.6724 
Baseline:  0.6724


an absolutely miniscule improvement

let's try all words, not just max_features = 10

In [10]:
tfidf = TfidfVectorizer()

X_train_vectorized = tfidf.fit_transform(X_train['text'])

all_words_cv = round(cross_val_score(baseline_model, X_train_vectorized, y_train).mean(),4)

print('Plurality:', plurality_cv,
      '\nBaseline: ', baseline_cv,
      '\nAll Words:', all_words_cv)

Plurality: 0.6724 
Baseline:  0.6724 
All Words: 0.7005


an actual improvement

let's look at which 10 terms were least and most associated with positive sentiment

we can and will explore stopwords, but it seems clear we can stem or lemmatize

In [11]:
stemmer = SnowballStemmer(language="english")

def stem_and_tokenize(document):
    tokens = tokenizer.tokenize(document)
    return [stemmer.stem(token) for token in tokens]

create stemmed vocabulary

In [12]:
X_train.loc[:, 'text_stemmed'] = X_train.loc[:, 'text'].apply(stem_and_tokenize)
vocab_stemmed = set(X_train['text_stemmed'].explode())
print('Size of raw vocabulary:    ', len(vocab_raw))
print('Size of stemmed vocabulary:', len(vocab_stemmed))

Size of raw vocabulary:     8876
Size of stemmed vocabulary: 7016


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:, 'text_stemmed'] = X_train.loc[:, 'text'].apply(stem_and_tokenize)


In [13]:
tfidf = TfidfVectorizer(
    tokenizer = stem_and_tokenize
)

X_train_vectorized = tfidf.fit_transform(X_train['text'])

stemmed_words_cv = round(cross_val_score(baseline_model, X_train_vectorized, y_train).mean(),4)

print('Plurality:    ', plurality_cv,
      '\nBaseline:     ', baseline_cv,
      '\nAll Words:    ', all_words_cv,
      '\nStemmed Words:', stemmed_words_cv
     )

Plurality:     0.6724 
Baseline:      0.6724 
All Words:     0.7005 
Stemmed Words: 0.6995


Stemming is worse by about one tenth of a percent

top 10 terms

In [14]:
lemmatizer = WordNetLemmatizer()

def lemmatize_and_tokenize(document):
    tokens = tokenizer.tokenize(document)
    return [lemmatizer.lemmatize(token) for token in tokens]

In [15]:
X_train.loc[:, 'text_lemmatized'] = X_train.loc[:, 'text'].apply(lemmatize_and_tokenize)
vocab_lemmatized = set(X_train['text_lemmatized'].explode())
print('Size of raw vocabulary:       ', len(vocab_raw),
      '\nSize of stemmed vocabulary:   ', len(vocab_stemmed),
      '\nSize of lemmatized vocabulary:', len(vocab_lemmatized))

Size of raw vocabulary:        8876 
Size of stemmed vocabulary:    7016 
Size of lemmatized vocabulary: 8208


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:, 'text_lemmatized'] = X_train.loc[:, 'text'].apply(lemmatize_and_tokenize)


In [16]:
tfidf = TfidfVectorizer(
    tokenizer = lemmatize_and_tokenize
)

X_train_vectorized = tfidf.fit_transform(X_train['text'])

lemmatized_words_cv = round(cross_val_score(baseline_model, X_train_vectorized, y_train).mean(),4)

print('Plurality:       ', plurality_cv,
      '\nAll Words:       ', all_words_cv,
      '\nStemmed Words:   ', stemmed_words_cv,
      '\nLemmatized Words:', lemmatized_words_cv
     )

Plurality:        0.6724 
All Words:        0.7005 
Stemmed Words:    0.6995 
Lemmatized Words: 0.6984


lemmatizing makes it worse by another tenth of a percent

look at top 10

top ten only looks at top ten most frequent, so this is useless until you give it a stop words list, probably not useful at all

now i have a way to find the top and bottom tfidf scores, i need to take this one step further to see how frequent those terms are, this will help me to decide to ignore n > 4 for example

getting absolutely nowhere