In [12]:
import pandas as pd
import numpy as np
import string

# nltk - natural language toolkit (used to perform text processing)

# 1. Tokenization
from nltk.tokenize import word_tokenize
# 2. Remove Stopwords
from nltk.corpus import stopwords
# 3. Steming/Lemmatization
from nltk.stem import WordNetLemmatizer, PorterStemmer

# sklearn - scikit-learn (pre-defined machine learning library for python)
# 4. Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
# 5. Train Test Split Data
from sklearn.model_selection import train_test_split
# 6. Apply Logistic Regression
from sklearn.linear_model import LogisticRegression
# 7. Calculate accuracy and confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix

In [8]:
df = pd.read_csv('dataset.txt', sep='\t', encoding='latin1', header=None)

In [9]:
df.head()

Unnamed: 0,0,1
0,so there is no way for me to plug it in here i...,0
1,good case excellent value,1
2,great for the jawbone,1
3,tied to charger for conversations lasting more...,0
4,the mic is great,1


In [10]:
df.columns = ['Review' ,'Sentiment']

In [11]:
df.head()

Unnamed: 0,Review,Sentiment
0,so there is no way for me to plug it in here i...,0
1,good case excellent value,1
2,great for the jawbone,1
3,tied to charger for conversations lasting more...,0
4,the mic is great,1


In [13]:
punct = string.punctuation

In [14]:
table = str.maketrans('','',punct)

In [16]:
# table

In [17]:
for i in range(len(df)):
    df['Review'].iloc[i] = df['Review'].iloc[i].translate(table)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [18]:
df.head()

Unnamed: 0,Review,Sentiment
0,so there is no way for me to plug it in here i...,0
1,good case excellent value,1
2,great for the jawbone,1
3,tied to charger for conversations lasting more...,0
4,the mic is great,1


In [28]:
tokens = []
for i in range(len(df)):
    tokens.append(word_tokenize(df['Review'].iloc[i]))

In [29]:
print(tokens[2])

['great', 'for', 'the', 'jawbone']


In [30]:
eng_stopwords = stopwords.words('english')
for i in range(len(tokens)):
    tokens[i] = list(set(tokens[i]) - set(eng_stopwords))

In [31]:
print(tokens[2])

['jawbone', 'great']


In [32]:
p_stem = PorterStemmer()

In [33]:
p_stem.stem('playing')

'play'

In [34]:
p_stem.stem('texting')

'text'

In [35]:
p_stem.stem('wasted')

'wast'

In [36]:
p_stem.stem('bought')

'bought'

In [37]:
wnet = WordNetLemmatizer()

In [41]:
#n - noun
#v - verb
wnet.lemmatize('bought',pos='v')

'buy'

In [42]:
for i in range(len(tokens)):
    for j in range(len(tokens[i])):
        tokens[i][j] = wnet.lemmatize(tokens[i][j], pos='v')

In [45]:
print(tokens[10])

['quality', 'great', 'sound']


In [48]:
' '.join(tokens[10])

'quality great sound'

In [49]:
for i in range(len(tokens)):
    tokens[i] = ' '.join(tokens[i])

In [50]:
tfidf = TfidfVectorizer()

In [51]:
tfidf.fit(tokens)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [53]:
vectors = tfidf.transform(tokens)

In [54]:
vectors = vectors.toarray()

In [55]:
vectors.shape

(3000, 4536)

In [56]:
logistic = LogisticRegression()

In [57]:
logistic.fit(vectors, df['Sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [58]:
y_pred = logistic.predict(vectors)

In [60]:
accuracy_score(df['Sentiment'], y_pred)

0.9333333333333333

In [61]:
confusion_matrix(df['Sentiment'], y_pred)

array([[1409,   91],
       [ 109, 1391]], dtype=int64)

In [62]:
# correct predictions
1409 + 1391

2800

In [63]:
# wrong predictions
109 + 91

200