### Loading Dataset

In [1]:
import pandas as pd
dataset = pd.read_csv('Sentiments.csv', delimiter='\t')

### Data Preprocessing

In [2]:
import re
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

In [3]:
stemmer = SnowballStemmer('english')
stop_words = stopwords.words('english')
stop_words.remove("not")

In [4]:
processed_reviews = []
for review in dataset['Reviews']:
    review = re.sub("[^' a-z]", "", review.lower())
    review = ' '.join([stemmer.stem(word) for word in word_tokenize(review) if word not in stop_words])
    processed_reviews.append(review)
dataset['processed_reviews'] = processed_reviews
processed_reviews

['wow love place',
 'crust not good',
 'not tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 "honeslti n't tast fresh",
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch',
 'servic prompt',
 'would not go back',
 'cashier care ever say still end wayyy overpr',
 'tri cape cod ravoli chicken cranberrymmmm',
 'disgust pretti sure human hair',
 'shock sign indic cash',
 'high recommend',
 'waitress littl slow servic',
 'place not worth time let alon vega',
 'not like',
 'burritto blah',
 'food amaz',
 'servic also cute',
 'could care less interior beauti',
 'perform',
 "'s rightth red velvet cakeohhh stuff good",
 'never brought salad ask',
 'hole wall great mexican street taco friend staff',
 'took hour get food tabl restaur food luke warm sever run around like total overwhelm',
 'worst salmon sashimi',
 'also combo like burger fri beer decent deal',
 'like final b

In [5]:
# Collecting words with frequency
def freq_dict(reviews, stemmer):
    all_reviews = ' '.join(reviews)
    all_stemmed_reviews = [stemmer.stem(word) for word in word_tokenize(all_reviews)]
    word_freq = FreqDist(all_stemmed_reviews)
    return dict(word_freq)

In [6]:
# Collecting postive word-freq
positive_reviews = dataset['processed_reviews'][dataset["Labels"]==1]
pos_word_freq = freq_dict(positive_reviews, stemmer)

# Collecting negative word-freq
negative_reviews = dataset['processed_reviews'][dataset["Labels"]==0]
neg_word_freq = freq_dict(negative_reviews, stemmer)

In [7]:
positive_scores = []
negative_scores = []
for review in dataset['processed_reviews']:
    pos_score = 0
    neg_score = 0
    for word in word_tokenize(review):
        pos_score += pos_word_freq.get(word, 0)
        neg_score += neg_word_freq.get(word, 0)
    positive_scores.append(pos_score)
    negative_scores.append(neg_score)

df = pd.DataFrame([positive_scores, negative_scores]).T
df.columns = ['pos', 'neg']

In [8]:
# Training Data
df.head(10)

Unnamed: 0,pos,neg
0,93,55
1,92,123
2,28,107
3,53,13
4,108,10
5,21,35
6,38,95
7,76,89
8,77,9
9,72,1


In [9]:
# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(df, dataset['Labels'], test_size=0.2, random_state=0)

### Model Creation

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [11]:
# Logistic Regression
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)

LogisticRegression()

### Model Prediction and Evaluation

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score

In [13]:
# Logisitc Regression
Y_pred = classifier.predict(X_test)
accuracy_result = accuracy_score(Y_test, Y_pred)
confusion_matrix_result = confusion_matrix(Y_test, Y_pred)
print("accuracy:",accuracy_result, "\nconfusion matix:\n",confusion_matrix_result)

accuracy: 0.885 
confusion matix:
 [[86 11]
 [12 91]]


### Exploring Model

In [14]:
sentiment = "Food was awesome"
words = [stemmer.stem(word) for word in word_tokenize(sentiment)]
pos_score = sum([pos_word_freq.get(word, 0) for word in words])
neg_score = sum([neg_word_freq.get(word, 0) for word in words])

In [15]:
data = pd.DataFrame([pos_score, neg_score]).T
result = classifier.predict(data)
print(result)

[1]


In [16]:
'''
#To save file
import pickle
pickle.dump(classifier1, open("LogisticRegression.pkl", "wb"))
pickle.dump(pos_word_freq, open("pos_label_word_freq", "wb"))
pickle.dump(neg_word_freq, open("neg_label_word_freq", "wb"))

#Stemmer can be saved as pickle but it can easily be imported using "nltk.corpus.stopwords"
'''

'\n#To save file\nimport pickle\npickle.dump(classifier1, open("LogisticRegression.pkl", "wb"))\npickle.dump(pos_word_freq, open("pos_label_word_freq", "wb"))\npickle.dump(neg_word_freq, open("neg_label_word_freq", "wb"))\n\n#Stemmer can be saved as pickle but it can easily be imported using "nltk.corpus.stopwords"\n'