Logistic Regression

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [8]:
data = pd.DataFrame([
    ("I love spending time with family", "positive"),
    ("I'm not happy ", "negative"),
    ("I got a free upgrade", "positive"),
    ("lost today's match", "negative"),
    ("missed my train again", "negative"),
    ("i won a free coupon", "positive"),
    ("I love spending time with friends", "positive"),
    ("I love music", "positive"),
    ("He is as good as I am. ", "positive"),
    ("He is too weak to walk. ", "negative"),
    ("Honesty is the best policy.", "positive"),
    ("I do not like to meet him.", "negative"),
], columns=["text", "sentiment"])
print(data)

                                 text sentiment
0    I love spending time with family  positive
1                      I'm not happy   negative
2                I got a free upgrade  positive
3                  lost today's match  negative
4               missed my train again  negative
5                 i won a free coupon  positive
6   I love spending time with friends  positive
7                        I love music  positive
8             He is as good as I am.   positive
9            He is too weak to walk.   negative
10        Honesty is the best policy.  positive
11         I do not like to meet him.  negative


In [9]:
data = data.sample(frac=1).reset_index(drop=True)

In [10]:
X = data['text']
y = data['sentiment']

In [11]:
countvec = CountVectorizer()
countvec_fit = countvec.fit_transform(X)

In [12]:
bag_of_words = pd.DataFrame(countvec_fit.toarray(), columns=countvec.vocabulary_)
bag_of_words

Unnamed: 0,do,not,like,to,meet,him,lost,today,match,love,...,family,got,free,upgrade,too,weak,walk,won,coupon,happy
0,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,1,2,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
8,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,0,0


In [18]:
X_train, X_test, y_train, y_test = train_test_split(bag_of_words,y,test_size=0.7, random_state=7)

In [19]:
lr = LogisticRegression(random_state=1).fit(X_train, y_train)

In [20]:
y_pred_lr = lr.predict(X_test)

In [21]:
accuracy_score(y_pred_lr, y_test)

0.3333333333333333

In [22]:
print(classification_report(y_test, y_pred_lr,zero_division=0))

              precision    recall  f1-score   support

    negative       0.33      1.00      0.50         3
    positive       0.00      0.00      0.00         6

    accuracy                           0.33         9
   macro avg       0.17      0.50      0.25         9
weighted avg       0.11      0.33      0.17         9



Naive Bayes - classification using probablity

In [23]:
from sklearn.naive_bayes import MultinomialNB

In [27]:
nb = MultinomialNB().fit(X_train, y_train)

In [28]:
y_pred_nb = nb.predict(X_test)


In [34]:
accuracy_score(y_pred_nb, y_test)

0.3333333333333333

In [35]:
from sklearn.linear_model import SGDClassifier

In [36]:
svm = SGDClassifier().fit(X_train, y_train)

In [37]:
y_pred_svm = svm.predict(X_test)

In [38]:
accuracy_score(y_pred_svm, y_test)

0.3333333333333333