In this notebook, we will try to get logistic regression running.

Helpful documentation: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html 

In [226]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

In [227]:
# load data
data_path = "cleaned_data.csv" # get the data without the disagreements
data = pd.read_csv(data_path)[['text','sentiment']]
print(data.shape)

(38961, 2)


In [228]:
data

Unnamed: 0,text,sentiment
0,Spent the entire morning in a meeting w/ a ven...,neutral
1,Oh! Good idea about putting them on ice cream,positive
2,says good (or should i say bad?) afternoon! h...,neutral
3,haha better drunken tweeting you mean?,positive
4,had an awsome salad! I recommend getting the S...,positive
...,...,...
38956,RT @toricolelli: My phones been charging for a...,negative
38957,'@WhoaBiebz: GET YOUR SHIT TOGETHER OR I'LL GU...,negative
38958,Those** PICK UP THE SLACK YOU FUCK BOYS @Apple,negative
38959,@umo_games @Apple ended up getting a new compu...,neutral


In [229]:
# how many labels of each class?
num_pos = 0; num_neg = 0; num_neu = 0; other = 0
for index, row in data.iterrows():
    label = row['sentiment']
    if label == 'positive': num_pos += 1
    elif label == 'negative': num_neg += 1
    elif label == 'neutral': num_neu += 1
    else: other += 1

print("num pos = {}".format(num_pos))
print("num neg = {}".format(num_neg))
print("num neu = {}".format(num_neu))
print("num other = {}".format(other))

num pos = 11497
num neg = 13604
num neu = 13860
num other = 0


In [230]:
# split into training and testing
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.1)

# how many labels of each class?
num_pos = 0; num_neg = 0; num_neu = 0; other = 0
for index, row in train.iterrows():
    label = row['sentiment']
    if label == 'positive': num_pos += 1
    elif label == 'negative': num_neg += 1
    elif label == 'neutral': num_neu += 1
    else: other += 1

print("num pos = {}".format(num_pos))
print("num neg = {}".format(num_neg))
print("num neu = {}".format(num_neu))
print("num other = {}".format(other))

# change labels to numbers (if you run this in another cell it breaks everything)
train["sentiment"] = train["sentiment"].map({"positive": 0, "negative": 1, "neutral": 2})
test["sentiment"] = test["sentiment"].map({"positive": 0, "negative": 1, "neutral": 2})

# fill the NaN values with empty string
train.fillna('',inplace=True)
test.fillna('',inplace=True)

num pos = 10356
num neg = 12208
num neu = 12500
num other = 0


In [231]:
print(train.shape, test.shape)
train

(35064, 2) (3897, 2)


Unnamed: 0,text,sentiment
25784,No. RT @JetBlue Our fleet's on fleek.,2
16611,"rblpn , You know, I could listen to every ve...",2
14301,Happy to have a Sunday off from work,0
3767,I need a code,2
20658,Happy Star Wars day!,0
...,...,...
8191,&quot;i hear its wonderful in california.&quot;,0
4698,"Hey girl, yeah I did..thanks a bunch!! I have...",0
38000,@apple IOS8 is still shit,1
5958,Just started feeling bad again ugh. I hate it...,1


In [232]:
test

Unnamed: 0,text,sentiment
16673,The World is just amazing!,0
10216,So I was just angrily told I was extemely abus...,1
26950,"@JetBlue yes, well they are operating outside ...",1
29275,@AmericanAir I had a 6am flight I can get no r...,1
17752,The best I could do for proof of the crack LO...,0
...,...,...
7090,Wow one of the nicest patients I've ever had! ...,0
25358,@SouthwestAir how are flights looking for tomo...,2
35327,"RT @larryelder: Trump should have said, ""Megyn...",0
565,is currently watching supernatural whilst wait...,2


In [233]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train.text)
X_train_counts.shape

(35064, 35393)

In [234]:
for i, (k, v) in enumerate(count_vect.vocabulary_.items()):
    if i < 5: print(k, v) # word & occurrence

no 22238
rt 26964
jetblue 17434
our 23175
fleet 13013


In [235]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(35064, 35393)

In [236]:
tfidf_transformer.idf_.shape

(35393,)

In [237]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='newton-cg')
clf.fit(X=X_train_tfidf, y=train.sentiment)

LogisticRegression(solver='newton-cg')

In [238]:
#useful function
def get_preprocess(mode='train'):
    if mode == 'train':
        X_count_vect = count_vect.fit_transform(train.text)
    else:
        X_count_vect = count_vect.fit_transform(test.text)
    X_tfidf = tfidf_transformer.fit_transform(X_count_vect)
    return X_tfidf

In [239]:
# evaluate on test???
# looks like we have to do the count vectorizer first on whole data, then split, tthen tf-idf
X_test_tfidf = get_preprocess(mode='test')
print(X_test_tfidf.shape)
# preds = clf.predict(X_test_tfidf)
# type(preds)

(3897, 8923)


In [240]:
# convert to numeric labels
data["sentiment"] = data["sentiment"].map({"positive": 0, "negative": 1, "neutral": 2})

# fill the NaN values with empty string
data.fillna('',inplace=True)

In [241]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

X_counts = count_vect.fit_transform(data.text)
X_counts.shape

(38961, 37798)

In [242]:
split = train_test_split(X_counts, data.sentiment, test_size=0.1)

X_train_counts = split[0]; X_test_counts = split[1]
y_train_labels = split[2]; y_test_labels = split[3]

In [243]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(35064, 37798)

In [244]:
X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)
X_test_tfidf.shape

(3897, 37798)

In [245]:
clf = LogisticRegression(solver='newton-cg')
clf.fit(X=X_train_tfidf, y=y_train_labels)

LogisticRegression(solver='newton-cg')

In [246]:
preds = clf.predict(X_test_tfidf)
type(preds)

numpy.ndarray

In [247]:
import numpy as np
y_test_labels = np.array(y_test_labels)

In [248]:
# should be similar format
print(type(preds), type(y_test_labels))
print(preds.shape, y_test_labels.shape)
print(preds[:5], y_test_labels[:5])

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(3897,) (3897,)
[1 0 1 2 1] [0 0 1 0 1]


In [249]:
# acc
np.mean(preds == y_test_labels)

0.8083140877598153

In [250]:
def parse_label(label):
    if label == '0': print("positive")
    elif label == '1': print("negative")
    else: print("neutral")

In [251]:
input_text = "the movie was excellent and one of the best movies I've ever seen."
middle = count_vect.transform([input_text])

In [252]:
prepred = tfidf_transformer.transform(middle)

In [253]:
clf.predict(prepred)

array([0])

In [254]:
# let's try a pipeline
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()), # first do CountVectorizer
    ('tfidf', TfidfTransformer()), # then tf-idf
    ('clf', LogisticRegression(solver='newton-cg')), # then logistic regression
])

In [255]:
text_clf.fit(train.text, train.sentiment)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(solver='newton-cg'))])

In [256]:
preds = text_clf.predict(test.text)
type(preds)

numpy.ndarray

In [257]:
preds.shape

(3897,)

In [258]:
tst_labels = np.array(test.sentiment)

In [259]:
print(preds[:5], tst_labels[:5])

[0 1 0 1 0] [0 1 1 1 0]


In [260]:
np.mean(preds == tst_labels)

0.8165255324608673

In [266]:
from sklearn import metrics

print(metrics.classification_report(tst_labels, preds, target_names=['positive', 'negative', 'neutral']))

              precision    recall  f1-score   support

    positive       0.89      0.81      0.85      1141
    negative       0.82      0.84      0.83      1396
     neutral       0.76      0.80      0.78      1360

    accuracy                           0.82      3897
   macro avg       0.82      0.82      0.82      3897
weighted avg       0.82      0.82      0.82      3897



In [262]:
input_text = "the movie was fucking horrible and one of the worst movies I've ever seen."
text_clf.predict([input_text])

array([1])

In [263]:
# now do the pipeline + grid search to find the best params
from sklearn.model_selection import GridSearchCV

params = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__strip_accents': [None, 'ascii', 'unicode'],
    'vect__lowercase': (True, False),
    'tfidf__norm': ['l1', 'l2'],
    'tfidf__use_idf': (True, False),
    'clf__penalty': ['none', 'l2', 'l1', 'elasticnet'],
    'clf__C': [1.0, 0.75, 0.5, 0.25],
    'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

# params = {
#     'vect__ngram_range': [(1, 1), (1, 2)],
#     'vect__lowercase': (True, False),
#     'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
# }

gs_clf = GridSearchCV(text_clf, params, cv=5, n_jobs=-1)

In [264]:
gs_clf

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        LogisticRegression(solver='newton-cg'))]),
             n_jobs=-1,
             param_grid={'clf__C': [1.0, 0.75, 0.5, 0.25],
                         'clf__penalty': ['none', 'l2', 'l1', 'elasticnet'],
                         'clf__solver': ['newton-cg', 'lbfgs', 'liblinear',
                                         'sag', 'saga'],
                         'tfidf__norm': ['l1', 'l2'],
                         'tfidf__use_idf': (True, False),
                         'vect__lowercase': (True, False),
                         'vect__ngram_range': [(1, 1), (1, 2)],
                         'vect__strip_accents': [None, 'ascii', 'unicode']})

In [None]:
# need to be able to save and load models
# need the count Vectorizer, the tf-idf thingy, and the model
# can we create a pipeline and save iti all?
save_path = "models/log_reg_pipeline.pkl"