In [14]:
# importing required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

In [3]:
# load the tweets CSV from disk into a DataFrame
data = pd.read_csv(filepath_or_buffer='dataset/twitter_sentiments.csv')

# preview the first five rows
data.head(5)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
# hold out 20% of the rows for testing; the split is stratified on the
# 'label' column and seeded so it is repeatable
train, test = train_test_split(
    data,
    stratify=data['label'],
    test_size=0.2,
    random_state=21,
)

# (rows, columns) of each split
(train.shape, test.shape)

((25569, 3), (6393, 3))

In [8]:
# create a TF-IDF vectorizer whose vocabulary is capped at 1000 terms.
# stop_words='english' selects scikit-learn's built-in English stop-word list
# (the same words as ENGLISH_STOP_WORDS). Passing the frozenset itself is not
# one of the documented values ('english', a list, or None) and is rejected by
# parameter validation in newer scikit-learn releases.
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

# learn the vocabulary and IDF weights from the training tweets only
tfidf_vectorizer.fit(train['tweet'])

TfidfVectorizer(max_features=1000,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

In [9]:
# apply the already-fitted vectorizer to the tweets of each split,
# train first and then test
train_idf, test_idf = (
    tfidf_vectorizer.transform(split['tweet']) for split in (train, test)
)

In [13]:
# build a LogisticRegression classifier with default settings and train it on
# the TF-IDF features of the training tweets (fit returns the estimator itself)
model_LR = LogisticRegression().fit(train_idf, train['label'])

# predicted labels for the training features and for the test features
predict_train = model_LR.predict(train_idf)
predict_test = model_LR.predict(test_idf)

# F1 score of the predictions against the true labels, one per split
f1_train = f1_score(train['label'], predict_train)
f1_test = f1_score(test['label'], predict_test)

# display both scores as the cell output
(f1_train, f1_test)

(0.48840927258193445, 0.46003262642740617)

In [15]:
# define the stages of the pipeline: TF-IDF feature extraction, then the classifier.
# stop_words='english' selects scikit-learn's built-in English stop-word list
# (the same words as ENGLISH_STOP_WORDS). Passing the frozenset itself is not
# one of the documented values ('english', a list, or None) and is rejected by
# parameter validation in newer scikit-learn releases.
pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer(max_features=1000, stop_words='english')),
                           ('model', LogisticRegression())])

# fit both stages on the raw training tweets and their labels
pipeline.fit(train['tweet'], train['label'])

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards', 'again',
                                                       'against', 'all',
                                                       'almost', 'alone',
                                                       'along', 'already',
                                                       'also', 'although',
                                                       'always', 'am', 'among',
                                                       'amongst', 'amoungst',
                                                       'amount', 'an', 'and',
                                                       'another', 'any',
                                                       'anyhow', 'anyone',
           

In [16]:
# one raw example tweet, wrapped in a list so it is passed as a collection
# of documents
text = [
    "Virat Kohli, AB de Villiers set to auction their 'Green Day' kits from 2016 IPL match to raise funds"
]

# run the fitted pipeline on the example: vectorize, then classify
pipeline.predict(X=text)

array([0])