In [2]:
%pip install -r requirements.txt

Collecting pandas
  Downloading pandas-1.1.5-cp37-cp37m-win_amd64.whl (8.7 MB)
     ---------------------------------------- 8.7/8.7 MB 1.6 MB/s eta 0:00:00
Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp37-cp37m-win_amd64.whl (7.1 MB)
     ---------------------------------------- 7.1/7.1 MB 3.2 MB/s eta 0:00:00
Collecting pytz>=2017.2
  Using cached pytz-2022.1-py2.py3-none-any.whl (503 kB)
Collecting numpy>=1.15.4
  Downloading numpy-1.21.6-cp37-cp37m-win_amd64.whl (14.0 MB)
     ---------------------------------------- 14.0/14.0 MB 4.4 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=0.11
  Using cached joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Collecting scipy>=1.1.0
  Downloading scipy-1.7.3-cp37-cp37m-win_amd64.whl (34.1 MB)
     ---------------------------------------- 34.1/34.1 MB 9.5 MB/s eta 0:00:00
Installing collected packages: pytz, threadpoolctl, numpy, joblib, scipy, pandas, sciki

In [3]:
# importing required libraries
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [5]:
data = pd.read_csv("dataset/twitter_sentiments.csv")
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [6]:
data.shape

(31962, 3)

In [7]:
data.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [8]:
train, test = train_test_split(data, test_size = 0.2, stratify = data['label'], random_state=21)

In [9]:
train.shape, test.shape

((25569, 3), (6393, 3))

In [10]:
test.label.value_counts(normalize=True)

0    0.929923
1    0.070077
Name: label, dtype: float64

In [11]:
tfidf_vectorizer = TfidfVectorizer(lowercase= True, max_features=1000, stop_words=ENGLISH_STOP_WORDS)

In [12]:
tfidf_vectorizer.fit(train.tweet)

TfidfVectorizer(max_features=1000,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

In [13]:
train_idf = tfidf_vectorizer.transform(train.tweet)
test_idf  = tfidf_vectorizer.transform(test.tweet)

In [14]:
model_LR = LogisticRegression()
model_LR.fit(train_idf, train.label)

LogisticRegression()

In [16]:
predict_train = model_LR.predict(train_idf)
predict_test = model_LR.predict(test_idf)

In [17]:
# f1 score on train data
f1_score(y_true= train.label, y_pred= predict_train)

0.4888178913738019

In [18]:
f1_score(y_true= test.label, y_pred= predict_test)

0.45751633986928114

In [22]:
pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer(lowercase=True, 
                                                        max_features=1000,
                                                        stop_words=ENGLISH_STOP_WORDS)),
                                                        ('model', LogisticRegression())])
pipeline.fit(train.tweet, train.label)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards', 'again',
                                                       'against', 'all',
                                                       'almost', 'alone',
                                                       'along', 'already',
                                                       'also', 'although',
                                                       'always', 'am', 'among',
                                                       'amongst', 'amoungst',
                                                       'amount', 'an', 'and',
                                                       'another', 'any',
                                                       'anyhow', 'anyone',
           

In [23]:
pipeline.predict(train.tweet)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [24]:
text = ["Virat Kohli, AB de Villiers set to auction their 'Green Day' kits from 2016 IPL match to raise funds"]
pipeline.predict(text)

array([0], dtype=int64)

In [None]:
from joblib import dump
dump(pipeline, filename="text_classification.joblib")
data[data.label == 1]