In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Show full cell contents when displaying frames (no truncation of long tweet text).
pd.set_option('display.max_colwidth', None)

In [2]:
# Load the processed tweets and give the two modelling columns shorter names.
column_aliases = {'lemmatized_tweets': 'Tweet', 'VADER_sentiment': 'Sentiment'}
df = pd.read_csv('data/tweets_processed.csv').rename(columns=column_aliases)

# Preview the first rows.
df.head()

Unnamed: 0,Tweet,Sentiment
0,zydus group announce -PRON- new brand identity with the list entity of the group cadila healthcare limit to be now know as zyduslifescience limited more,Negative
1,danger of take -PRON- or leave -PRON- compromise outcome on tripswaiver south africa express concern that the delay in approve a trip waiver be hamper effort to diversify proper production of vaccine amp address vaccine inequity covid,Negative
2,nstnation state health director dr othman warijo say the case involve a year old girl with a history of asthma who receive -PRON- vaccine at the sultanah bahiyah hospital kedah child vaccine covid pickid,Neutral
3,non medical face mask kn protect -PRON- from non oily airborne pollutant immunity vaccine testkit glove plymask ff kn,Positive
4,forsale vaccine covid covid medical life domain technology tech science medicine doctor daysofcode bot memes bigdata security cloud javascript java datascience machinelearning web nft vc investor socialmedia brand ai,Positive


In [3]:
# Discard rows containing missing values before building features and labels.
df = df.dropna()

In [4]:
# Model input is the 'Tweet' column; the target is the 'Sentiment' column.
X, y = df['Tweet'], df['Sentiment']

## Vectorizer

In [5]:
# NLTK's English stop words followed by extra terms added by hand for this dataset.
stop_words = [
    *stopwords.words('english'),
    'covid', 'dose', 'vaccine', 'vaccination', 'amp', 'coronavirus',
]

In [6]:
# Initialize the TF-IDF vectorizer with the custom stop-word list.
vectorizer = TfidfVectorizer(min_df=5, lowercase=True, stop_words=stop_words)

# Fit on the full set of tweets and transform them in a single pass.
# NOTE: this notebook has no train/test split, so the vocabulary is learned from all rows.
# The text is read from df['Tweet'] rather than from X so that this cell stays
# re-runnable: X is rebound to the TF-IDF matrix below, and fitting the vectorizer
# on that matrix a second time would fail.
X = vectorizer.fit_transform(df['Tweet'])

## Modeling

### Logistic Regression

In [7]:
# Train a logistic-regression classifier on the vectorized tweets and their labels.
# max_iter=500 caps the number of solver iterations.
logreg = LogisticRegression(max_iter=500).fit(X, y)

# Echo the fitted estimator as the cell output.
logreg

LogisticRegression(max_iter=500)

In [8]:
# Smoke-test the fitted vectorizer and classifier on a single unseen text.
new_review = ["Absolutely love this place! The best ever!"]
X_new = vectorizer.transform(new_review)

# One row of class probabilities for the sample.
logreg.predict_proba(X_new)

array([[0.25100495, 0.18982336, 0.55917169]])

In [12]:
# Predicted label for the single sample scored above.
predicted_labels = logreg.predict(X_new)
predicted_labels[0]

'Positive'

## Create a pipeline

In [10]:
# Chain the already-fitted vectorizer and classifier so raw text can be scored in one call.
model_pipeline = make_pipeline(vectorizer, logreg)

# Pass the documents as a list (not a set): a set is unordered and silently drops
# duplicates, so output rows could not be matched back to inputs for larger batches.
model_pipeline.predict_proba(['This is the worst'])

array([[0.22709134, 0.48033409, 0.29257457]])

## Save Model

In [11]:
# Persist the fitted pipeline (vectorizer + classifier) to disk.
filename = 'saved_model/finalized_model.pickle'

# Create the target directory if it is missing so the write cannot fail on a fresh checkout.
Path(filename).parent.mkdir(parents=True, exist_ok=True)

# Use a context manager so the file handle is flushed and closed even if pickling fails.
# NOTE: unpickling executes arbitrary code; only load this file from a trusted location.
with open(filename, 'wb') as model_file:
    pickle.dump(model_pipeline, model_file)