In [45]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import TweetTokenizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

import re
import string
import pickle
import pandas as pd


## Let's go

In [3]:
### import dataset
df = pd.read_csv("../datasets/tweets.csv")
df = df.dropna()
df["category"] = df["category"].astype(int)


In [None]:
print(f"{df.info()}\n")
print(f"Target values: {df['category'].unique()} \n")
df.tail()

In [16]:
# clean data
stop_words = stopwords.words('english')
translator = str.maketrans('', '', string.punctuation)
tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()


def clean_text(text: str):

    # lowercase string
    text = text.lower()

    # remove stop words
    text = " ".join([word for word in str(
        text).split() if word not in stop_words])

    # remove urls
    text = re.sub('((www.[^s]+)|(https?://[^s]+))', ' ', text)

    # remove punctuations
    text = text.translate(translator)

    # remove repeating characters
    text = re.sub(r'(.)1+', r'1', text)

    # remove numbers
    text = re.sub('[0-9]+', '', text)

    # tokenize text
    text: list[str] = tokenizer.tokenize(text)

    # normalize with lemmatizer
    tokens = []
    for token, tag in pos_tag(text):
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        tokens.append(token)
    text = tokens

    text = " ".join(text)

    return text

clean_text("modi like women like sandra")

'modi like woman like sandra'

In [17]:
df["clean_text"] = df["clean_text"].map(lambda x: clean_text(x))
df.tail()

Unnamed: 0,clean_text,category
162975,crore pay neerav modi recover congress leader ...,-1
162976,dear rss terrorist payal gawar modi kill plus ...,-1
162977,cover interaction forum leave,0
162978,big project come india modi dream project happ...,0
162979,ever listen like gurukul discipline maintain e...,1


In [39]:
# split data into training and testing datasets
x = df["clean_text"]
y = df["category"]
# print(x.tail())


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3)


In [26]:
### fit tf-idf vector
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(x_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names_out ()))

No. of feature_words:  500000


In [40]:
### transform train and test dataset
x_train = vectoriser.transform(x_train)
x_test = vectoriser.transform(x_test)

# save vectorizer
with open("../models/tweet_feature_extractor.pickle", "wb") as file:
    pickle.dump(vectoriser, file)

In [41]:
b_naive_bayes_model = BernoulliNB()
b_naive_bayes_model.fit(x_train, y_train)
nb_y_pred = b_naive_bayes_model.predict(x_test)
print(f"BernoulliNB\nAccuracy:\t{accuracy_score(y_test, nb_y_pred)}")



BernoulliNB
Accuracy:	0.7107647624307132


In [43]:
svc_model = LinearSVC()
svc_model.fit(x_train, y_train)
svc_y_pred = svc_model.predict(x_test)
print(f"SVM\nAccuracy:\t{accuracy_score(y_test, svc_y_pred)}")


SVM
Accuracy:	0.8587674623141274


In [35]:
v = vectoriser.transform(["when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples"])
svc_model.predict(v)

array([-1])

In [44]:
logistic_regression_model = LogisticRegression(C = 2, max_iter = 1000, n_jobs=-1)
logistic_regression_model.fit(x_train, y_train)
lr_y_pred = logistic_regression_model.predict(x_test)
print(f"Logistic Regression\nAccuracy:\t{accuracy_score(y_test, lr_y_pred)}")


Logistic Regression
Accuracy:	0.8488269824712115


In [None]:
encoder = LabelEncoder()
encoder.fit(y_train)
xgb_y_train = encoder.transform(y_train)
xgb_y_test = encoder.transform(y_test)

xgboost_model = XGBClassifier(n_jobs=-1)
xgboost_model.fit(x_train, xgb_y_train)
xgb_y_pred = xgboost_model.predict(x_test)
print(f"XGBoost\nAccuracy:\t{accuracy_score(xgb_y_test, xgb_y_pred)}")


In [47]:
random_forest_model = RandomForestClassifier(random_state=0, n_jobs=-1)
random_forest_model.fit(x_train, y_train)
rf_y_pred = random_forest_model.predict(x_test)
print(f"Random Forest\nAccuracy:\t{accuracy_score(y_test, rf_y_pred)}")


In [46]:
# save model

with open("../models/twitter_sentiment_model.pickle", "wb") as file:
    pickle.dump(svc_model, file)
