In [22]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [4]:
df = pd.read_csv("sentiment_analysis.csv", header=[0])

In [5]:
display(df)

Unnamed: 0,ID,text,label
0,7.680980e+17,Josh Jenkins is looking forward to TAB Breeder...,1
1,7.680980e+17,RT @MianUsmanJaved: Congratulations Pakistan o...,1
2,7.680980e+17,"RT @PEPalerts: This September, @YESmag is taki...",1
3,7.680980e+17,"RT @david_gaibis: Newly painted walls, thanks ...",1
4,7.680980e+17,RT @CedricFeschotte: Excited to announce: as o...,1
...,...,...,...
550386,8.046170e+17,@goddesses_o I can't stop watching her...mm. M...,0
550387,8.046180e+17,Poor old Tom Odell doesn't look like he would ...,0
550388,8.046180e+17,#antsmasher I smashed 7 ants in this awesome ...,1
550389,8.046180e+17,@LizHudston @KymWyllie @Evasmiless @meanBok @l...,1


In [6]:
# remove ID column
df.drop('ID', axis=1, inplace=True)

In [7]:
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove user tags
    text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)  # Remove URLs
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)  # Remove non-alphanumeric characters
    text = re.sub(r'^RT ', '', text) # Remove Retweet
    return text.lower()

df['cleaned_text'] = df['text'].apply(clean_text)

In [8]:
display(df)

Unnamed: 0,text,label,cleaned_text
0,Josh Jenkins is looking forward to TAB Breeder...,1,josh jenkins is looking forward to tab breeder...
1,RT @MianUsmanJaved: Congratulations Pakistan o...,1,congratulations pakistan on becoming no1testt...
2,"RT @PEPalerts: This September, @YESmag is taki...",1,this september is taking you to maine mendoz...
3,"RT @david_gaibis: Newly painted walls, thanks ...",1,gaibis newly painted walls thanks a million to...
4,RT @CedricFeschotte: Excited to announce: as o...,1,excited to announce as of july 2017 feschotte...
...,...,...,...
550386,@goddesses_o I can't stop watching her...mm. M...,0,o i cant stop watching hermm more
550387,Poor old Tom Odell doesn't look like he would ...,0,poor old tom odell doesnt look like he would k...
550388,#antsmasher I smashed 7 ants in this awesome ...,1,antsmasher i smashed 7 ants in this awesome g...
550389,@LizHudston @KymWyllie @Evasmiless @meanBok @l...,1,morning girls have a wonderful friday


In [9]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'],df['label'], test_size=0.2, random_state=1624)

In [10]:
# TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [11]:
# Bag of Words
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [12]:
# select 1000 best features by chi2
# TF-IDF
k = 1000 
selector = SelectKBest(chi2, k=k)
X_train_tfidf_selected = selector.fit_transform(X_train_tfidf, y_train)
X_test_tfidf_selected = selector.transform(X_test_tfidf)

In [13]:
# Bag of Words
selector = SelectKBest(chi2, k=k)
X_train_bow_selected = selector.fit_transform(X_train_bow, y_train)
X_test_bow_selected = selector.transform(X_test_bow)

In [None]:
# TF-IDF Logistic Regression
lr_tfidf = LogisticRegression(max_iter=5000)
param_grid = {'C': [0.1, 1, 10, 100],
              'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga']}
grid_search_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, cv=5)
grid_search_lr_tfidf.fit(X_train_tfidf_selected, y_train)

# Bag of Words Logistic Regression
lr_bow = LogisticRegression(max_iter=5000)
param_grid = {'C': [0.1, 1, 10, 100],
              'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga']}
grid_search_lr_bow = GridSearchCV(lr_bow, param_grid, cv=5)
grid_search_lr_bow.fit(X_train_bow_selected, y_train)

# Evaluate the models
lr_tfidf_acc = grid_search_lr_tfidf.score(X_test_tfidf_selected, y_test)
lr_bow_acc = grid_search_lr_bow.score(X_test_bow_selected, y_test)

In [None]:
print(lr_tfidf_acc)
print(lr_bow_acc)

0.9558408052398731
0.9543782192788816


In [None]:
# TF-IDF Naive Bayes
nb_tfidf = MultinomialNB()
param_grid = {'alpha': [0.1, 0.5, 1, 5, 10]}
grid_search_nb_tfidf = GridSearchCV(nb_tfidf, param_grid, cv=5)
grid_search_nb_tfidf.fit(X_train_tfidf_selected, y_train)

# Bag of Words Naive Bayes
nb_bow = MultinomialNB()
param_grid = {'alpha': [0.1, 0.5, 1, 5, 10]}
grid_search_nb_bow = GridSearchCV(nb_bow, param_grid, cv=5)
grid_search_nb_bow.fit(X_train_bow_selected, y_train)

# Evaluate the models
nb_tfidf_acc = grid_search_nb_tfidf.score(X_test_tfidf_selected, y_test)
nb_bow_acc = grid_search_nb_bow.score(X_test_bow_selected, y_test)

In [None]:
print(nb_tfidf_acc)
print(nb_bow_acc)

0.9116634417100447
0.9390165244960438


In [26]:
# TF-IDF Decision Tree
dt_tfidf = DecisionTreeClassifier()
param_grid = {'max_features': ["sqrt", "log2"], 'max_depth': [10, 20, 50], 'min_samples_split': [2, 5, 10]}
grid_search_dt_tfidf = GridSearchCV(dt_tfidf, param_grid, cv=5)
grid_search_dt_tfidf.fit(X_train_bow_selected, y_train)

# Bag of Words Decision Tree
dt_bow = DecisionTreeClassifier()
param_grid = {'max_features': ["sqrt", "log2"], 'max_depth': [10, 20, 50], 'min_samples_split': [2, 5, 10]}
grid_search_dt_bow = GridSearchCV(dt_bow, param_grid, cv=5)
grid_search_dt_bow.fit(X_train_bow_selected, y_train)

# Evaluate the models
dt_tfidf_acc = grid_search_dt_tfidf.score(X_test_tfidf_selected, y_test)
dt_bow_acc = grid_search_dt_bow.score(X_test_bow_selected, y_test)

In [27]:
print(dt_tfidf_acc)
print(dt_bow_acc)

0.6750061319597743
0.7984810908529328


In [19]:
# TF-IDF XGBoost
xgb_tfidf = XGBClassifier()
param_grid = {'n_estimators': [100, 200, 500], 'max_depth': [3, 5, 10], 'learning_rate': [0.01, 0.1, 0.5]}
grid_search_xgb_tfidf = GridSearchCV(xgb_tfidf, param_grid, cv=5)
grid_search_xgb_tfidf.fit(X_train_bow_selected, y_train)

# Bag of Words XGBoost
xgb_bow = XGBClassifier()
param_grid = {'n_estimators': [100, 200, 500], 'max_depth': [3, 5, 10], 'learning_rate': [0.01, 0.1, 0.5]}
grid_search_xgb_bow = GridSearchCV(xgb_bow, param_grid, cv=5)
grid_search_xgb_bow.fit(X_train_bow_selected, y_train)

# Evaluate the models
xgb_tfidf_acc = grid_search_xgb_tfidf.score(X_test_tfidf_selected, y_test)
xgb_bow_acc = grid_search_xgb_bow.score(X_test_bow_selected, y_test)

In [20]:
print(xgb_tfidf_acc)
print(xgb_bow_acc)

0.5591075500322495
0.9589476648588741


In [None]:
print("Logistic Regression Accuracy - TF-IDF: {:.2f}, BoW: {:.2f}".format(lr_tfidf_acc, lr_bow_acc))
print("Naive Bayes Accuracy - TF-IDF: {:.2f}, BoW: {:.2f}".format(nb_tfidf_acc, nb_bow_acc))
print("Decision Tree Accuracy - TF-IDF: {:.2f}, BoW: {:.2f}".format(dt_tfidf_acc, dt_bow_acc))
print("XGBoost Accuracy - TF-IDF: {:.2f}, BoW: {:.2f}".format(xgb_tfidf_acc, xgb_bow_acc))

XGBoost with Bag of Words has the best accuracy.

In [29]:
 grid_search_xgb_bow.best_params_

{'learning_rate': 0.5, 'max_depth': 10, 'n_estimators': 500}

In [31]:
XGBClassifier(learning_rate= 0.5, max_depth= 10, n_estimators= 500)