In [3]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [4]:
df = pd.read_csv("sentiment_analysis.csv", header=[0])

In [5]:
display(df)

Unnamed: 0,ID,text,label
0,7.680980e+17,Josh Jenkins is looking forward to TAB Breeder...,1
1,7.680980e+17,RT @MianUsmanJaved: Congratulations Pakistan o...,1
2,7.680980e+17,"RT @PEPalerts: This September, @YESmag is taki...",1
3,7.680980e+17,"RT @david_gaibis: Newly painted walls, thanks ...",1
4,7.680980e+17,RT @CedricFeschotte: Excited to announce: as o...,1
...,...,...,...
550386,8.046170e+17,@goddesses_o I can't stop watching her...mm. M...,0
550387,8.046180e+17,Poor old Tom Odell doesn't look like he would ...,0
550388,8.046180e+17,#antsmasher I smashed 7 ants in this awesome ...,1
550389,8.046180e+17,@LizHudston @KymWyllie @Evasmiless @meanBok @l...,1


In [6]:
df.drop('ID', axis=1, inplace=True)

In [7]:
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove user tags
    text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)  # Remove URLs
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)  # Remove non-alphanumeric characters
    return text.lower()

df['cleaned_text'] = df['text'].apply(clean_text)

In [8]:
display(df)

Unnamed: 0,text,label,cleaned_text
0,Josh Jenkins is looking forward to TAB Breeder...,1,josh jenkins is looking forward to tab breeder...
1,RT @MianUsmanJaved: Congratulations Pakistan o...,1,rt congratulations pakistan on becoming no1te...
2,"RT @PEPalerts: This September, @YESmag is taki...",1,rt this september is taking you to maine men...
3,"RT @david_gaibis: Newly painted walls, thanks ...",1,rt gaibis newly painted walls thanks a million...
4,RT @CedricFeschotte: Excited to announce: as o...,1,rt excited to announce as of july 2017 fescho...
...,...,...,...
550386,@goddesses_o I can't stop watching her...mm. M...,0,o i cant stop watching hermm more
550387,Poor old Tom Odell doesn't look like he would ...,0,poor old tom odell doesnt look like he would k...
550388,#antsmasher I smashed 7 ants in this awesome ...,1,antsmasher i smashed 7 ants in this awesome g...
550389,@LizHudston @KymWyllie @Evasmiless @meanBok @l...,1,morning girls have a wonderful friday


In [9]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'],df['label'], test_size=0.2, random_state=1624)

In [10]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [11]:
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [12]:
k = 1000 
selector = SelectKBest(chi2, k=k)
X_train_tfidf_selected = selector.fit_transform(X_train_tfidf, y_train)
X_test_tfidf_selected = selector.transform(X_test_tfidf)

In [13]:
selector = SelectKBest(chi2, k=k)
X_train_bow_selected = selector.fit_transform(X_train_bow, y_train)
X_test_bow_selected = selector.transform(X_test_bow)

In [None]:
# TF-IDF
lr_tfidf = LogisticRegression(max_iter=1000)
param_grid = {'C': [0.1, 1, 10, 100],
              'penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga']}
grid_search_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, cv=5)
grid_search_lr_tfidf.fit(X_train_tfidf_selected, y_train)

# BoW
lr_bow = LogisticRegression(max_iter=1000)
param_grid = {'C': [0.1, 1, 10, 100],
              'penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga']}
grid_search_lr_bow = GridSearchCV(lr_bow, param_grid, cv=5)
grid_search_lr_bow.fit(X_train_bow_selected, y_train)

# Evaluate the models
lr_tfidf_acc = grid_search_lr_tfidf.score(X_test_tfidf_selected, y_test)
lr_bow_acc = grid_search_lr_bow.score(X_test_bow_selected, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
print(lr_tfidf_acc)
print(lr_bow_acc)

In [None]:
# TF-IDF
knn_tfidf = KNeighborsClassifier()
param_grid = {'n_neighbors': [3, 5, 7, 9],
              'weights': ['uniform', 'distance'],
              'p': [1, 2, 3]}
grid_search_knn_tfidf = GridSearchCV(knn_tfidf, param_grid, cv=5)
grid_search_knn_tfidf.fit(X_train_tfidf_selected, y_train)

# BoW
knn_bow = KNeighborsClassifier()
param_grid = {'n_neighbors': [3, 5, 7, 9],
              'weights': ['uniform', 'distance'],
              'p': [1, 2, 3]}
grid_search_knn_bow = GridSearchCV(knn_bow, param_grid, cv=5)
grid_search_knn_bow.fit(X_train_bow_selected, y_train)

# Evaluate the models
knn_tfidf_acc = grid_search_knn_tfidf.score(X_test_tfidf_selected, y_test)
knn_bow_acc = grid_search_knn_bow.score(X_test_bow_selected, y_test)

In [None]:
print(knn_tfidf_acc)
print(knn_bow_acc)

In [None]:
# TF-IDF
nb_tfidf = MultinomialNB()
param_grid = {'alpha': [0.1, 0.5, 1, 5, 10]}
grid_search_nb_tfidf = GridSearchCV(nb_tfidf, param_grid, cv=5)
grid_search_nb_tfidf.fit(X_train_tfidf_selected, y_train)

# BoW
nb_bow = MultinomialNB()
param_grid = {'alpha': [0.1, 0.5, 1, 5, 10]}
grid_search_nb_bow = GridSearchCV(nb_bow, param_grid, cv=5)
grid_search_nb_bow.fit(X_train_bow_selected, y_train)

# Evaluate the models
nb_tfidf_acc = grid_search_nb_tfidf.score(X_test_tfidf_selected, y_test)
nb_bow_acc = grid_search_nb_bow.score(X_test_bow_selected, y_test)

In [None]:
print(nb_tfidf_acc)
print(nb_bow_acc)

In [None]:
# TF-IDF
svm_tfidf = SVC(max_iter=1000)
param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}
grid_search_svm_tfidf = GridSearchCV(svm_tfidf, param_grid, cv=5)
grid_search_svm_tfidf.fit(X_train_bow_selected, y_train)

# BoW
svm_bow = SVC(max_iter=1000)
param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}
grid_search_svm_bow = GridSearchCV(svm_bow, param_grid, cv=5)
grid_search_svm_bow.fit(X_train_bow_selected, y_train)

# Evaluate the models
svm_tfidf_acc = grid_search_svm_tfidf.score(X_test_tfidf_selected, y_test)
svm_bow_acc = grid_search_svm_bow.score(X_test_bow_selected, y_test)

In [None]:
print(svm_tfidf_acc)
print(svm_bow_acc)

In [None]:
print("Logistic Regression Accuracy - TF-IDF: {:.2f}, BoW: {:.2f}".format(lr_tfidf_acc, lr_bow_acc))
print("k-NN Accuracy - TF-IDF: {:.2f}, BoW: {:.2f}".format(knn_tfidf_acc, knn_bow_acc))
print("Naive Bayes Accuracy - TF-IDF: {:.2f}, BoW: {:.2f}".format(nb_tfidf_acc, nb_bow_acc))
print("SVM Accuracy - TF-IDF: {:.2f}, BoW: {:.2f}".format(svm_tfidf_acc, svm_bow_acc))

In [None]:
grid_search_svm_bow.best_params_