In [1]:
import os 

# Project root = parent of the directory the notebook is executed from;
# the data path used when loading the CSV is joined onto it.
project_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

In [2]:
import pandas as pd

# Read the preprocessed dataset from the project's data folder and preview it.
preprocessed_csv_path = os.path.join(project_dir, 'data/preprocessed/preprocessed_data.csv')
df = pd.read_csv(preprocessed_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,text,sentiment,chars,words_total
0,0,responded going,1,36,7
1,1,sooo sad miss san diego,0,46,10
2,2,boss bullying,0,25,5
3,3,interview leave alone,0,31,5
4,4,sons put releases already bought,0,75,14


In [3]:
# Remove the leftover `Unnamed: ...` column (presumably an index written out
# when the CSV was saved) by NAME rather than by position. The positional
# form `df[df.columns[1:]]` is not idempotent: running the cell a second time
# would silently drop the `text` column as well.
unnamed_columns = [col for col in df.columns if str(col).startswith("Unnamed")]
df = df.drop(columns=unnamed_columns)
df.head()

Unnamed: 0,text,sentiment,chars,words_total
0,responded going,1,36,7
1,sooo sad miss san diego,0,46,10
2,boss bullying,0,25,5
3,interview leave alone,0,31,5
4,sons put releases already bought,0,75,14


In [4]:
# Display the (rows, columns) dimensions of the frame at this point.
df.shape

(27480, 4)

In [5]:
# Count missing values per column to see which columns need cleaning.
df.isna().sum()

text           106
sentiment        0
chars            0
words_total      0
dtype: int64

In [6]:
# Discard every row that contains at least one missing value, then re-count
# the nulls per column to confirm none remain. The index is not reset here.
df = df.dropna(axis=0, how='any')
df.isna().sum()

text           0
sentiment      0
chars          0
words_total    0
dtype: int64

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words features, with the vocabulary capped at `bow_max_features` terms.
# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the
# train/test split performed further down, so the vocabulary is learned from
# rows that later end up in the test set. Consider fitting on the training
# texts only (e.g. via a Pipeline) for an unbiased evaluation.
bow_max_features = 5000
cv = CountVectorizer(max_features=bow_max_features)
x = cv.fit_transform(df['text'])

In [8]:
# Display the (documents, vocabulary terms) dimensions of the count matrix.
x.shape

(27374, 5000)

In [9]:
# Target vector: the `sentiment` labels as a plain array, in the same row
# order as the text used to build the feature matrix.
sentiment_labels = df["sentiment"]
y = sentiment_labels.values
y

array([1, 0, 0, ..., 2, 2, 1])

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the CountVectorizer features/labels for evaluation; the
# fixed random_state keeps the partition reproducible across runs.
cv_split = train_test_split(x, y, test_size=0.2, random_state=2)
X_train, X_test, y_train, y_test = cv_split

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [12]:
# Settings shared by every ensemble model below, so they can be tuned in one place.
N_ESTIMATORS = 50
RANDOM_STATE = 2

# Single (non-ensemble) models.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')

# Ensemble models, all built with the same size and seed.
rfc = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
abc = AdaBoostClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
bc = BaggingClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
etc = ExtraTreesClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
gbdt = GradientBoostingClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)

In [13]:
# Registry of the models to benchmark: short display label -> estimator.
# Insertion order is the order in which the models are trained and reported.
clfs = dict(
    SVC=svc,
    KN=knc,
    DT=dtc,
    LR=lrc,
    RF=rfc,
    AdaBoost=abc,
    BgC=bc,
    ETC=etc,
    GBDT=gbdt,
)

In [14]:
from sklearn.metrics import accuracy_score, precision_score

def train_classifier(clf, X_train, y_train, X_test, y_test) -> tuple[float, float]:
    """Fit ``clf`` on the training data and score it on the test data.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place, so the passed object is modified.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        ``(accuracy, precision)`` as percentages (0-100). Precision is the
        support-weighted average over the classes.
    """
    clf.fit(X_train, y_train)
    predicted_labels = clf.predict(X_test)

    accuracy_pct = 100 * accuracy_score(y_test, predicted_labels)
    precision_pct = 100 * precision_score(y_test, predicted_labels, average="weighted")
    return float(accuracy_pct), float(precision_pct)

In [15]:
# Quick check of the helper on a single model (the SVC) before running
# the whole benchmark.
score_acc, score_prec = train_classifier(clfs["SVC"], X_train, y_train, X_test, y_test)
score_acc, score_prec

(58.84931506849315, 58.83033365428674)

In [16]:
from typing import Any


def clf_metric_calculator(Xtrain, ytrain, Xtest, ytest, classifiers=None) -> tuple[list[float], list[float]]:
    """Train and score every classifier on one train/test partition.

    Args:
        Xtrain: Training feature matrix.
        ytrain: Training labels.
        Xtest: Test feature matrix.
        ytest: Test labels.
        classifiers: Optional mapping of label -> estimator to evaluate.
            When omitted (``None``), the notebook-level ``clfs`` registry is
            used, which is the original behaviour.

    Returns:
        ``(accuracy_scores, precision_scores)``: two lists of percentages,
        one entry per classifier, in the iteration order of the mapping.

    Note:
        Each estimator is fitted in place by ``train_classifier``, so calling
        this again with other data refits the same estimator objects.
    """
    if classifiers is None:
        # Fall back to the global registry so existing calls keep working.
        classifiers = clfs

    accuracy_scores = []
    precision_scores = []

    # Only the estimators are needed here; the labels stay with the caller.
    for clf in classifiers.values():
        current_acc, current_prec = train_classifier(clf, Xtrain, ytrain, Xtest, ytest)
        accuracy_scores.append(current_acc)
        precision_scores.append(current_prec)
    return accuracy_scores, precision_scores

In [17]:
# Benchmark every registered classifier on the CountVectorizer features.
accuracy_scores_cv, precision_scores_cv = clf_metric_calculator(X_train, y_train, X_test, y_test)



In [18]:
# Collect the CountVectorizer results into one table, ordered from lowest to
# highest precision (the best-precision model ends up in the last row).
cv_results = {
    'Algorithm': clfs.keys(),
    'Accuracy': accuracy_scores_cv,
    "Precision": precision_scores_cv,
}
performance_df_cv = pd.DataFrame(cv_results).sort_values(by='Precision')
performance_df_cv

Unnamed: 0,Algorithm,Accuracy,Precision
0,SVC,58.849315,58.830334
1,KN,55.47032,61.353193
2,DT,49.059361,65.251757
8,GBDT,61.004566,67.990493
5,AdaBoost,63.817352,68.103484
6,BgC,68.200913,68.158635
7,ETC,69.13242,69.053188
4,RF,69.260274,69.245766
3,LR,70.575342,71.287147


In [19]:
# TF-IDF features over the full vocabulary.
#
# The matrix is kept in the sparse form returned by `fit_transform` instead of
# being densified with `.toarray()`: a dense float64 array of this shape
# (27374 x 23319) needs roughly 5 GB of RAM, and every model in `clfs` has
# already been trained on sparse input in the CountVectorizer run.
#
# NOTE(review): the CountVectorizer run capped the vocabulary at 5000 terms
# while this vectorizer is uncapped, so the two benchmarks are not strictly
# like-for-like. The vectorizer is also fitted before the train/test split.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['text'])
X.shape

(27374, 23319)

In [20]:
# Same 80/20 split settings as the CountVectorizer run, applied to the TF-IDF
# features. Note the names differ from that run only by letter case.
tfidf_split = train_test_split(X, y, test_size=0.2, random_state=2)
x_train, x_test, Y_train, Y_test = tfidf_split

In [21]:
# Benchmark every registered classifier on the TF-IDF features.
# NOTE(review): the results are stored under the same `*_cv` names used for
# the CountVectorizer run, so those earlier score lists are overwritten here.
# Consider dedicated names (e.g. `*_tfidf`) once downstream usage is checked.
accuracy_scores_cv, precision_scores_cv = clf_metric_calculator(x_train, Y_train, x_test, Y_test)