### Imports

In [1]:
import pandas as pd

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

### Fetch dataset and create training/test data

In [2]:
# Load the dataset. Malformed rows are still dropped, but on_bad_lines='warn'
# reports each dropped line instead of discarding it silently, so unexpected
# data loss is visible in the cell output.
df = pd.read_csv('new_IMDB_Dataset.csv', on_bad_lines='warn')

# Features come from the 'review' column, labels from the 'sentiment' column.
X, y = df['review'], df['sentiment']

# Hold out 20% of the rows for testing. Stratifying on the labels keeps the
# class proportions equal across both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Linear Support Vector Classifier

In [3]:

# Baseline model: token counts from a default CountVectorizer feeding a
# linear SVM (hinge loss, L2 penalty).
pipe = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('linearsvc', LinearSVC(
        penalty='l2',
        loss='hinge',
        max_iter=10000,
        random_state=42,
    )),
])
pipe.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = pipe.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)



              precision    recall  f1-score   support

    negative       0.87      0.87      0.87      5000
    positive       0.87      0.87      0.87      5000

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



### Linear SVC - Fine-tuned

In [14]:

# Search space: n-gram range and stop-word handling for the vectorizer, plus
# the SVM's stopping tolerance. 3 * 2 * 1 * 3 = 18 candidate settings.
parameters = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vectorizer__stop_words': ['english', None],
    'vectorizer__analyzer': ['word'],
    'linearsvc__tol': [1e-3, 1e-4, 1e-5],
}

# Same estimator settings as the baseline cell, including max_iter=10000.
# The cap was 2000 here, which is inconsistent with the baseline and tight for
# the smaller tolerances and larger n-gram vocabularies in the grid. Raising
# it only changes fits that would otherwise have stopped at the cap before
# reaching `tol`, so converged candidates are unaffected.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('linearsvc', LinearSVC(
        loss='hinge',
        penalty='l2',
        random_state=42,
        max_iter=10000,
    ))
])

# 5-fold cross-validation on the training split only, ranked by macro F1.
clf = GridSearchCV(pipe, parameters, scoring='f1_macro', cv=5)
clf.fit(X_train, y_train)

# predict() delegates to the best candidate, which GridSearchCV refits on the
# whole training split by default (refit=True).
y_pred = clf.predict(X_test)

print("best params:", clf.best_params_)
print()
report = classification_report(y_test, y_pred)
print(report)



best params: {'linearsvc__tol': 0.001, 'vectorizer__analyzer': 'word', 'vectorizer__ngram_range': (1, 3), 'vectorizer__stop_words': None}

              precision    recall  f1-score   support

    negative       0.91      0.91      0.91      5000
    positive       0.91      0.91      0.91      5000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000

