# Getting 0.91... F1 score with TF-IDF vectorization and LinearSVC

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, roc_auc_score
from joblib import dump
from scipy.sparse import save_npz

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yalikesi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yalikesi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
train = pd.read_csv('train.csv', index_col='id')

Processing data: removing html tags, non-word characters, single letters and repeated spaces

In [3]:
stop_words = set(stopwords.words('english'))

def process(text):
      res = re.sub('<.*?>', ' ', text)
      res = re.sub('\W', ' ', res)
      res = re.sub('\s+[a-zA-Z]\s+', ' ', res)
      res = re.sub('\s+', ' ', res)
      word_tokens = word_tokenize(res)
      filtered_res = " ".join([w for w in word_tokens if w not in stop_words])
      return filtered_res

In [4]:
train['processed'] = train['review'].apply(lambda x: process(x))

In [5]:
tfidf = TfidfVectorizer(min_df=3, max_df=0.5, ngram_range=(1, 2))
x_train_vector = tfidf.fit_transform(train['processed'])
y_train = train['sentiment'].values

Using grid search for regularization parameter

In [6]:
params = {'C': [0.25, 0.5, 0.75, 1, 1.5, 2]}
gs = GridSearchCV(LinearSVC(), params, scoring='f1', n_jobs=-1)
gs.fit(x_train_vector, y_train)

GridSearchCV(estimator=LinearSVC(), n_jobs=-1,
             param_grid={'C': [0.25, 0.5, 0.75, 1, 1.5, 2]}, scoring='f1')

In [7]:
clf = CalibratedClassifierCV(gs.best_estimator_)
clf.fit(x_train_vector, y_train)

CalibratedClassifierCV(base_estimator=LinearSVC(C=0.5))

In [8]:
dump(clf, 'model.joblib')
dump(tfidf, 'tfidf.joblib')

['tfidf.joblib']

In [9]:
print('f1:', f1_score(y_train, clf.predict(x_train_vector)))
print('roc auc:', roc_auc_score(y_train, clf.predict_proba(x_train_vector)[:, 1]))

f1: 0.9915674478314496
roc auc: 0.9995341193469831


In [10]:
save_npz('encoded_words.npz', x_train_vector)
train.drop(columns=['sentiment', 'processed']).to_csv('reviews.csv')