In [2]:
import warnings

import numpy as np
from nltk import word_tokenize, wordpunct_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import make_scorer, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import pandas as pd

warnings.filterwarnings("ignore")
np.random.seed(1)

In [3]:
df = pd.read_csv('train.csv')
df = df[df.Tweet != 'Not Available']
X, y = df.Tweet, df.Category

In [4]:
X = X.str.replace('&nbsp;', ' ')
X = X.str.replace(r'&.+?;', '', regex=True)
X = X.str.replace(r'https?\S+', 'http', regex=True)
X = X.str.strip().str.lower()

In [5]:
pipe = Pipeline([
    ('vec', TfidfVectorizer()),
    ('to_array', FunctionTransformer(lambda x: x.toarray())),
    ('clf', LinearSVC())
])

In [6]:
param_grid = dict(
    vec__norm=[None, 'l1', 'l2'],
    vec__tokenizer=[None, word_tokenize, wordpunct_tokenize],
    vec__sublinear_tf=[True, False],
)
grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring=make_scorer(f1_score, average='micro'))

In [7]:
grid_search.fit(X, y)

In [8]:
clf = grid_search.best_estimator_
print(clf)

Pipeline(steps=[('vec',
                 TfidfVectorizer(norm='l1', sublinear_tf=True,
                                 tokenizer=<function word_tokenize at 0x7f38ac1609d0>)),
                ('to_array',
                 FunctionTransformer(func=<function <lambda> at 0x7f38a8401ee0>)),
                ('clf', LinearSVC())])


In [9]:
clf.fit(X, y)

In [10]:
df = pd.read_csv('test.csv')
df['Category'] = clf.predict(df.Tweet)
df.Category[df.Tweet == 'Not Available'] = 'positive'
df.drop(columns='Tweet').to_csv('submission.csv', index=False)