In [1]:
import pandas as pd
# Really we should use cross-validation to choose params, not train-test-split.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.base import BaseEstimator

In [2]:
# The first column of weather.csv has no header, so a plain read_csv surfaces it
# as a junk "Unnamed: 0" column. It is presumably the index written by an
# earlier DataFrame.to_csv call (TODO confirm); read it back as the index.
# Only the 'text' and 'label' columns are used further down.
df = pd.read_csv('weather.csv', index_col=0)
df.head()

Unnamed: 0,text,label
0,How hot is it today?,temperature
1,Is it hot outside?,temperature
2,Will it be uncomfortably hot?,temperature
3,Will it be sweltering?,temperature
4,How cold is it today?,temperature


In [3]:
# Off-topic samples: lines of verse that have nothing to do with weather
# questions. From http://www.richkni.co.uk/php/text/text.php
# Written as one text block (one sample per line) so the apostrophes need no
# escaping; splitlines() turns it back into a list of strings.
offtopic = """\
Now is the winter of our discontent
Made glorious summer by this sun of York;
And all the clouds that lour'd upon our house
In the deep bosom of the ocean buried.
Now are our brows bound with victorious wreaths;
Our bruised arms hung up for monuments;
Our stern alarums changed to merry meetings,
Our dreadful marches to delightful measures.
Grim-visaged war hath smooth'd his wrinkled front;
And now, instead of mounting barded steeds
To fright the souls of fearful adversaries,
He capers nimbly in a lady's chamber
To the lascivious pleasing of a lute.
But I, that am not shaped for sportive tricks
Nor made to court an amorous looking-glass
I, that am rudely stamp'd, and want love's majesty
To strut before a wanton ambling nymph;
I, that am curtail'd of this fair proportion,
""".splitlines()

In [4]:
class TextClassifier(BaseEstimator):
    """Text classifier: character n-gram TF-IDF features + LogisticRegressionCV.

    Parameters
    ----------
    tfidf_params : dict or None
        Extra keyword arguments for ``TfidfVectorizer``. ``analyzer='char'`` is
        always passed by this class; ``ngram_range`` defaults to ``(2, 3)``
        when the dict does not provide it.
    classifier_params : dict or None
        Keyword arguments for ``LogisticRegressionCV``.
    anomaly_detector_params : dict or None
        Second argument passed to ``AnomalyClassifier``.
    preprocessor : callable or None
        Optional ``preprocessor(texts, labels) -> (texts, labels)`` hook that
        is applied to the training data at the start of ``fit``.

    Notes
    -----
    Constructor arguments are stored on ``self`` exactly as received and are
    never mutated; the sub-estimators are built from private copies. This
    keeps ``get_params`` / ``clone`` working and avoids sharing state between
    instances through default arguments.
    """

    def __init__(self, tfidf_params=None, classifier_params=None,
                 anomaly_detector_params=None, preprocessor=None):
        # Every __init__ argument must be readable back as an attribute of the
        # same name for BaseEstimator.get_params to work.
        self.tfidf_params = tfidf_params
        self.classifier_params = classifier_params
        self.anomaly_detector_params = anomaly_detector_params
        self.preprocessor = preprocessor

        # Work on copies so neither the caller's dicts nor a shared default
        # object is modified when the ngram_range default is filled in.
        effective_tfidf_params = dict(tfidf_params or {})
        effective_tfidf_params.setdefault('ngram_range', (2, 3))

        self._tfidf = TfidfVectorizer(analyzer='char', **effective_tfidf_params)
        self._clf = LogisticRegressionCV(**dict(classifier_params or {}))
        # NOTE(review): AnomalyClassifier is not defined in the visible cells;
        # confirm it exists before instantiating this class. The detector is
        # created here but is not used by fit/predict/predict_proba below.
        self._anomaly_detector = AnomalyClassifier(dict(effective_tfidf_params),
                                                   dict(anomaly_detector_params or {}))

    def fit(self, texts, labels):
        """Fit the TF-IDF vocabulary and the classifier on ``texts``/``labels``.

        If a ``preprocessor`` was supplied it is first called as
        ``preprocessor(texts, labels)`` and its return value replaces both.

        Returns
        -------
        self
        """
        if self.preprocessor is not None:
            # The hook lives on the instance; the bare name would be a NameError.
            texts, labels = self.preprocessor(texts, labels)
        features = self._tfidf.fit_transform(texts)
        self._clf.fit(features, labels)
        return self

    def predict(self, text):
        """Return the predicted label for each document in ``text``.

        ``text`` is passed directly to ``TfidfVectorizer.transform``.
        """
        return self._clf.predict(self._tfidf.transform(text))

    def predict_proba(self, text):
        """Return the classifier's ``predict_proba`` output for ``text``.

        ``text`` is passed directly to ``TfidfVectorizer.transform``.
        """
        return self._clf.predict_proba(self._tfidf.transform(text))

In [5]:
# Baseline model: binary character 2-3-gram TF-IDF features feeding a
# cross-validated logistic regression.
clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(binary=True, analyzer='char', ngram_range=(2, 3))),
    ('clf', LogisticRegressionCV()),
])

# Full corpus: the weather questions plus the off-topic lines, the latter all
# labelled 'offtopic'.
all_texts = list(df['text']) + offtopic
all_labels = list(df['label']) + ['offtopic'] * len(offtopic)

# Stratify on the label so every class appears in both splits.
text_train, texts_test, labels_train, labels_test = train_test_split(
    all_texts,
    all_labels,
    stratify=all_labels,
    random_state=42,
)
clf.fit(text_train, labels_train)



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='char', binary=True, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 3), norm='l2', preprocessor=None, smooth_idf=True,
...    random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0))])

In [6]:
# Show each held-out sample with its predicted and true label, one item per
# line, followed by an 80-character rule.
for text, predicted_label, true_label in zip(texts_test, clf.predict(texts_test), labels_test):
    print(text, predicted_label, true_label, '-' * 80, sep='\n')

How much rain will fall today?
conditions
conditions
--------------------------------------------------------------------------------
Will it be cloudy?
conditions
conditions
--------------------------------------------------------------------------------
Is it chilly?
conditions
temperature
--------------------------------------------------------------------------------
To fright the souls of fearful adversaries,
offtopic
offtopic
--------------------------------------------------------------------------------
When will the cold subside?
temperature
temperature
--------------------------------------------------------------------------------
Are the winds dangerous?
conditions
conditions
--------------------------------------------------------------------------------
To strut before a wanton ambling nymph;
offtopic
offtopic
--------------------------------------------------------------------------------
He capers nimbly in a lady's chamber
conditions
offtopic
--------------------------

In [8]:
# Unseen sentences that are not weather questions.
new_text = [
    "Turn off the light",
    "What do you think about ELEX?",
    "Yet another offtopic sample",
]
for text, predicted_label in zip(new_text, clf.predict(new_text)):
    # One print call: sample, predicted label and an 80-character rule.
    print(text, predicted_label, '-' * 80, sep='\n')

Turn off the light
offtopic
--------------------------------------------------------------------------------
What do you think about ELEX?
offtopic
--------------------------------------------------------------------------------
Yet another offtopic sample
offtopic
--------------------------------------------------------------------------------


In [9]:
# Unseen weather questions, one about temperature and one about conditions.
new_text = [
    "How low temperature we're expecting?",
    "What weather conditions we're expecting?",
]
for text, predicted_label in zip(new_text, clf.predict(new_text)):
    # One print call: sample, predicted label and an 80-character rule.
    print(text, predicted_label, '-' * 80, sep='\n')

How low temperature we're expecting?
temperature
--------------------------------------------------------------------------------
What weather conditions we're expecting?
conditions
--------------------------------------------------------------------------------
