In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import everygrams
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from typing import List, Generator, Optional, Tuple, Union
from sklearn.utils.extmath import softmax
import warnings

In [2]:
train = pd.read_csv("train.csv", index_col=0)
train.head()

Unnamed: 0_level_0,movie,dialogue,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,I thought you were in a meeting--? <BR> I am. ...,"[u'drama', u'romance']"
1,1,Are you sure you're okay? You're pale. <BR> I...,[u'drama']
2,2,Go on! Get out! <BR> Mom look don't say anythi...,[u'comedy']
3,3,I could have lost my fucking hands. <BR> That ...,"[u'mystery', u'thriller']"
4,4,Stick with me on this Gloria. I need you... <...,"[u'crime', u'thriller']"


In [3]:
train["genres"] = train["genres"].apply(eval)
train.drop(columns="movie", inplace=True)

In [4]:
train.head()

Unnamed: 0_level_0,dialogue,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,I thought you were in a meeting--? <BR> I am. ...,"[drama, romance]"
1,Are you sure you're okay? You're pale. <BR> I...,[drama]
2,Go on! Get out! <BR> Mom look don't say anythi...,[comedy]
3,I could have lost my fucking hands. <BR> That ...,"[mystery, thriller]"
4,Stick with me on this Gloria. I need you... <...,"[crime, thriller]"


In [5]:
class Tokenizer:
    def __init__(self) -> None:
        self.orig_text_series = None
        self.token_series = None
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.is_data_loaded = False
        self.is_tokenized = False
        self.ngram_range = None
        
    def load_data(self, text_series: pd.Series) -> None:
        self.orig_text_series = text_series
        self.is_data_loaded = True
    
    @staticmethod
    def _clean_text(text_series: pd.Series) -> None:
        res = text_series.str.replace(r"(<[^<>]+>|[^\w\s]+)", " ").str.lower()
        return res
    
    def _lemmatize(self, tokens: List[str]) -> Generator[str, None, None]:
        lemmatized_token_series = map(self.lemmatizer.lemmatize, tokens)
        return lemmatized_token_series
    
    @staticmethod
    def _remove_stopwords(lemmatized_tokens: Generator[str, None, None], stop_words: set) -> List[str]:
        res = [t for t in lemmatized_tokens if t not in stop_words]
        return res
    
    @staticmethod
    def _every_grams(unigrams, ngram_range: Tuple[int, int] = (1, 1)):
        res = [" ".join(x) for x in everygrams(unigrams, *ngram_range)]
        return res
    
    def tokenize(self, ngram_range: Tuple[int, int] = (1, 1), *, remove_stopwords: bool = True) -> None:
        if not self.is_data_loaded:
            raise Exception("text data is not loaded")
        text_series = self._clean_text(self.orig_text_series)
        token_series = text_series.apply(word_tokenize)
        unigrams = token_series.apply(self._lemmatize)
        if remove_stopwords:
            unigrams = unigrams.apply(lambda x: self._remove_stopwords(x, self.stop_words))
        self.token_series = unigrams.apply(lambda x: self._every_grams(x, ngram_range))
        self.is_tokenized = True
        self.ngram_range = ngram_range

In [6]:
def safe_predict_proba(clf: "classifier", X: np.ndarray) -> np.ndarray:
    if hasattr(clf, "predict_proba"):
        pred = clf.predict_proba(X)
    elif hasattr(clf, "decision_function"):
        pred = softmax(clf.decision_function(X))
    else:
        raise AttributeError(f"{repr(clf)} has no prediction methods")
    return pred

def problem_predict(pred: pd.Series) -> np.ndarray:
    problem_mask = pred.apply(lambda x: x.size == 0)
    if np.any(problem_mask):
        warnings.warn(f"Empty prediction for {problem_mask.sum()} objects. Replaced with highest prob class.")
    problem_idx = np.where(problem_mask)[0]
    return problem_idx

In [7]:
def cross_val_score(train: pd.DataFrame, clf: "classifier", *, n_splits: int = 5, tfidf_features: int = 20000, 
                    ngram_range: Tuple[int, int] = (1, 1), remove_stopwords: bool = True, proba_step: float = 0.005) -> dict:
    tfidf = TfidfVectorizer(
        max_features=tfidf_features,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        token_pattern=None,
        lowercase=False
    )
    tokenizer = Tokenizer()
    
    tokenizer.load_data(train["dialogue"])
    tokenizer.tokenize(ngram_range, remove_stopwords=remove_stopwords)
    train["tokens"] = tokenizer.token_series
    train.drop(columns="dialogue", inplace=True)
    
    cv_info = {}
    probas = np.arange(0, 1 + 1**(-38), proba_step)
    cv_scores = np.zeros((len(probas), n_splits))
    kf = KFold(n_splits=n_splits, shuffle=True)
    for split_idx, (train_idx, test_idx) in enumerate(tqdm(kf.split(train), total=n_splits)):
        train_ = train.loc[train_idx].explode("genres", ignore_index=True)
        test_ = train.loc[test_idx]
        X_train_, y_train_ = train_["tokens"], train_["genres"]
        X_train = tfidf.fit_transform(X_train_)
        y_train = y_train_.values
        clf.fit(X_train, y_train)
        y_test = test_["genres"]
        X_test = tfidf.transform(test_["tokens"])
        pred = safe_predict_proba(clf, X_test) 
        mlb = MultiLabelBinarizer(classes=clf.classes_)
        y_true = mlb.fit_transform(test_["genres"].apply(set))
        for threshold_idx, threshold in enumerate(tqdm(probas, leave=False)):
            y_pred = (pred >= threshold).astype(int)
            score = f1_score(y_true, y_pred, average="samples")
            cv_scores[threshold_idx, split_idx] = score
        
    cv_scores = pd.DataFrame(cv_scores)
    score_mean =  cv_scores.mean(axis=1)
    score_std = cv_scores.std(axis=1)
    cv_scores["threshold"] = probas
    cv_scores["mean"] = score_mean
    cv_scores["std"] = score_std
    cv_info["cv_scores"] = cv_scores
    
    max_mean_idx = cv_scores["mean"].argmax()
    best_threshold, max_score, max_score_std = cv_scores[["threshold", "mean", "std"]].iloc[max_mean_idx]
    cv_info["best_threshold"] = best_threshold
    cv_info["max_score"] = max_score
    cv_info["max_score_std"] = max_score_std
    
    cv_info["classifier"] = clf
    cv_info["preprocessing_params"] = {
        "ngram_range": ngram_range,
        "tfidf_features": tfidf_features
    }
    
    return cv_info

In [8]:
def get_classes(clf, row, threshold):
    return clf.classes_[row > threshold]


def make_submission(train: pd.DataFrame, clf: "classifier", *, ngram_range: Tuple[int, int] = (1, 1), 
                    tfidf_features: int, threshold: float, file_suffix: Union[float, str] = "") -> None:
    tfidf = TfidfVectorizer(
        max_features=tfidf_features,
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        token_pattern=None,
        lowercase=False
    )
    
    train_ = train.explode("genres", ignore_index=True)
    X_train_, y_train_ = train_["tokens"], train_["genres"]
    X_train = tfidf.fit_transform(X_train_)
    y_train = y_train_.values
    clf.fit(X_train, y_train)
    
    test = pd.read_csv("test.csv", index_col=0)
    tokenizer = Tokenizer()
    tokenizer.load_data(test["dialogue"])
    tokenizer.tokenize(ngram_range)
    test["tokens"] = tokenizer.token_series
    X_test = tfidf.transform(test["tokens"])
    pred = safe_predict_proba(clf, X_test)
    test["genres"] = [get_classes(clf, i, threshold) for i in pred]
    
    problem_idx = problem_predict(test["genres"])
    if problem_idx.size:
        test.iloc[problem_idx, test.columns.get_indexer(["genres"])] = clf.predict(X_test[problem_idx]).reshape(-1, 1)
    file_name = f"submission_{file_suffix}.csv" if file_suffix else "submission.csv"
    test["genres"].str.join(sep=" ").to_csv(file_name)
    
    return test["genres"], clf, ngram_range, tfidf, threshold

In [9]:
clf = LogisticRegression(C=3, n_jobs=-1, solver="newton-cg", dual=False)

cv_info = cross_val_score(train, clf, ngram_range=(1, 1), tfidf_features=30000, remove_stopwords=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=400.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=400.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=400.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=400.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=400.0), HTML(value='')))




In [10]:
cv_info

{'cv_scores':             0         1         2         3         4  threshold      mean  \
 0    0.167150  0.166330  0.166046  0.167259  0.166776      0.000  0.166712   
 1    0.313490  0.313196  0.311029  0.313561  0.311150      0.005  0.312485   
 2    0.377178  0.376030  0.371979  0.376112  0.374824      0.010  0.375224   
 3    0.421178  0.420543  0.414865  0.418042  0.418097      0.015  0.418545   
 4    0.455643  0.454869  0.448084  0.451167  0.452807      0.020  0.452514   
 ..        ...       ...       ...       ...       ...        ...       ...   
 395  0.000000  0.000000  0.000000  0.000000  0.000000      1.975  0.000000   
 396  0.000000  0.000000  0.000000  0.000000  0.000000      1.980  0.000000   
 397  0.000000  0.000000  0.000000  0.000000  0.000000      1.985  0.000000   
 398  0.000000  0.000000  0.000000  0.000000  0.000000      1.990  0.000000   
 399  0.000000  0.000000  0.000000  0.000000  0.000000      1.995  0.000000   
 
           std  
 0    0.000521  
 1 

In [11]:
predictions, fitted_clf, ngram_range, fitted_tfidf, threshold = make_submission(
    train, clf=cv_info["classifier"], **cv_info["preprocessing_params"], threshold=cv_info["best_threshold"],
    file_suffix=round(10000 * cv_info["max_score"])
)

In [12]:
np.mean([len(l) for l in predictions])

2.069233223439328

In [13]:
class Prophet:
    def __init__(self, fitted_clf: "classifier", ngram_range: Tuple[int, int], fitted_tfidf: "tfidf", 
                 threshold: float) -> None:
        self.clf = fitted_clf
        self.threshold = threshold
        self.tokenizer = Tokenizer()
        self.tfidf = fitted_tfidf
        
    def predict(self, sentence: Union[str, pd.Series], *, print_tokens: bool = False) -> pd.Series:
        if not isinstance(sentence, pd.Series):
            sentence_series = pd.Series(sentence)
        self.tokenizer.load_data(sentence_series)
        self.tokenizer.tokenize(ngram_range)
        token_series = self.tokenizer.token_series
        if print_tokens:
            print(token_series)
        X = self.tfidf.transform(token_series)
    
        pred = safe_predict_proba(clf, X) 
    
        res = pd.Series([get_classes(clf, i, threshold) for i in pred])
    
        problem_idx = problem_predict(res)
        if problem_idx.size:
            res.iloc[problem_idx] = clf.predict(X[problem_idx])
        
        return res

In [14]:
prophet = Prophet(fitted_clf, ngram_range, fitted_tfidf, threshold)

In [15]:
prophet.predict("With great power comes great responsibility.")

0    [romance, sci-fi]
dtype: object

In [16]:
prophet.predict("""\
Let me tell you something you already know. The world ain't all sunshine and rainbows. 
It's a very mean and nasty place and I don't care how tough you are it will beat you to your knees and keep you there 
permanently if you let it. You, me, or nobody is gonna hit as hard as life. But it ain't about how hard ya hit. 
It's about how hard you can get hit and keep moving forward. How much you can take and keep moving forward. 
That's how winning is done! Now if you know what you’re worth, go out and get what you’re worth, but you gotta be willing 
to take the hits and not pointing fingers, saying you ain’t where you wanna be because of him, or her, or anybody! 
Cowards do that and that ain’t you! You’re better than that!
""")

0    [drama]
dtype: object