In [1]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import pandas as pd

In [None]:
# Load the dataset from the parquet file in the working directory.
big_df = pd.read_parquet("big_cleaned.parquet")

# Features are the "text_svm" column, targets the "label" column.
texts, labels = big_df["text_svm"], big_df["label"]

# Hold out 20% of the rows for validation. The split is stratified on the
# label so both partitions keep the same class proportions, and seeded so it
# is reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Text-classification pipeline: two TF-IDF views of the same text are
# concatenated by FeatureUnion and fed to a linear SVM.
svm_pipeline = Pipeline([
    ("feats", FeatureUnion(transformer_list=[
        # Word-level view: unigrams and bigrams, English stop words removed.
        # Terms seen in fewer than 2 documents or in more than 90% of the
        # documents are dropped; sublinear_tf replaces tf with 1 + log(tf).
        ("word_tfidf", TfidfVectorizer(
            ngram_range=(1, 2), min_df=2, max_df=0.9,
            stop_words="english", sublinear_tf=True
        )),
        # Character-level view: 3- to 5-character n-grams, keeping only
        # those that appear in at least 2 documents.
        ("char_tfidf", TfidfVectorizer(
            analyzer="char", ngram_range=(3, 5),
            min_df=2, sublinear_tf=True
        )),
    ])),
    # random_state pins the shuffling liblinear performs when it solves the
    # dual problem, so repeated runs yield the same model and metrics.
    # The seed matches the one used for the train/validation split.
    ("clf", LinearSVC(C=1.0, random_state=42))
])

# Fit the vectorizers and the classifier on the training partition only.
svm_pipeline.fit(X_train, y_train)

# Predict labels for the held-out validation texts.
pred = svm_pipeline.predict(X_val)

# Per-class precision / recall / F1 on the validation set, 3 decimal places.
val_report = classification_report(y_true=y_val, y_pred=pred, digits=3)
print(val_report)