# Text Sentiment, no internet needed
We use a tiny labeled dataset and a simple bag-of-n-grams model: TF-IDF over character n-grams feeding a linear SVM.

In [8]:
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

# Class labels, in the order used for the confusion-matrix rows and columns.
LABELS = ['neg', 'pos']

# TF-IDF over character 3- to 5-grams feeding a linear SVM.
pipe = make_pipeline(
    TfidfVectorizer(analyzer='char', ngram_range=(3, 5), lowercase=True, min_df=1),
    LinearSVC(),  # robust on tiny text
)

# Out-of-fold predictions: each row is predicted by a model fit on the other folds.
# `df` must already hold the 'text' and 'label' columns (defined in an earlier cell).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred = cross_val_predict(pipe, df['text'], df['label'], cv=cv)

# Optional per-class precision/recall summary:
# print(classification_report(df['label'], y_pred, zero_division=0))

# Pass labels= explicitly so the matrix order is tied to the row/column names
# below instead of relying on confusion_matrix's implicit sorted-label order.
cm = confusion_matrix(df['label'], y_pred, labels=LABELS)
print(pd.DataFrame(cm, index=LABELS, columns=['pred_' + name for name in LABELS]))

# fit once for live preds
pipe.fit(df['text'], df['label'])
samples = [
    "I am very happy with this service",
    "This broke and wasted my time",
    "I want to try to pay but it does not work",
]
# One batched predict call instead of one call per sentence.
for s, pred in zip(samples, pipe.predict(samples)):
    print(s, "->", pred)


     pred_neg  pred_pos
neg         1         4
pos         3         2
I am very happy with this service -> pos
This broke and wasted my time -> neg
I want to try to pay but it does not work -> neg


## Takeaways
- Start with simple models and strong baselines
- Feature engineering and clean data often beat complex models
- Always keep a held-out test set (on a dataset this small, cross-validation is the stand-in)