In [1]:
import pandas as pd
from turpy.preprocess import TextPreprocesser
from turpy.augmentation import KeyboardAugmentator, EDAAugmentator
from turpy.models import TfIdfClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the tweets, then shuffle every row with a fixed seed.
df = pd.read_csv("data/tweets.csv")
df = df.sample(frac=1, random_state=42)

X, y = df["tweet"], df["label"]

# Configure the text clean-up; the step sequence is passed explicitly via `order`.
# NOTE(review): "replace_punctuations" is listed before the URL / hashtag / tag
# replacers -- presumably `order` controls execution order, so confirm those
# replacers still find their targets once punctuation has been handled.
preprocess = TextPreprocesser(
    lowercase=True,
    remove_diacritics=True,
    remove_extra_whitespace=True,
    replace_urls=True,
    replace_hashtags="<HTAG>",
    replace_tags="<TAG>",
    replace_punctuations=True,
    order=[
        "lowercase",
        "remove_diacritics",
        "replace_punctuations",
        "remove_extra_whitespace",
        "replace_urls",
        "replace_hashtags",
        "replace_tags",
    ],
)
X = preprocess.transform(X)

# Keep 20% of the preprocessed tweets aside for scoring.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build augmented samples from the training split only.
aug_kb = KeyboardAugmentator(aug_char_max=2)
X_aug_kb, y_aug_kb = aug_kb.fit_transform(X_train, y_train, n=5)

aug_eda = EDAAugmentator(
    synonym_insertion_prob=0,
    synonym_replacement_prob=0,
)
X_aug_eda, y_aug_eda = aug_eda.fit_transform(X_train, y_train, n=1)

# Stack the original and augmented training data under a fresh 0..N-1 index.
X_full = pd.concat(
    [X_train, X_aug_kb, X_aug_eda],
    axis=0,
    ignore_index=True,
)
y_full = pd.concat(
    [y_train, y_aug_kb, y_aug_eda],
    axis=0,
    ignore_index=True,
)

# Fit the tf-idf classifier on the combined training set.
model = TfIdfClassifier()
model.fit(X_full, y_full)

# The score may differ between runs: no random_state is set for the keyboard
# augmentation, so this cell is not fully reproducible.
model.score(X_test, y_test)

100%|██████████| 3905/3905 [00:00<00:00, 8488.39it/s]


0.5977482088024565

In [3]:
# Baseline run: the same classifier trained on the raw tweets, with no
# preprocessing and no augmentation.
# (Author's note: the default TfIdfClassifier is basically tf-idf + ridge.)

# Reload and shuffle with the same seed as the augmented experiment.
df = pd.read_csv("data/tweets.csv")
df = df.sample(frac=1, random_state=42)

X, y = df["tweet"], df["label"]

# Same 80/20 split parameters as the augmented experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

model = TfIdfClassifier()
model.fit(X_train, y_train)

model.score(X_test, y_test)

0.593654042988741