In [4]:
import pandas as pd

df_unbalanced = pd.read_csv('data_input/training_data.csv')

df = df_unbalanced.groupby('label', group_keys=False).apply(
    lambda x: x.sample(n=min(len(x), 900), random_state=19)
).reset_index(drop=True)  # Reset index to remove unwanted artifacts

# Explicitly keep only the original columns to ensure structure is intact
df = df[df_unbalanced.columns]


  df = df_unbalanced.groupby('label', group_keys=False).apply(


In [5]:
X = df['text']
y = df['label']
len(X)

2996

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19)  # 80% training and 20% test
len(X_train)

2396

In [8]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [6]:
pipeMNB = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')), ('clf', MultinomialNB())])
pipeCNB = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')), ('clf', ComplementNB())])
pipeSVC = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')), ('clf', LinearSVC())])

In [7]:
pipeMNB.fit(X_train, y_train)
pipeCNB.fit(X_train, y_train)
pipeSVC.fit(X_train, y_train)

predictMNB = pipeMNB.predict(X_test)
predictCNB = pipeCNB.predict(X_test)
predictSVC = pipeSVC.predict(X_test)

print(accuracy_score(y_test, predictMNB))
print(accuracy_score(y_test, predictCNB))
print(accuracy_score(y_test, predictSVC))

0.8466666666666667
0.88
0.9366666666666666


In [9]:
# joblib.dump(pipeMNB, 'model_mnb.pkl')
# joblib.dump(pipeCNB, 'model_cnb.pkl')
# joblib.dump(pipeSVC, 'model_svc.pkl')


# load back the models
pipeMNB = joblib.load('model_mnb.pkl')
pipeCNB = joblib.load('model_cnb.pkl')
pipeSVC = joblib.load('model_svc.pkl')

In [10]:
X_wipo = df[df['label'] == 'wipo']['text']
Y_wipo = df[df['label'] == 'wipo']['label']

predict_svc_wipo = pipeSVC.predict(X_wipo)
predict_cnb_wipo = pipeCNB.predict(X_wipo)
predict_mnb_wipo = pipeMNB.predict(X_wipo)

print('Accuracy SVC wipo:', accuracy_score(Y_wipo, predict_svc_wipo))
print('Accuracy CNB wipo:', accuracy_score(Y_wipo, predict_cnb_wipo))
print('Accuracy MNB wipo:', accuracy_score(Y_wipo, predict_mnb_wipo))

Accuracy SVC wipo: 0.99
Accuracy CNB wipo: 0.9555555555555556
Accuracy MNB wipo: 0.9377777777777778
