In [8]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
import pickle

### Read Data

In [9]:
# Load the labelled dataset; the path is resolved relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='./data/data.csv')
# Preview the first five rows to sanity-check the columns.
df.head(5)

Unnamed: 0,text,class,file_id
0,okay so um yes we do keep uh well we started o...,family_finance,2022
1,all right uh weve got a lot of them laughter t...,credit_card,2023
2,hello hey how you doing not bad not bad Whats...,credit_card,2061
3,well um with credit cards is me i i try to kee...,credit_card,2067
4,hi how are you doing uh i couldnt hear you i s...,job_benefits,2085


### Train Test Split

In [10]:
# Model input is the 'text' column; the target is the 'class' column.
X, y = df['text'], df['class']
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

### Combine count vectorizer and tf-idf transformer

In [11]:
# Fit a standalone TF-IDF vectorizer on the training text and keep the transformed
# training matrix. NOTE: the Pipeline built in the next section constructs its own
# TfidfVectorizer, so neither of these two names is consumed by the classifier below.
tf_idf_vector = TfidfVectorizer()
X_train_tfidf_vector = tf_idf_vector.fit_transform(X_train)

### Train classifier by combining vectorization and training using a pipeline

In [12]:
# Chain TF-IDF feature extraction with a linear SVM so the fitted object
# accepts raw text directly and handles vectorization internally.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)
# Fit both steps on the training split; the returned pipeline is displayed as the cell output.
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

### Evaluate Trained Model

In [13]:
# Predict a class label for every held-out document with the fitted pipeline.
predictions = text_clf.predict(X_test)

In [14]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[ 0  0  0  0  0  1]
 [ 0  1  0  0  0  1]
 [ 0  0  9  0  0  0]
 [ 0  0  0 14  0  0]
 [ 0  0  0  0 10  0]
 [ 0  0  0  0  0 12]]


In [15]:
# Per-class precision / recall / F1 on the held-out split.
# Some classes receive no predicted samples on this small test set, which makes
# precision undefined. zero_division=0 reports those scores as 0.0 (the same value
# sklearn falls back to by default) without emitting the UndefinedMetricWarning.
print(classification_report(y_test, predictions, zero_division=0))

                precision    recall  f1-score   support

  bank_bailout       0.00      0.00      0.00         1
        budget       1.00      0.50      0.67         2
   credit_card       1.00      1.00      1.00         9
family_finance       1.00      1.00      1.00        14
  job_benefits       1.00      1.00      1.00        10
         taxes       0.86      1.00      0.92        12

      accuracy                           0.96        48
     macro avg       0.81      0.75      0.76        48
  weighted avg       0.94      0.96      0.95        48



  _warn_prf(average, modifier, msg_start, len(result))


### Save Model

In [16]:
filename = 'model.sav'

# Serialize the fitted pipeline. The context manager guarantees the file handle is
# flushed and closed even if pickling raises, instead of leaking an open handle.
with open(filename, 'wb') as model_file:
    pickle.dump(text_clf, model_file)

# some time later...

# load the model from disk
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# unpickle model files that you created yourself or otherwise trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Mean accuracy of the reloaded pipeline on the held-out split; it should match
# the accuracy of the in-memory model, confirming the round trip.
result = loaded_model.score(X_test, y_test)
print(result)

0.9583333333333334


In [21]:
# 24 is an index *label* carried over from the original DataFrame, not a position
# inside X_test; the lookup raises KeyError if that row is not in the test split.
# The pipeline expects an iterable of documents, hence the one-element list.
loaded_model.predict([X_test.loc[24]])

array(['taxes'], dtype=object)