In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("./train.csv")

In [5]:
from collections import Counter

In [6]:
Counter(df["categories"])

Counter({'ASK_A_DOCTOR': 9440,
         'MISCELLANEOUS': 9736,
         'APPOINTMENTS': 11098,
         'LAB': 3457,
         'PRESCRIPTION': 12077,
         'JUNK': 17})

In [8]:
df.shape[0]

45825

In [9]:
type(df.iloc[0][2])

str

In [10]:
#pre-processing
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()
X = []
for i in range(df.shape[0]):
    X.append(clean_str(str(df.iloc[i][2])))
y = np.array(df["categories"])

In [11]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

In [12]:
#pipeline of feature engineering and model
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

model = Pipeline([('vectorizer', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
 ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])
#the class_weight="balanced" option tries to remove the biasedness of model towards majority sample

In [13]:
#paramater selection
from sklearn.grid_search import GridSearchCV

parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2), (2,2)],
               'tfidf__use_idf': (True, False)}



In [14]:
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)

In [15]:
gs_clf_svm = gs_clf_svm.fit(X, y)

In [16]:
#preparing the final pipeline using the selected parameters
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

In [17]:
#fit model with training data
model.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
       ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))])

In [18]:
#evaluation on test data
pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)

array([[2763,  130,    0,   46,  230,   80],
       [ 206, 2245,    0,   48,  265,  275],
       [   1,    0,    5,    0,    5,    0],
       [  37,   39,    0,  812,   57,   23],
       [ 232,  206,    1,  109, 2127,  222],
       [  71,  229,    0,   36,  158, 3090]], dtype=int64)

In [27]:
pred

array(['ASK_A_DOCTOR', 'LAB', 'MISCELLANEOUS', ..., 'ASK_A_DOCTOR',
       'PRESCRIPTION', 'PRESCRIPTION'], dtype='<U13')

In [19]:
df2 = pd.read_csv("./test.csv")

In [20]:
Z = []
for i in range(df2.shape[0]):
    Z.append(clean_str(str(df2.iloc[i][1])))

In [21]:
pred2 = model.predict(Z)

In [22]:
df_pred2 = pd.DataFrame(pred2)

In [24]:
df_pred2.to_csv("./mysubmission.csv")