In [1]:
import pandas as pd 
from sklearn.model_selection import StratifiedKFold
import numpy as np 

from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier

import cloudpickle
import string
import re  


import warnings
from matplotlib import pyplot as plt 
# Suppress every warning category for the rest of the notebook so cell
# output stays readable. NOTE(review): this also hides convergence and
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")
# Turn off pandas' chained-assignment check (SettingWithCopyWarning).
pd.options.mode.chained_assignment = None  # default='warn'

## Load Data

In [2]:
# Read the training and test CSV files from the shared input directory.
train_path, test_path = '../input/Train.csv', '../input/Test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [3]:
# Preview the first rows of the training frame (columns: ID, Text, Label).
train.head()

Unnamed: 0,ID,Text,Label
0,ID_AASHwXxg,Mwangonde: Khansala wachinyamata Akamati achi...,POLITICS
1,ID_AGoFySzn,MCP siidakhutire ndi kalembera Chipani cha Ma...,POLITICS
2,ID_AGrrkBGP,Bungwe la MANEPO Lapempha Boma Liganizire Anth...,HEALTH
3,ID_AIJeigeG,Ndale zogawanitsa miyambo zanyanya Si zachile...,POLITICS
4,ID_APMprMbV,Nanga wapolisi ataphofomoka? Masiku ano sichi...,LAW/ORDER


## Modeling

In [4]:
# Stratified k-fold cross-validation of a soft-voting MLP + SVC ensemble.
#
# Produces:
#   y_oof   - out-of-fold class probabilities for every training row
#   y_test  - test-set class probabilities averaged over the folds
#   metrics - mean validation accuracy over the folds (used in the
#             submission file name further down)
n_splits = 5

kf = StratifiedKFold(n_splits=n_splits, random_state=47, shuffle=True)

# Load the pretrained vectorizer.
# NOTE(security): unpickling executes arbitrary code stored in the file, so
# only load artifacts from a trusted source.
# The context manager closes the file handle once loading is done.
with open('../vectorizer/vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = cloudpickle.load(vectorizer_file)
X_train = vectorizer.predict(train['Text'])
X_test = vectorizer.predict(test['Text'])
y_train = train['Label'].copy()

# One probability column per distinct label seen in the training data.
n_labels = train['Label'].unique().shape[0]
y_oof = np.zeros([X_train.shape[0], n_labels])
y_test = np.zeros([X_test.shape[0], n_labels])

fold_scores = list()

for i, (tr_idx, val_idx) in enumerate(kf.split(X_train, y_train)):

    X_tr, X_vl = X_train[tr_idx, :], X_train[val_idx, :]
    y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]

    estimators = [('mlp', MLPClassifier(250,  random_state=47)),
                  ('svm',  SVC(C=1.0,probability=True,random_state=42))]

    model = VotingClassifier(estimators=estimators, voting='soft')
    model.fit(X_tr, y_tr)

    # Run inference on the validation fold once. With soft voting the
    # predicted label is the argmax of the averaged probabilities, so the
    # hard prediction is derived from them instead of calling predict().
    val_proba = model.predict_proba(X_vl)
    y_oof[val_idx, :] = val_proba
    y_pred = model.classes_[np.argmax(val_proba, axis=1)]

    metric = accuracy_score(y_vl, y_pred)
    # NOTE: the value printed here is accuracy, despite the "val_loss"
    # label; the text is kept so it matches the recorded cell output.
    print("fold #{} val_loss: {}".format(i, metric))

    # Each fold contributes an equal share to the averaged test prediction.
    y_test += model.predict_proba(X_test) / n_splits
    fold_scores.append(metric)


# Mean accuracy across folds.
metrics = np.mean(fold_scores)
print(f'Full accuracy {metrics}')  # Full accuracy 0.6922

fold #0 val_loss: 0.6666666666666666
fold #1 val_loss: 0.7421602787456446
fold #2 val_loss: 0.7073170731707317
fold #3 val_loss: 0.6759581881533101
fold #4 val_loss: 0.6689895470383276
Full accuracy 0.6922183507549361


In [5]:
# Turn the fold-averaged test probabilities into label names.
# `model` is the estimator left over from the final CV fold; its classes_
# array gives the label for each probability column of y_test.
class_ = model.classes_
y_sub = pd.DataFrame({'Label': class_[np.argmax(y_test, axis=1)]})

# Take an explicit copy before adding a column: assigning into the bare
# slice test[['ID']] is the chained-assignment pattern pandas warns about.
sub = test[['ID']].copy()
# Rows of y_test are positional with respect to `test`, so assign by
# position rather than relying on index alignment.
sub['LABEL'] = y_sub['Label'].to_numpy()
sub.head()

Unnamed: 0,ID,LABEL
0,ID_ADHEtjTi,SOCIAL ISSUES
1,ID_AHfJktdQ,RELIGION
2,ID_AUJIHpZr,RELATIONSHIPS
3,ID_AUKYBbIM,POLITICS
4,ID_AZnsVPEi,SOCIAL ISSUES


In [6]:
# Write the submission to disk; the file name embeds the mean CV accuracy
# rounded to four decimals.
submission_path = f'../sub/MLP_SVC_sub_{round(metrics,4)}.csv'
sub.to_csv(submission_path, index=False)