In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
import re  
import warnings
from matplotlib import pyplot as plt 
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None  # default='warn'
import string  

In [2]:
# Attach Google Drive at /content/gdrive; the input CSVs are read from this mount point.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [3]:
# Read the labelled training set and the unlabelled test set from the mounted Drive folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/gdrive/MyDrive/Code Review /Train.csv',
        '/content/gdrive/MyDrive/Code Review /Test.csv',
    )
)

In [4]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,ID,Text,Label
0,ID_AASHwXxg,Mwangonde: Khansala wachinyamata Akamati achi...,POLITICS
1,ID_AGoFySzn,MCP siidakhutire ndi kalembera Chipani cha Ma...,POLITICS
2,ID_AGrrkBGP,Bungwe la MANEPO Lapempha Boma Liganizire Anth...,HEALTH
3,ID_AIJeigeG,Ndale zogawanitsa miyambo zanyanya Si zachile...,POLITICS
4,ID_APMprMbV,Nanga wapolisi ataphofomoka? Masiku ano sichi...,LAW/ORDER


In [5]:
def remove_punct(text):
    """Return ``text`` with ASCII punctuation and ASCII digits removed.

    Characters listed in ``string.punctuation`` are deleted (not replaced by a
    space), then every run of the digits 0-9 is deleted. All other characters,
    including whitespace, are left untouched.
    """
    # One translation table that maps every punctuation character to "deleted".
    drop_punctuation = str.maketrans('', '', string.punctuation)
    without_punct = text.translate(drop_punctuation)
    return re.sub('[0-9]+', '', without_punct)

In [6]:
def preprocess(text):
    """Clean a single raw document.

    Currently this only delegates to ``remove_punct``; it exists as the single
    entry point to extend if more cleaning steps are added.
    """
    return remove_punct(text)

# Clean the text column of both frames in place.
# The function is passed directly: the former extra positional ``1`` was not an
# axis (Series.apply has none) but landed in the ``convert_dtype`` slot, which
# recent pandas versions deprecate.
train['Text'] = train['Text'].apply(preprocess)
test['Text'] = test['Text'].apply(preprocess)

In [7]:
# Learn the TF-IDF vocabulary and IDF weights from the cleaned training text.
# NOTE(review): the fit uses every training row, including rows that the
# cross-validation loop later holds out as validation folds.
vectorizer = TfidfVectorizer()
vectorizer.fit(train['Text']) 

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [8]:
# Column names used throughout: the text input column and the target column.
FEATURES, LABEL = 'Text', 'Label'

In [9]:
# Stratified K-fold training of an MLP on TF-IDF features.
# Produces:
#   y_oof  - out-of-fold class probabilities for every training row
#   y_test - test-set class probabilities averaged over the fold models
#   metrics - mean validation accuracy across folds
n_splits = 5

kf = StratifiedKFold(n_splits=n_splits, random_state=47, shuffle=True)
X_train = train[FEATURES].copy()
y_train = train[LABEL].copy()
X_test = test[FEATURES].copy()

# Fixed, sorted label order: column j of y_oof / y_test always refers to classes[j].
classes = np.sort(train[LABEL].unique())
column_of = {label: j for j, label in enumerate(classes)}
n_labels = classes.shape[0]
y_oof = np.zeros([X_train.shape[0], n_labels])
y_test = np.zeros([X_test.shape[0], n_labels])

# Raw text -> sparse TF-IDF matrices, using the already fitted vectorizer.
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

fold_accuracies = []
for i, (tr_idx, val_idx) in enumerate(kf.split(X_train, y_train)):

    X_tr, X_vl = X_train[tr_idx, :], X_train[val_idx, :]
    y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]

    model = MLPClassifier(random_state=47)
    model.fit(X_tr, y_tr)

    # predict_proba columns follow model.classes_; map them onto the global
    # label order so a fold that lacks a rare class still lines up correctly
    # (the missing class simply keeps probability 0 for that fold).
    cols = np.array([column_of[label] for label in model.classes_])

    y_pred = model.predict(X_vl)
    y_oof[np.ix_(val_idx, cols)] = model.predict_proba(X_vl)
    y_test[:, cols] += model.predict_proba(X_test) / n_splits

    # The reported value is classification accuracy, not a loss.
    metric = accuracy_score(y_vl, y_pred)
    print("fold #{} val_accuracy: {}".format(i, metric))
    fold_accuracies.append(metric)


# `metrics` ends up as the mean fold accuracy.
metrics = np.mean(fold_accuracies)
print(f'Full accuracy {metrics}')  # Full accuracy 0.6365

fold #0 val_loss: 0.6180555555555556
fold #1 val_loss: 0.6550522648083623
fold #2 val_loss: 0.6306620209059234
fold #3 val_loss: 0.6341463414634146
fold #4 val_loss: 0.6445993031358885
Full accuracy 0.6365030971738289


In [10]:
# Write the out-of-fold and test probability matrices to CSV in the working
# directory, one column per class and without the row index.
tmp_train, tmp_test = pd.DataFrame(y_oof), pd.DataFrame(y_test)
for frame, filename in ((tmp_train, 'mlp_train.csv'), (tmp_test, 'mlp_test.csv')):
    frame.to_csv(filename, index=False)

In [11]:
# Second Drive mount, this time at /content/drive (the inputs were read through
# /content/gdrive); the copy commands further down write via this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# If you have already created this directory, please do not run this cell :)
# !mkdir '/content/drive/MyDrive/AI4D_Malawi_News_Classification/DistilationInput' 

In [13]:
!cp /content/mlp_train.csv '/content/drive/MyDrive/Code Review '
!cp /content/mlp_test.csv '/content/drive/MyDrive/Code Review '

# **mlp_baseline.ipynb** notebook I/O:
###  - **inputs:** Train.csv, Test.csv
###  - **outputs:** mlp_train.csv, mlp_test.csv

# **-----------------------------------------------------------------------------------------**

# Outputs will be used in the **Distillation Notebook**