In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from time import localtime, strftime
import logging

logging.basicConfig(filename='amz.log', format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p',
                    level=logging.DEBUG)

In [2]:
logging.info(f"Application amz started with PID {os.getpid()}")
cur_time = strftime("%Y-%d-%m-%H-%M-%S", localtime())
dataset_home = "/mydata/datasets/amz"

In [3]:
train = pd.read_csv(f"{dataset_home}/train.csv", names=["target", "title", "data"])
test = pd.read_csv(f"{dataset_home}/test.csv", names=["target", "title", "data"])

In [4]:
train = train.drop(columns=['title'])
test = test.drop(columns=['title'])

In [5]:
# NEUTRAL = 0, NEG = 1, POS = 2
train.loc[(train.target == 2),'target'] = 1
train.loc[(train.target == 4),'target'] = 2
train.loc[(train.target == 5),'target'] = 2
train.loc[(train.target == 3),'target'] = 0

test.loc[(test.target == 2),'target'] = 1
test.loc[(test.target == 4),'target'] = 2
test.loc[(test.target == 5),'target'] = 2
test.loc[(test.target == 3),'target'] = 0

train_df = train.groupby('target').sample(n=60000, random_state=123).sample(frac=1)
train_df_index = train_df.index
train_df = train_df.reset_index(drop=True)
test_df = test.groupby('target').sample(n=10000, random_state=123).sample(frac=1).reset_index(drop=True)

In [15]:
import re
def remove_special_characters(text):
    text = text.lower()
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', text)
    return text

In [16]:
frames = [train_df, test_df]
corpus = pd.concat(frames, ignore_index=True)

vectorizer = TfidfVectorizer(stop_words='english', max_features=100000, ngram_range=(1, 3), preprocessor=remove_special_characters)
tfidf = vectorizer.fit_transform(corpus['data'])

X_train = tfidf[:train_df.shape[0]]
Y_train = train_df['target'].values

X_test = tfidf[train_df.shape[0]:]
Y_test = test_df['target'].values

In [17]:
vectorizer.vocabulary_

{'dr': 25558,
 'seuss': 78447,
 'favorite': 30606,
 'household': 41584,
 'bringing': 12585,
 'children': 16190,
 'spanish': 81924,
 'translation': 90082,
 'flawless': 32180,
 'wife': 96637,
 'poet': 66217,
 'translator': 90085,
 'differently': 23200,
 'dr seuss': 25568,
 'bought': 11716,
 'use': 92238,
 'wirelessly': 96867,
 'connect': 18541,
 'toshiba': 89644,
 'hddvd': 39583,
 'player': 65762,
 'wireless': 96845,
 'router': 75577,
 'hp': 41619,
 '1000': 91,
 'inkjet': 43506,
 'usb': 92218,
 'printer': 67752,
 'works': 97991,
 'installation': 43665,
 'huge': 41650,
 'pain': 63146,
 'really': 71735,
 'poor': 66431,
 'wizard': 97022,
 'establish': 28410,
 'communication': 17897,
 'ethernet': 28461,
 'line': 51027,
 'detect': 22054,
 'network': 59597,
 'immediately': 42758,
 'ip': 44374,
 'address': 2208,
 'conflict': 18489,
 'message': 56021,
 'device': 22138,
 'wired': 96844,
 'trick': 90254,
 'simply': 79492,
 'disconnect': 23742,
 'does': 24111,
 'tell': 86182,
 'dumb': 25941,
 'boug

In [25]:
clf = MLPClassifier(random_state=123, max_iter=500, early_stopping=True)
clf.fit(X_train, Y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=500, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=123, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [30]:
logging.info(f"n_iter_: {clf.n_iter_}")
logging.info(f"n_layers_: {clf.n_layers_}")
logging.info(f"loss_: {clf.loss_}")
score = clf.score(X_test, Y_test)
logging.info(f"Mean test accuracy: {score}")
score

0.6838