In [3]:
import json
import pandas as pd
import numpy as np

# Import reduce from functools
from functools import reduce

In [4]:
industry_words = {
    'automative': ['automotive', 'taxi', 'wheel', 'fuel', 'car','drive','auto','selfdrive','vehicle','road','automobile'],
    'Manufacturing': ['cleantech', 'deindustrialization', 'prefabrication', 'manufacturing', 'vitrification', 'fabrication' 'R&D','quality','produce','goods','factory','equipment'],
    'Consumer Products' : ['product', 'price', 'goods', 'commerce', 'economic', 'customer','marketing','demand','inventory','supply'],
    'Finance' : ['bank', 'money', 'capitalization', 'interest', 'fund', 'finance', 'asset','risk','loan','credit','fraud'],
    'Agriculture' :['soil', 'grain', 'agriculture', 'field','farm','soil','weather','crop','grow','animal', 'food' , 'land'],
    'Energy' : ['renewable', 'sustainable', 'green', 'electricity', 'energy', 'power','mines','solar','light','metal','electric','carbon', 'electonic','wind','speed'],
    'Health Care' : ['Health', 'Care', 'emergency','doctor','wellness','patient','hospital', 'clinic','treatment','disease', 'medical','cancer'],
    'Pharmaceuticals' : ['dose', 'pillbox', 'tonic', 'tablet', 'placebo', 'medicate', 'hospital', 'Pharmaceutical', 'drug','diagnose', 'test','trial','medicine', 'vaccine'],
    'Public and Social sector' : ['social', 'law','crime','terrorism','policing','govern', 'public', 'infrastructure', 'education', 'tax', 'urban', 'life', 'job','enforcement', 'surveillance'],
    'Media' :['mainstream', 'publishing', 'medium', 'social','media','video','content','news','release','film', 'press', 'viral', 'game'],
    'Telecom' : ['location', 'station', 'host', 'telecom', 'mobile', 'voice','call','subscription','network','phone', 'broadcast', 'internet','communication' ,'modulation'],
    'Transport & Logistics' : ['transport' , 'logistic', 'mail','parcel','travel','route','planes','truck', 'shipping', 'mobility', 'movement']
}
for keys, value in industry_words.items():
    industry_words[keys] = " ".join(value)

In [5]:
with open('news_data.json') as f:
      data2 = json.load(f)

In [6]:
new_cases = pd.DataFrame(data2['data'], columns=['id', 'title', 'summary', 
                                      'authors', 'tags', 
                                      'text', 'url', 'source',
                                      'created_at', 'updated_at',
                                      'author', 'date'])
new_cases.head(2)

Unnamed: 0,id,title,summary,authors,tags,text,url,source,created_at,updated_at,author,date
0,10813,"ZingBox aims for ‘Internet of Trusted Things’,...",Cybersecurity provider ZingBox has announced t...,,device\niot\nguardian\napproach\ndevices\nindu...,Cybersecurity provider ZingBox has announced t...,https://artificialintelligence-news.com/2017/0...,AInews,2020-02-05T17:08:34.343Z,2020-02-05T17:08:34.343Z,James Bourne,2017-04-25
1,10814,AI may help create more sustainable data centres,Enterprise data centre provider Aegis Data arg...,,data\ncentre\nnatural\nnew\ntechnology\nindust...,Enterprise data centre provider Aegis Data arg...,https://artificialintelligence-news.com/2017/0...,AInews,2020-02-05T17:08:34.355Z,2020-02-05T17:08:34.355Z,James Bourne,2017-04-25


In [7]:
new_cases[new_cases.id==10829]

Unnamed: 0,id,title,summary,authors,tags,text,url,source,created_at,updated_at,author,date
16,10829,Here’s how AI can assist medical science in te...,Artificial intelligence (AI) and deep learning...,,diseases\nmedical\nai\nexpo,Artificial intelligence (AI) and deep learning...,https://artificialintelligence-news.com/2017/0...,AInews,2020-02-05T17:08:34.515Z,2020-02-05T17:08:34.515Z,James Bourne,2017-06-12


In [8]:
# Import spacy
import spacy

# Instantiate the English model: nlp
nlp = spacy.load('en_core_web_md')

In [9]:
texts = new_cases.text

In [10]:
from wordcloud import STOPWORDS
english_stops = set(STOPWORDS)

import nltk
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

## 1) Lower all words and Remove non-alpha, stopwords, no-noun or no-verb and Lemmatize all words

In [10]:
# Import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

from nltk.tokenize import word_tokenize

# Tokenize the article: tokens
tokens =[word_tokenize(article) for article in texts]
print("Total number of texts: {}".format(len(tokens)))

len_array = [len(token_array) for token_array in tokens]
# Use reduce() to apply a lambda function over stark: result
total_tokens = reduce(lambda item1, item2: item1+item2, len_array)
print("Total number of tokens: {}".format(total_tokens))

# Convert the tokens into lowercase: lower_tokens
lower_tokens = [[t.lower() for t in token] for token in tokens]

# Retain alphabetic words: alpha_only
alpha_only_list = [[t for t in lower_token if t.isalpha()] for lower_token in lower_tokens]

# Remove all stop words: no_stops
no_stops = [[t for t in alpha_only if t not in english_stops] for alpha_only in alpha_only_list]

len_array = [len(token_array) for token_array in no_stops]
# Use reduce() to apply a lambda function over stark: result
total_tokens = reduce(lambda item1, item2: item1+item2, len_array)
print("Total of words after removing stop words: {}".format(total_tokens))

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
articles_lemmatized = [[wordnet_lemmatizer.lemmatize(t, get_wordnet_pos(t)) for t in no_stop if nltk.pos_tag([t])[0][1][0].upper()=='N' or nltk.pos_tag([t])[0][1][0].upper()=='V'] for no_stop in no_stops]

len_array = [len(article) for article in articles_lemmatized]
# Use reduce() to apply a lambda function over stark: result
total_tokens = reduce(lambda item1, item2: item1+item2, len_array)
print("Total of words after removing words except noun: {}".format(total_tokens))

Total number of texts: 1626
Total number of tokens: 1434310
Total of words after removing stop words: 694546
Total of words after removing words except noun: 571569


## 2) Remove organization, location nouns

In [11]:
for i in range(new_cases.shape[0]):
    # Create a new document: doc
    doc = nlp(new_cases.text[i])

    # Print all of the found entities and their labels
    for ent in doc.ents:
        if ent.label_=='ORG' or ent.label_=='GPE' or ent.label_=='LOC':
            words = ent.text.split()
            for word in words:
                word = word.lower()
                while word in articles_lemmatized[i]:
                    articles_lemmatized[i].remove(word) 

In [12]:
len_array = [len(article) for article in articles_lemmatized]
# Use reduce() to apply a lambda function over stark: result
total_tokens = reduce(lambda item1, item2: item1+item2, len_array)
print("Total of words after removing words in ORG, GPE, LOC: {}".format(total_tokens))

Total of words after removing words in ORG, GPE, LOC: 503028


## 3) Predict industry type with cleaned texts by using similarity method

### Convert list into string for each text

In [13]:
articles_string = [" ".join(article) for article in articles_lemmatized]
key_list = list(industry_words.keys())

### Create nlp object for each industry category

In [14]:
doc_industry_list = []
for key, value in industry_words.items():
        doc_industry_list.append(nlp(industry_words[key]))

### An example : 16th text

In [14]:
# Create nlp object for an cleaned article
doc_article = nlp(articles_string[16])
print(new_cases.title[16])
similarities = []
for doc_industry in doc_industry_list:
    similarity = doc_article.similarity(doc_industry)
    print(similarity)
    similarities.append(similarity)
max_value = max(similarities)
max_position = similarities.index(max_value)
industry_type = key_list[max_position]
print(industry_type)

Here’s how AI can assist medical science in telling a patient’s lifespan
0.5083207819127182
0.5599038223425676
0.6823005964751405
0.6627079356726187
0.6193198169612352
0.5920631884483465
0.7159515352149979
0.6621734547359772
0.7709572952570025
0.7063107083724028
0.6536469096643308
0.6471683954877313
Public and Social sector


### Predict industry type with similarity method along two nlp object for all texts in industry_type_list 

In [15]:
industry_type_list = []
for article in articles_string:
    # Create nlp object for an cleaned article
    doc_article = nlp(article)
    similarities = []
    for doc_industry in doc_industry_list:
        similarities.append(doc_article.similarity(doc_industry))
    max_value = max(similarities)
    max_position = similarities.index(max_value)
    industry_type = key_list[max_position]
    industry_type_list.append(industry_type)

## 4) Predict industry type with tf-idf words by using similarity method

### Create a dictionary

In [15]:
# Import Dictionary
from gensim.corpora.dictionary import Dictionary

# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(articles_lemmatized)

# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in articles_lemmatized]

# How much text in corpus_sorted
print(len(corpus))

1626


### Create tf-idf model

In [16]:
from gensim.models.tfidfmodel import TfidfModel

# Create a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)

tfidf_weights = [sorted(tfidf[doc], key=lambda w: w[1], reverse=True) for doc in corpus]

### Create nlp object for each industry category and key_list 

In [17]:
key_list = list(industry_words.keys())
doc_industry_list = []
for key, value in industry_words.items():
        doc_industry_list.append(nlp(industry_words[key]))

### An example : 16th text

In [18]:
article_string_single = [dictionary.get(term_id) for term_id, weight in tfidf_weights[16][:20]]
article_string_single = " ".join(article_string_single)
article_string_single

'lifespan poll adelaide surpasses disease prediction ai patient congestive emphysema survey ascertain job absence chronic organ image report resident commission'

In [32]:
doc_article = nlp(article_string_single)
similarities = []
for doc_industry in doc_industry_list:
    similarity = doc_article.similarity(doc_industry)
    print(similarity)
    similarities.append(similarity)
max_value = max(similarities)
industry1_pos = similarities.index(max_value)
industry_type1 = key_list[industry1_pos]
similarities[industry1_pos] = 0
second_max = max(similarities)
industry2_pos = similarities.index(second_max)
industry_type2 = key_list[industry2_pos]

print(industry_type1)
print(industry_type2)

0.40368552190838985
0.4599219946234982
0.536596037573089
0.5736589998630274
0.549153651399558
0.47661140152858783
0.7848336852301346
0.7033189258214428
0.6473614201741285
0.5610149268507558
0.5194483004976785
0.5220977663151091
Health Care
Pharmaceuticals


### Predict industry type with similarity method along two nlp object for all texts in industry_type_list

#### Select the most important first 20 words for each text a and convert list into string

In [20]:
articles_strings = [[dictionary.get(term_id) for term_id, weight in tfidf_weight[:20]] for tfidf_weight in tfidf_weights]
articles_strings = [" ".join(articles_string) for articles_string in articles_strings]

In [21]:
articles_strings[16]

'lifespan poll adelaide surpasses disease prediction ai patient congestive emphysema survey ascertain job absence chronic organ image report resident commission'

In [33]:
industry_type1_list = []
industry_type2_list = []

for article in articles_strings:
    # Create nlp object for an cleaned article
    doc_article = nlp(article)
    similarities = []
    for doc_industry in doc_industry_list:
        similarities.append(doc_article.similarity(doc_industry))
    max_value = max(similarities)
    max_position = similarities.index(max_value)
    industry_type = key_list[max_position]
    industry_type_list.append(industry_type)

    max_value = max(similarities)
    industry1_pos = similarities.index(max_value)
    industry_type1 = key_list[industry1_pos]
    industry_type1_list.append(industry_type1)
    similarities[industry1_pos] = 0
    second_max = max(similarities)
    industry2_pos = similarities.index(second_max)
    industry_type2 = key_list[industry2_pos]
    industry_type2_list.append(industry_type2)

In [40]:
industry_type1_list[:5]

['Public and Social sector',
 'Consumer Products',
 'Media',
 'Public and Social sector',
 'Consumer Products']

In [39]:
industry_type2_list[:5]

['Consumer Products', 'Energy', 'Telecom', 'Finance', 'Telecom']

In [51]:
df = new_cases[['text']]
df['Industry1'] = industry_type1_list
df['Industry2'] = industry_type2_list
#df.assign(column_name = industry_type1_list)
#new_cases.head(5)


In [107]:
df.to_csv ('df.csv', index = False, header=True)

In [1]:
df.to_csv('new_file.csv', sep='\t', index=False)


NameError: name 'df' is not defined

In [59]:
from sklearn.model_selection import train_test_split

X = df['text']
y = df.drop('text',axis=1)
xtrain, xtest, ytrain, ytest = train_test_split(X,y, random_state=42, test_size=0.2)

In [90]:
ytrain[['Industry1','Industry2']].values

array([['Media', 'Telecom'],
       ['Transport & Logistics', 'automative'],
       ['Media', 'Consumer Products'],
       ...,
       ['Telecom', 'Media'],
       ['Telecom', 'Energy'],
       ['Public and Social sector', 'Media']], dtype=object)

In [94]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(ytrain[['Industry1','Industry2']].values)
test_labels = mlb.fit_transform(ytest[['Industry1','Industry2']].values)

In [93]:
mlb.classes_


array(['Agriculture', 'Consumer Products', 'Energy', 'Finance',
       'Health Care', 'Manufacturing', 'Media', 'Pharmaceuticals',
       'Public and Social sector', 'Telecom', 'Transport & Logistics',
       'automative'], dtype=object)

In [None]:
# define Spacy model, we ignore "tagger", "parser", "ner" to make processing faster
nlp = en_core_web_md.load(disable=["tagger", "parser", "ner"])

def keep_token(t):
    # remove stop words, punct, numbers
    return (t.is_alpha and
            not (t.is_space or t.is_punct or
                 t.is_stop or t.like_num))

def lemmatize_doc(doc):
    # Lemmatize
    return [ t.lemma_ for t in doc if keep_token(t)]

# apply on text filed to get clean_text
df['clean_text'] = df.text.apply(lambda x: ' '.join(lemmatize_doc(nlp(x.lower()))))



In [61]:
from wordcloud import STOPWORDS
english_stops = set(STOPWORDS)

import nltk
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [68]:
from nltk.stem.porter import PorterStemmer

wordnet_lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


def tokenize_lemma_stopwords(text):
    text = text.replace("\n", " ")
    tokens = nltk.tokenize.word_tokenize(text.lower()) # split string into words (tokens)
    tokens = [t for t in tokens if t.isalpha()] # keep strings with only alphabets
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [stemmer.stem(t) for t in tokens]
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [t for t in tokens if t not in english_stops] # remove stopwords
    cleanedText = " ".join(tokens)
    return cleanedText

def dataCleaning(df):
    data = df.copy()
    data = data.apply(tokenize_lemma_stopwords)
    return data

In [69]:
cleanedTrainData = dataCleaning(xtrain)
cleanedTestData = dataCleaning(xtest)

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

vectorizer = TfidfVectorizer()
vectorised_train_documents = vectorizer.fit_transform(cleanedTrainData)
vectorised_test_documents = vectorizer.transform(cleanedTestData)

In [71]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, hamming_loss

ModelsPerformance = {}

def metricsReport(modelName, test_labels, predictions):
    accuracy = accuracy_score(test_labels, predictions)

    macro_precision = precision_score(test_labels, predictions, average='macro')
    macro_recall = recall_score(test_labels, predictions, average='macro')
    macro_f1 = f1_score(test_labels, predictions, average='macro')

    micro_precision = precision_score(test_labels, predictions, average='micro')
    micro_recall = recall_score(test_labels, predictions, average='micro')
    micro_f1 = f1_score(test_labels, predictions, average='micro')
    hamLoss = hamming_loss(test_labels, predictions)
    print("------" + modelName + " Model Metrics-----")
    print("Accuracy: {:.4f}\nHamming Loss: {:.4f}\nPrecision:\n  - Macro: {:.4f}\n  - Micro: {:.4f}\nRecall:\n  - Macro: {:.4f}\n  - Micro: {:.4f}\nF1-measure:\n  - Macro: {:.4f}\n  - Micro: {:.4f}"\
          .format(accuracy, hamLoss, macro_precision, micro_precision, macro_recall, micro_recall, macro_f1, micro_f1))
    ModelsPerformance[modelName] = micro_f1

In [73]:
vectorised_train_documents.shape

(1300, 18051)

In [74]:
train_labels.shape

(2, 10)

In [95]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

knnClf = KNeighborsClassifier()

knnClf.fit(vectorised_train_documents, train_labels)
knnPredictions = knnClf.predict(vectorised_test_documents)
metricsReport("knn", test_labels, knnPredictions)


------knn Model Metrics-----
Accuracy: 0.3405
Hamming Loss: 0.1061
Precision:
  - Macro: 0.7435
  - Micro: 0.7319
Recall:
  - Macro: 0.4496
  - Micro: 0.5736
F1-measure:
  - Macro: 0.5304
  - Micro: 0.6432


In [96]:
from sklearn.tree import DecisionTreeClassifier

dtClassifier = DecisionTreeClassifier()
dtClassifier.fit(vectorised_train_documents, train_labels)
dtPreds = dtClassifier.predict(vectorised_test_documents)
metricsReport("Decision Tree", test_labels, dtPreds)

------Decision Tree Model Metrics-----
Accuracy: 0.2025
Hamming Loss: 0.1759
Precision:
  - Macro: 0.3748
  - Micro: 0.4724
Recall:
  - Macro: 0.3798
  - Micro: 0.4724
F1-measure:
  - Macro: 0.3718
  - Micro: 0.4724


In [97]:
from sklearn.ensemble import BaggingClassifier

bagClassifier = OneVsRestClassifier(BaggingClassifier(n_jobs=-1))
bagClassifier.fit(vectorised_train_documents, train_labels)
bagPreds = bagClassifier.predict(vectorised_test_documents)
metricsReport("Bagging", test_labels, bagPreds)

------Bagging Model Metrics-----
Accuracy: 0.2025
Hamming Loss: 0.1181
Precision:
  - Macro: 0.6355
  - Micro: 0.7262
Recall:
  - Macro: 0.3913
  - Micro: 0.4678
F1-measure:
  - Macro: 0.4704
  - Micro: 0.5690


In [102]:
from sklearn.ensemble import RandomForestClassifier
rfClassifier = RandomForestClassifier(n_jobs=-1)
rfClassifier.fit(vectorised_train_documents, train_labels)
rfPreds = rfClassifier.predict(vectorised_test_documents)
metricsReport("Random Forest", test_labels, rfPreds)

------Random Forest Model Metrics-----
Accuracy: 0.0828
Hamming Loss: 0.1286
Precision:
  - Macro: 0.5381
  - Micro: 0.7560
Recall:
  - Macro: 0.1475
  - Micro: 0.3374
F1-measure:
  - Macro: 0.1798
  - Micro: 0.4666


In [101]:
from sklearn.ensemble import GradientBoostingClassifier

boostClassifier = OneVsRestClassifier(GradientBoostingClassifier())
boostClassifier.fit(vectorised_train_documents, train_labels)
boostPreds = boostClassifier.predict(vectorised_test_documents)
metricsReport("Boosting", test_labels, boostPreds)

------Boosting Model Metrics-----
Accuracy: 0.2209
Hamming Loss: 0.1138
Precision:
  - Macro: 0.6769
  - Micro: 0.7390
Recall:
  - Macro: 0.3962
  - Micro: 0.4908
F1-measure:
  - Macro: 0.4816
  - Micro: 0.5899


In [98]:
from sklearn.naive_bayes import MultinomialNB

nbClassifier = OneVsRestClassifier(MultinomialNB())
nbClassifier.fit(vectorised_train_documents, train_labels)

nbPreds = nbClassifier.predict(vectorised_test_documents)
metricsReport("Multinomial NB", test_labels, nbPreds)

------Multinomial NB Model Metrics-----
Accuracy: 0.0245
Hamming Loss: 0.1309
Precision:
  - Macro: 0.1571
  - Micro: 0.8977
Recall:
  - Macro: 0.0788
  - Micro: 0.2423
F1-measure:
  - Macro: 0.0922
  - Micro: 0.3816


In [99]:
from sklearn.svm import LinearSVC

svmClassifier = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
svmClassifier.fit(vectorised_train_documents, train_labels)

svmPreds = svmClassifier.predict(vectorised_test_documents)
metricsReport("SVC Sq. Hinge Loss", test_labels, svmPreds)

------SVC Sq. Hinge Loss Model Metrics-----
Accuracy: 0.3190
Hamming Loss: 0.0920
Precision:
  - Macro: 0.8128
  - Micro: 0.8120
Recall:
  - Macro: 0.4481
  - Micro: 0.5828
F1-measure:
  - Macro: 0.5575
  - Micro: 0.6786


In [103]:
from skmultilearn.problem_transform import LabelPowerset

powerSetSVC = LabelPowerset(LinearSVC())
powerSetSVC.fit(vectorised_train_documents, train_labels)

powerSetSVCPreds = powerSetSVC.predict(vectorised_test_documents)
metricsReport("Power Set SVC", test_labels, powerSetSVCPreds)

------Power Set SVC Model Metrics-----
Accuracy: 0.4724
Hamming Loss: 0.1048
Precision:
  - Macro: 0.7078
  - Micro: 0.6856
Recall:
  - Macro: 0.5795
  - Micro: 0.6856
F1-measure:
  - Macro: 0.6118
  - Micro: 0.6856


In [105]:
#Comparison on different models based on their Micro-F1 score


print("  Model Name " + " "*10 + "| Micro-F1 Score")
print("-------------------------------------------")
for key, value in ModelsPerformance.items():
    print("  " + key, " "*(20-len(key)) + "|", value)
    print("-------------------------------------------")

  Model Name           | Micro-F1 Score
-------------------------------------------
  knn                  | 0.643164230438521
-------------------------------------------
  Decision Tree        | 0.4723926380368098
-------------------------------------------
  Bagging              | 0.5690298507462687
-------------------------------------------
  Multinomial NB       | 0.3816425120772947
-------------------------------------------
  SVC Sq. Hinge Loss   | 0.6785714285714286
-------------------------------------------
  Boosting             | 0.5898617511520737
-------------------------------------------
  Random Forest        | 0.4665959703075292
-------------------------------------------
  Power Set SVC        | 0.6855828220858896
-------------------------------------------


In [None]:
# define the model
model = Sequential()
model.add(Dense(20, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
model.add(Dense(n_outputs, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')

In [None]:

# get the model
def get_model(n_inputs, n_outputs):
	model = Sequential()
	model.add(Dense(20, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
	model.add(Dense(n_outputs, activation='sigmoid'))
	model.compile(loss='binary_crossentropy', optimizer='adam')
	return model

In [None]:

# make a prediction on the test set
yhat = model.predict(X_test)
# round probabilities to class labels
yhat = yhat.round()
# calculate accuracy
acc = accuracy_score(y_test, yhat)

In [None]:
def evaluate_model(X, y):
	results = list()
	n_inputs, n_outputs = X.shape[1], y.shape[1]
	# define evaluation procedure
	cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
	# enumerate folds
	for train_ix, test_ix in cv.split(X):
		# prepare data
		X_train, X_test = X[train_ix], X[test_ix]
		y_train, y_test = y[train_ix], y[test_ix]
		# define model
		model = get_model(n_inputs, n_outputs)
		# fit model
		model.fit(X_train, y_train, verbose=0, epochs=100)
		# make a prediction on the test set
		yhat = model.predict(X_test)
		# round probabilities to class labels
		yhat = yhat.round()
		# calculate accuracy
		acc = accuracy_score(y_test, yhat)
		# store result
		print('>%.3f' % acc)
		results.append(acc)
	return results

    

In [None]:
results = evaluate_model(X, y)
# summarize performance
print('Accuracy: %.3f (%.3f)' % (mean(results), std(results)))  