In [484]:
import re, os, sys, glob
from pathlib import Path
from collections import Counter
import pickle 

import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = None
np.warnings.filterwarnings('ignore')

In [485]:
train_data_folder = "../train-data/"

data = []

for path in glob.glob('%s/*.txt'%train_data_folder):
    print(path)
    filename = os.path.basename(path).split('.')[0]
    with open(path, 'r') as f:
        text = f.readlines()
    for l in text:
        data.append((l, filename))

train-data\bs.txt
train-data\cf.txt
train-data\is.txt
train-data\note.txt
train-data\plaintext.txt


In [486]:
data[:3]

[('CAPITAL AND LIABILITIES\n', 'bs'),
 ('Capital 1 4,605,934 4,564,858\n', 'bs'),
 ('Reserves and surplus 2 252,976,864 215,975,735\n', 'bs')]

In [487]:
def preprocessor(x):
    x = re.sub(r'[0-9\,]+', 'num', x)
    return re.sub(r'[\n()]+', ' ', x).lower()

In [488]:
df = pd.DataFrame(data, columns=['raw', 'type'])
df['clean'] = df.raw.apply(preprocessor)
df.head()

Unnamed: 0,raw,type,clean
0,CAPITAL AND LIABILITIES\n,bs,capital and liabilities
1,"Capital 1 4,605,934 4,564,858\n",bs,capital num num num
2,"Reserves and surplus 2 252,976,864 215,975,735\n",bs,reserves and surplus num num num
3,"Deposits 3 2,007,381,476 1,428,738,567\n",bs,deposits num num num
4,"Borrowings 4 748,935,808 386,066,730\n",bs,borrowings num num num


In [489]:
Counter(df.type)

Counter({'bs': 107, 'cf': 102, 'is': 92, 'note': 313, 'plaintext': 93})

In [490]:
nb = make_pipeline(TfidfVectorizer(min_df=2, ngram_range=(1, 3), stop_words={'english'}),
                   MultinomialNB(alpha=0.1)
                    )

X = df['clean']
y= df.type
nb.fit(X, y)
y_pred_class = nb.predict(X)
# print(y_pred_class)
metrics.accuracy_score(y, y_pred_class)

0.8656294200848657

In [491]:
train_data_folder = "train-data/"

data = []

for path in glob.glob('%s/*.txt'%train_data_folder):
    print(path)
    filename = os.path.basename(path).split('.')[0]
    with open(path, 'r') as f:
        text = f.read()
    data.append((text.lower(), filename))

train-data\bs.txt
train-data\cf.txt
train-data\is.txt
train-data\note.txt
train-data\plaintext.txt


In [492]:
data[0]

("capital and liabilities\ncapital 1 4,605,934 4,564,858\nreserves and surplus 2 252,976,864 215,975,735\ndeposits 3 2,007,381,476 1,428,738,567\nborrowings 4 748,935,808 386,066,730\nother liabilities and provisions 5 110,555,951 115,253,287\ntotal 3,124,456,033 2,150,599,177\nassets\ncash and balances with reserve bank of india 6 114,257,489 69,520,697\nbalances with banks and money at call and short notice 7 133,086,175 125,973,744\ninvestments 8 683,989,387 500,317,983\nadvances 9 2,035,338,628 1,322,626,769\nfixed assets 10 8,323,917 6,835,385\nother assets 11 149,460,437 1,25,324,599\ntotal 3,124,456,033 2,150,599,177\ncontingent liabilities 12 5,818,296,390 3,795,641,601\nbills for collection 19,355,641 13,900,033\n\nequity and liabilities\nshareholders' funds\nshare capital 856.81 856.81 729.53 729.53 529.53 396.20\nshare capital suspense 48.90 - - - - -\nreserves and surplus 7,188.96 6,051.15 3,869.24 3,034.44 1,543.22 709.70\n8,094.67 6,907.96 4,598.77 3,763.97 2,072.75 1,105

In [493]:
df = pd.DataFrame(data, columns=['raw', 'type'])
df['clean'] = df.raw.apply(preprocessor)
df

Unnamed: 0,raw,type,clean
0,"capital and liabilities\ncapital 1 4,605,934 4...",bs,capital and liabilities capital num num num re...
1,cash flow from operating activities\nnet profi...,cf,cash flow from operating activities net profit...
2,"income\ninterest earned 13 202,674,216 164,246...",is,income interest earned num num num other incom...
3,cash flow from financing activities\nincrease ...,note,cash flow from financing activities increase i...
4,"resolved further that, for the purpose of\ngiv...",plaintext,resolved further thatnum for the purpose of gi...


In [494]:
# vect = HashingVectorizer(non_negative=True)
# X = vect.fit_transform(df['clean'])
# y= df.type
# nb = MultinomialNB()
# nb.fit(X, y)
# y_pred_class = nb.predict(X)
# print(y_pred_class)
# metrics.accuracy_score(y, y_pred_class)

# Testing

In [495]:
X = df['clean']
y= df.type
y_pred_class = nb.predict(X)
y_pred_class_proba = nb.predict_proba(X)
print(y_pred_class)
print(np.round(y_pred_class_proba, 3))
print(np.max(y_pred_class_proba, axis=1))
print('Accuracy: %s'%metrics.accuracy_score(y, y_pred_class))

['bs' 'cf' 'is' 'note' 'plaintext']
[[0.979 0.008 0.006 0.006 0.   ]
 [0.    0.999 0.    0.    0.   ]
 [0.003 0.001 0.993 0.003 0.   ]
 [0.    0.    0.    1.    0.   ]
 [0.    0.    0.    0.    1.   ]]
[0.97918304 0.99949679 0.99311478 0.99996408 1.        ]
Accuracy: 1.0


In [498]:
pickle.dump(nb, open('models/mindf2.pkl', 'wb'))

In [497]:
test ="""
Particulars For the year ended
March 31, 2017
For the year ended
March 31, 2016
(C) CASH FLOW FROM FINANCING ACTIVITIES
Financing of hedging contract (6.71) 20.17
Interest Costs (4.11) (27.57)
Net Cash from /(Used) in Financing Activities (III) (10.82) (7.40)
Net Increase /(Decrease) in Cash & Cash equivalents (I+ II+ III) 605.75 62.86
Cash & Cash equivalents at the beginning of the year 941.89 879.03
Cash & Cash equivalents at the end of the year 1,547.64 941.89
Notes :
1 ) Cash and Cash equivalents include cash and bank balances in current accounts and deposit accounts. (Refer Note no. 8 (d))
2) Previous year figures have been regrouped wherever necessary to correspond with the figures of the current year.
As per our attached report of even date.
For and on behalf of For and on behalf of the Board of Directors
KHIMJI KUNVERJI & CO. APTECH LIMITED
Chartered Accountants
(Firm Registration No. 105146W)
ANIL PANT C. Y. PAL
Managing Director & CEO Vice Chairman
SHIVJI K VIKAMSEY (Din :07565631) (Din: 00106536)
Partner
(M.No 2242)
Place : Mumbai T. K. RAVISHANKAR KETAN SHAH
Date : May 24, 2017 Executive Vice President & CFO Company Secretary
"""


def predict_table_type(x):
    x = preprocessor(x)
    print(x)
    print('Probability: %s'%np.max(nb.predict_proba([x])))
    return nb.predict([x])[0]
    
predict_table_type(test)

 particulars for the year ended march num num for the year ended march num num c  cash flow from financing activities financing of hedging contract  num.num  num.num interest costs  num.num   num.num net cash from / used  in financing activities  iii   num.num   num.num net increase / decrease  in cash & cash equivalents  i+ ii+ iii  num.num num.num cash & cash equivalents at the beginning of the year num.num num.num cash & cash equivalents at the end of the year num.num num.num notes : num   cash and cash equivalents include cash and bank balances in current accounts and deposit accounts.  refer note no. num  d num  previous year figures have been regrouped wherever necessary to correspond with the figures of the current year. as per our attached report of even date. for and on behalf of for and on behalf of the board of directors khimji kunverji & co. aptech limited chartered accountants firm registration no. numw anil pant c. y. pal managing director & ceo vice chairman shivji k vik

'cf'