In [29]:
def getBinaryNumTarget(text):
  """Convert a yes/no style answer into a binary target.

  Returns 1 when ``text`` equals the exact string 'Yes' (case-sensitive)
  and 0 for every other value.
  """
  return 1 if text == 'Yes' else 0

In [2]:
import pandas as pd

In [3]:
# Load the complaints dataset from the working directory.
df = pd.read_csv("bitathon.csv", low_memory=False)

In [4]:
# Show the last five rows of the loaded frame.
df.tail(5)

Unnamed: 0,dispute,complaint
6821,1,Premier Portfolio Group contacted me stating t...
6822,1,I have been disputing a collection with the FT...
6823,1,A debt collector is reporting the incorrect da...
6824,1,Plaza Services apparently purchased this debt ...
6825,1,On XXXX XXXX XXXX XXXX XXXX XXXX and XXXX XXXX...


In [28]:
# Keep only the narrative and the disputed flag, dropping rows missing either.
# These columns exist only in the raw complaints export; the recorded outputs
# of this notebook show a frame with 'dispute'/'complaint' columns instead, so
# the selection is skipped when the raw columns are absent rather than raising
# KeyError on a fresh Restart & Run All.
if {'Consumer complaint narrative', 'Consumer disputed?'}.issubset(df.columns):
    df = df[['Consumer complaint narrative', 'Consumer disputed?']].dropna()

In [30]:
# Encode the raw 'Yes'/other answers as 1/0. The guard keeps the cell safe to
# re-run: the column is absent once the data has been preprocessed, and
# applying getBinaryNumTarget to already-encoded integers would turn every
# value into 0 (because 1 != 'Yes').
if ('Consumer disputed?' in df.columns
        and not pd.api.types.is_numeric_dtype(df['Consumer disputed?'])):
    df['Consumer disputed?'] = df['Consumer disputed?'].apply(getBinaryNumTarget)

In [3]:
# Show the first five rows of the frame.
df.head(5)

Unnamed: 0,dispute,complaint
0,0,I am a homeownership advisor with XXXX. On XXX...
1,0,I Want to file a complaint about XXXX For Fore...
2,0,"I live in XXXX, my former mortgage holder, XXX..."
3,0,My wife & I had a FHA backed XXXX yr XXXX serv...
4,0,I have been in review with my lender for XXXX ...


In [4]:
# Count missing values per column (isnull is an alias of isna).
df.isnull().sum()

dispute      0
complaint    0
dtype: int64

In [21]:
import spacy
import spacy.cli

# Load the small English model, downloading it only when it is not installed
# yet instead of hitting the network on every run of this cell.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:  # raised by spaCy when the model package cannot be found
    spacy.cli.download("en_core_web_sm")
    import en_core_web_sm
    nlp = en_core_web_sm.load()

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


In [6]:
# Class balance of the target column.
df.loc[:, 'dispute'].value_counts()

1    3629
0    3197
Name: dispute, dtype: int64

In [7]:
import spacy
from spacy.util import minibatch, compounding
import random


In [8]:
def load_data(train_data, limit=0, split=0.8):
    """Shuffle labelled examples and split them into train and dev partitions.

    Args:
        train_data: iterable of ``(text, label)`` pairs. Each label is
            interpreted through ``bool()``; a truthy label means disputed.
        limit: when non-zero, keep only the last ``limit`` examples after
            shuffling. ``0`` (the default) keeps every example.
        split: fraction of the kept examples that goes to the training
            partition; the remainder becomes the dev partition.

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))`` where the texts
        are tuples and the cats are lists of
        ``{"NOT DISPUTED": bool, "DISPUTED": bool}`` dicts.
    """
    # Shuffle a copy so the caller's list is not reordered as a side effect.
    examples = list(train_data)
    random.shuffle(examples)
    if limit:
        examples = examples[-limit:]
    if not examples:
        # zip(*[]) yields nothing to unpack; return empty partitions instead
        # of raising ValueError.
        return ((), []), ((), [])
    texts, labels = zip(*examples)
    cats = [{"NOT DISPUTED": not bool(y), "DISPUTED": bool(y)} for y in labels]
    # Separate name for the cut index so the `split` fraction is not shadowed.
    cutoff = int(len(examples) * split)

    return (texts[:cutoff], cats[:cutoff]), (texts[cutoff:], cats[cutoff:])

In [9]:
def evaluate(tokenizer, textcat, texts, cats):
    """Score the text classifier on held-out examples.

    Every text is passed through ``tokenizer`` and then ``textcat.pipe``; the
    resulting doc is compared with the gold dict at the same position in
    ``cats``. Labels missing from the gold dict are ignored and
    "NOT DISPUTED" is skipped, so the metrics describe the remaining label.
    A predicted score or gold value of at least 0.5 counts as positive.

    Returns:
        dict with precision ("textcat_p"), recall ("textcat_r") and
        F-score ("textcat_f").
    """
    threshold = 0.5
    # fp and fn start at a tiny epsilon so the divisions below never hit zero.
    counts = {"tp": 0.0, "fp": 1e-8, "fn": 1e-8, "tn": 0.0}
    tokenized = (tokenizer(text) for text in texts)
    for position, doc in enumerate(textcat.pipe(tokenized)):
        expected = cats[position]
        for label, score in doc.cats.items():
            if label not in expected or label == "NOT DISPUTED":
                continue
            truth = expected[label]
            if score >= threshold:
                if truth >= threshold:
                    counts["tp"] += 1.0
                elif truth < threshold:
                    counts["fp"] += 1.0
            elif score < threshold:
                if truth < threshold:
                    counts["tn"] += 1
                elif truth >= threshold:
                    counts["fn"] += 1
    precision = counts["tp"] / (counts["tp"] + counts["fp"])
    recall = counts["tp"] / (counts["tp"] + counts["fn"])
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [22]:
# Reload the pretrained pipeline so the cells below start from a fresh copy.
nlp = spacy.load("en_core_web_sm")

In [11]:
# Strip redaction placeholders and noisy punctuation from the text columns.
# The placeholder is matched as a run of two or more 'X' (e.g. "XXXX", "XX"):
# listing "XXXX" inside a character class is the same as "[X]" and deleted
# every capital X, mangling words such as "EQUIFAX" or "TAX". The pattern is
# also written as one raw string; the original relied on implicit
# concatenation of two adjacent literals.
df = df.replace(to_replace=r'X{2,}|[\n\r\t#")(!,;/]', value='', regex=True)

In [12]:
# Inspect the first five rows after cleaning.
df.head(5)

Unnamed: 0,dispute,complaint
0,0,I am a homeownership advisor with . On 15 I su...
1,0,I Want to file a complaint about For Foreclos...
2,0,I live in my former mortgage holder went ban...
3,0,My wife & I had a FHA backed yr serviced by ...
4,0,I have been in review with my lender for 7 mo...


In [23]:
# Attach a text-classification component at the end of the pipeline. If this
# cell is re-run the existing component is reused, because add_pipe rejects a
# second component with the name 'textcat'.
if "textcat" in nlp.pipe_names:
    textcat = nlp.get_pipe("textcat")
else:
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "simplecnn"},
    )
    nlp.add_pipe(textcat, last=True)
nlp.pipe_names

['tagger', 'parser', 'ner', 'textcat']

In [24]:
# Register the two category labels on the classifier; the strings must match
# the keys of the gold dicts built in load_data().
textcat.add_label("DISPUTED")
textcat.add_label("NOT DISPUTED")

1

In [25]:
# Pair every complaint with its label as (text, label) tuples, stored both as
# a helper column and as a plain list for load_data().
df['tuples'] = list(zip(df['complaint'], df['dispute']))
train = df['tuples'].tolist()

In [26]:
# Hold out 10% of the shuffled examples for evaluation.
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(train, split=0.9)
# Wrap each gold dict as {'cats': ...} annotations paired with its text.
train_data = [
    (text, {'cats': gold})
    for text, gold in zip(train_texts, train_cats)
]

In [31]:
n_iter = 20  # number of passes over train_data
# Disabling other components
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    # The optimizer returned here is passed to every nlp.update call below and
    # is read again later when the model is saved.
    optimizer = nlp.begin_training()
    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
    # Performing training
    for i in range(n_iter):
        losses = {}  # reset every epoch; nlp.update accumulates into it
        # Batch sizes come from the compounding(4., 32., 1.001) schedule.
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            # Split the batch of (text, annotation) pairs into parallel tuples.
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)

        # Calling the evaluate() function and printing the scores
        # (run on the dev split with optimizer.averages swapped in as params).
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

Training the model...
LOSS 	  P  	  R  	  F  
0.850	0.977	0.941	0.959
0.640	0.980	0.955	0.967
0.288	0.971	0.952	0.962
0.204	0.969	0.955	0.962
0.188	0.974	0.947	0.960
0.178	0.974	0.947	0.960
0.202	0.966	0.947	0.956
0.149	0.971	0.947	0.959
0.146	0.983	0.952	0.967
0.145	0.977	0.955	0.966
0.161	0.972	0.961	0.966
0.191	0.972	0.961	0.966
0.117	0.971	0.955	0.963
0.116	0.974	0.955	0.965
0.225	0.977	0.958	0.967
0.155	0.971	0.952	0.962
0.207	0.980	0.952	0.966
0.115	0.977	0.949	0.963
0.129	0.977	0.947	0.961
0.071	0.980	0.947	0.963


In [32]:
# Testing the model(Case of Non Dispute i.e 0)
test_text = "Transunion is continuing to report accounts have been recently deleted due to cancelled contract.Disputed via phone as well as fax and online."
doc=nlp(test_text)
doc.cats 

{'DISPUTED': 1.0, 'NOT DISPUTED': 4.045129564644867e-09}

In [33]:
# Testing the model(Case of Dispute i.e 1)
test_text1 = "I have been subjected to harrassment calls. The caller shows no mercy in disturbing throughout the day"
doc=nlp(test_text1)
doc.cats 

{'DISPUTED': 0.3866216540336609, 'NOT DISPUTED': 0.6133783459663391}

In [34]:
# Write the pipeline to disk while optimizer.averages is swapped in as the
# active parameters.
with nlp.use_params(optimizer.averages):
    nlp.to_disk('finalmodelmini')