# Model training and prediction file - Mortgage Rate model

Author: Jonathan Chan

Date: 2020-06-23 2:30PM

In [1]:
# Pin Flair to an earlier commit - the most recent version currently has a bug when calling model.predict(multi_class_prob=True)
!pip install --upgrade git+https://github.com/flairNLP/flair.git@63aeabf9a18bdf53af3bcba5bd80f43ac717656e

Collecting git+https://github.com/flairNLP/flair.git@63aeabf9a18bdf53af3bcba5bd80f43ac717656e
  Cloning https://github.com/flairNLP/flair.git (to revision 63aeabf9a18bdf53af3bcba5bd80f43ac717656e) to /tmp/pip-req-build-uhyrudj5
  Running command git clone -q https://github.com/flairNLP/flair.git /tmp/pip-req-build-uhyrudj5
  Running command git checkout -q 63aeabf9a18bdf53af3bcba5bd80f43ac717656e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: flair
  Building wheel for flair (PEP 517) ... [?25l[?25hdone
  Created wheel for flair: filename=flair-0.5-cp36-none-any.whl size=148939 sha256=189eeb8a830b1a25aaf9d26884730830d41a6df1a1bab81c63bc502f0acac7fe
  Stored in directory: /tmp/pip-ephem-wheel-cache-__twi3od/wheels/64/b4/85/92bb9070843ba488b2b5e42712a24c414a0ee617eec46d67e0
Successfully built flair
Installing collected packages: flai

In [2]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings, DocumentRNNEmbeddings, BertEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path
from flair.datasets import CSVClassificationCorpus
from flair.data import Corpus
import pandas as pd

from flair.data import Sentence

In [3]:

# Folder containing the phase 1 training data.
data_folder = "./drive/My Drive/Colab Notebooks/capstone/data/"

# Folder containing the phase 2 training data.
new_data_folder = "./drive/My Drive/Colab Notebooks/capstone/data/phase_2_mortgage_rate_oversampled/"

In [4]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): the notebook's "./drive/My Drive/..." paths are relative, so they
# presumably rely on the working directory being /content - confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
def finetuned_model_predictions(input_file_path, finetuned_classifier, output_file_path):
  '''Predict sentiment for every row of an unannotated CSV and export the ranked results.

  Each value of the 'title_desc' column is wrapped in a Sentence and scored with
  ``finetuned_classifier.predict(sentence, multi_class_prob=True)``. The labels attached to
  the sentence are ranked by confidence and six columns are added to the data:
  'best_label', 'best_confidence', 'second_likely', 'second_confidence', 'least_likely'
  and 'least_confidence'. The augmented table is written to ``output_file_path``
  without the index.

  Args:
    input_file_path: CSV file to read; must contain a 'title_desc' column.
    finetuned_classifier: object exposing ``predict(sentence, multi_class_prob=True)`` which
      attaches ``labels`` (each with ``score`` and ``value``) to the sentence.
    output_file_path: destination path of the exported CSV.

  Returns:
    pandas Series with the number of rows per 'best_label' value.

  Raises:
    ValueError: if a predicted label value cannot be converted with ``int``.
  '''
  unannotated_df = pd.read_csv(input_file_path)

  # Results are collected per column and assigned once after the loop. Writing through
  # ``df[col].iloc[i] = value`` is chained assignment: it raises SettingWithCopyWarning
  # and silently writes nothing when pandas copy-on-write is active.
  results = {
      'best_label': [],
      'best_confidence': [],
      'second_likely': [],
      'second_confidence': [],
      'least_likely': [],
      'least_confidence': [],
  }

  for text in unannotated_df['title_desc']:
    sentence = Sentence(text)
    finetuned_classifier.predict(sentence, multi_class_prob=True)

    # Ascending on (score, value): the last entry is the most likely label,
    # the first entry the least likely one.
    ranked = sorted((label.score, label.value) for label in sentence.labels)

    # A rank that does not exist (fewer labels returned than expected) is left as None
    # instead of raising IndexError.
    picks = (
        ('best_label', 'best_confidence', ranked[-1] if ranked else None),
        ('second_likely', 'second_confidence', ranked[-2] if len(ranked) > 1 else None),
        ('least_likely', 'least_confidence', ranked[0] if ranked else None),
    )
    for label_column, confidence_column, pick in picks:
      results[label_column].append(None if pick is None else int(pick[1]))
      results[confidence_column].append(None if pick is None else pick[0])

  # dtype=object keeps integer labels as integers even when a column also holds None.
  for column_name, values in results.items():
    unannotated_df[column_name] = pd.Series(values, index=unannotated_df.index, dtype=object)

  print(f"All { len(unannotated_df) } rows done prediction! ")
  unannotated_df.to_csv(output_file_path,index=False)
  print("Done export!")

  return unannotated_df['best_label'].value_counts()

### First Stage (Train on benchmark dataset)

In [8]:
# Load the combined benchmark dataset used for first-stage training.
benchmark = pd.read_csv(data_folder + "combined_benchmark.csv")

In [9]:
# Keep only the label and text columns, in that order: the splits are later exported
# without a header, so column position (0 = label, 1 = text) is what identifies them.
benchmark = benchmark[['label', 'text']]
benchmark.head()

Unnamed: 0,label,text
0,0,Why not subscribe to the magazine ?
1,-1,"Tornio Works employs 2,300 of whom more than 1..."
2,1,"The move is aimed at boosting sales , cost-eff..."
3,0,"As a result of the merger , the largest profes..."
4,-1,18 March 2010 A leakage in the gypsum pond was...


#### Create train, dev and test set

In [10]:
# Shuffle all rows with a fixed seed so the split below is reproducible.
benchmark = benchmark.sample(frac=1, random_state=42)

# Split boundaries: first 80% -> train, next 10% -> test, remaining rows -> dev.
row_count = len(benchmark)
train_end = int(row_count*0.8)
test_end = int(row_count*0.9)

split_frames = (
    ('train.csv', benchmark.iloc[0:train_end]),
    ('test.csv', benchmark.iloc[train_end:test_end]),
    ('dev.csv', benchmark.iloc[test_end:]),
)

# Export every split without index or header row.
for split_file, split_frame in split_frames:
    split_frame.to_csv(data_folder + split_file, sep=',', index=False, header=False)

In [11]:
# Sanity check of the exported training split; header=None because the file is read as having no header row.
train_df = pd.read_csv(data_folder + "train.csv", header = None)
train_df.head()

Unnamed: 0,0,1
0,1,About Elcoteq Elcoteq SE is a leading electron...
1,-1,U.S. goods trade deficit deteriorates; factory...
2,0,Product coverage : baked goods ; biscuits ; br...
3,0,Tyrv+Æinen is of the opinion that the airline ...
4,-1,Wall St. Week Ahead: U.S. stock reign may not ...


#### Build corpus

In [12]:
# Column positions inside the train/dev/test CSVs: 1 -> text, 0 -> label.
# The CSV-based corpus below is used in place of the earlier
# NLPTaskDataFetcher.load_classification_corpus(...) loader.
column_name_map = {1: "text", 0: "label_topic"}

corpus: Corpus = CSVClassificationCorpus(
    data_folder,
    column_name_map,
    skip_header=False,  # first row is data, not a header
    delimiter=',',      # comma separated rows
    train_file='train.csv',
    dev_file='dev.csv',
    test_file='test.csv',
)

2020-06-23 21:20:52,336 Reading data from drive/My Drive/Colab Notebooks/capstone/data
2020-06-23 21:20:52,338 Train: drive/My Drive/Colab Notebooks/capstone/data/train.csv
2020-06-23 21:20:52,340 Dev: drive/My Drive/Colab Notebooks/capstone/data/dev.csv
2020-06-23 21:20:52,341 Test: drive/My Drive/Colab Notebooks/capstone/data/test.csv


#### Create word embeddings

In [13]:
# Token-level embeddings fed to the document embedding: BERT plus the fast
# forward and backward Flair news language models.
word_embeddings = [BertEmbeddings(), FlairEmbeddings('news-forward-fast'), FlairEmbeddings('news-backward-fast')]


  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


2020-06-23 21:21:15,368 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt not found in cache, downloading to /tmp/tmph9kkb138


100%|██████████| 19689779/19689779 [00:00<00:00, 32260616.04B/s]

2020-06-23 21:21:16,120 copying /tmp/tmph9kkb138 to cache at /root/.flair/embeddings/lm-news-english-forward-1024-v0.2rc.pt
2020-06-23 21:21:16,150 removing temp file /tmp/tmph9kkb138





2020-06-23 21:21:19,375 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt not found in cache, downloading to /tmp/tmp7vnz28yv


100%|██████████| 19689779/19689779 [00:00<00:00, 50778553.28B/s]

2020-06-23 21:21:19,909 copying /tmp/tmp7vnz28yv to cache at /root/.flair/embeddings/lm-news-english-backward-1024-v0.2rc.pt





2020-06-23 21:21:19,940 removing temp file /tmp/tmp7vnz28yv


#### First Stage Fine-tuning

In [None]:
# Document-level LSTM embedding built on top of the word embeddings
# (hidden size 512, words re-projected to 256 dimensions).
document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)
# Single-label classifier whose label set is derived from the current corpus.
classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(), multi_label=False)
trainer = ModelTrainer(classifier, corpus)
# Train for 10 epochs, using data_folder as the output location; the second stage
# later loads data_folder + 'best-model.pt'.
trainer.train(data_folder, max_epochs=10)

2020-06-11 20:46:32,348 Computing label dictionary. Progress:


  """Entry point for launching an IPython kernel.
100%|██████████| 1314/1314 [00:01<00:00, 807.68it/s]

2020-06-11 20:46:34,241 [b'0', b'1', b'-1']
2020-06-11 20:46:34,263 ----------------------------------------------------------------------------------------------------
2020-06-11 20:46:34,267 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): BertEmbeddings(
        (model): BertModel(
          (embeddings): BertEmbeddings(
            (word_embeddings): Embedding(30522, 768, padding_idx=0)
            (position_embeddings): Embedding(512, 768)
            (token_type_embeddings): Embedding(2, 768)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (encoder): BertEncoder(
            (layer): ModuleList(
              (0): BertLayer(
                (attention): BertAttention(
                  (self): BertSelfAttention(
                    (query): Linear(in_features=768, out_features=768, bias=Tru




2020-06-11 20:46:35,936 epoch 1 - iter 3/37 - loss 1.72542675 - samples/sec: 71.55
2020-06-11 20:46:50,470 epoch 1 - iter 6/37 - loss 1.41298331 - samples/sec: 78.78
2020-06-11 20:47:02,726 epoch 1 - iter 9/37 - loss 1.38017842 - samples/sec: 77.69
2020-06-11 20:47:15,016 epoch 1 - iter 12/37 - loss 1.36771390 - samples/sec: 75.43
2020-06-11 20:47:28,746 epoch 1 - iter 15/37 - loss 1.29182448 - samples/sec: 70.53
2020-06-11 20:47:41,178 epoch 1 - iter 18/37 - loss 1.26792369 - samples/sec: 74.70
2020-06-11 20:47:53,446 epoch 1 - iter 21/37 - loss 1.27641764 - samples/sec: 76.76
2020-06-11 20:48:05,801 epoch 1 - iter 24/37 - loss 1.25897953 - samples/sec: 79.21
2020-06-11 20:48:17,829 epoch 1 - iter 27/37 - loss 1.22091764 - samples/sec: 81.07
2020-06-11 20:48:30,202 epoch 1 - iter 30/37 - loss 1.20964215 - samples/sec: 79.54
2020-06-11 20:48:42,165 epoch 1 - iter 33/37 - loss 1.19802785 - samples/sec: 84.35
2020-06-11 20:48:54,258 epoch 1 - iter 36/37 - loss 1.20336205 - samples/sec: 8

{'dev_loss_history': [0.8829440474510193,
  0.7203056216239929,
  0.9014942049980164,
  0.7286069393157959,
  0.8807229995727539,
  0.6408273577690125,
  0.7970978617668152,
  0.6934536695480347,
  0.8263334631919861,
  0.8744912147521973],
 'dev_score_history': [0.7989,
  0.8622,
  0.8077,
  0.8522,
  0.8099,
  0.8673,
  0.8497,
  0.8647,
  0.8448,
  0.8497],
 'test_score': 0.8439,
 'train_loss_history': [1.2002636323104034,
  0.9350489458522281,
  0.8268443056055017,
  0.8040031033593256,
  0.7500786281920768,
  0.6970284959754428,
  0.7145128443434432,
  0.6085004999830916,
  0.664192912546364,
  0.5949043977904964]}

### Second Stage (train on hand annotated datasets)

#### Build corpus

In [14]:

# Column positions inside the phase 2 CSVs: index 5 holds the text, index 4 the label.
# NOTE(review): presumably these are the 'title_desc' and 'title_desc_sent_1' columns - confirm against the CSV header.
new_column_name_map = {5: "text", 4: "label_topic"}

# Rebinds `corpus` to the phase 2 data; skip_header=True skips the first row of each file.
corpus: Corpus = CSVClassificationCorpus(new_data_folder,
                                         new_column_name_map,
                                         skip_header=True,
                                         delimiter=',',    # comma separated rows
                                         train_file='train.csv',
                                         dev_file = 'dev.csv',
                                         test_file = 'test.csv'
)

2020-06-23 21:21:33,222 Reading data from drive/My Drive/Colab Notebooks/capstone/data/phase_2_mortgage_rate_oversampled
2020-06-23 21:21:33,228 Train: drive/My Drive/Colab Notebooks/capstone/data/phase_2_mortgage_rate_oversampled/train.csv
2020-06-23 21:21:33,229 Dev: drive/My Drive/Colab Notebooks/capstone/data/phase_2_mortgage_rate_oversampled/dev.csv
2020-06-23 21:21:33,231 Test: drive/My Drive/Colab Notebooks/capstone/data/phase_2_mortgage_rate_oversampled/test.csv


In [15]:
# Inspect the phase 2 test split, restricted to the 'title_desc_sent_1' and 'title_desc' columns.
# The path is built from new_data_folder (instead of repeating the literal) so it
# always points at the same folder the phase 2 corpus is read from.
mort_test_df = pd.read_csv(new_data_folder + "test.csv",
                           usecols=['title_desc_sent_1','title_desc'])
mort_test_df

Unnamed: 0,title_desc_sent_1,title_desc
0,-1,"Oil plunge, coronavirus fears prompt panic sel..."
1,0,Royal Bank of Canada misses expectations in to...
2,0,"U.S. Federal Reserve holds rates steady, expec..."
3,-1,U.S. central bank cuts interest rate for 1st t...
4,0,From binge to bust: A Canadian oil town lines ...
5,0,City of Charlottetown wants short-term rentals...
6,0,Portugal's ruling Socialists top general elect...
7,1,Canada's banks increase prime lending rates. C...
8,-1,Canada’s biggest banks are finding their growt...
9,0,U.S. Federal Reserve leaves rates near zero as...


#### Second Stage fine-tuning

In [None]:
# Start the second stage from the first-stage checkpoint stored as best-model.pt in data_folder.
benchmark_classifier = TextClassifier.load(data_folder + 'best-model.pt')

2020-06-11 21:14:23,960 loading file ./drive/My Drive/Colab Notebooks/capstone/data/best-model.pt


In [None]:
# Continue training the benchmark classifier on `corpus`, which must be the phase 2
# corpus at this point (the name is reused from the first stage).
trainer = ModelTrainer(benchmark_classifier, corpus)
# Train for 10 epochs, using new_data_folder as the output location.
trainer.train(new_data_folder, max_epochs=10)


2020-06-11 21:14:39,141 ----------------------------------------------------------------------------------------------------
2020-06-11 21:14:39,146 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): BertEmbeddings(
        (model): BertModel(
          (embeddings): BertEmbeddings(
            (word_embeddings): Embedding(30522, 768, padding_idx=0)
            (position_embeddings): Embedding(512, 768)
            (token_type_embeddings): Embedding(2, 768)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (encoder): BertEncoder(
            (layer): ModuleList(
              (0): BertLayer(
                (attention): BertAttention(
                  (self): BertSelfAttention(
                    (query): Linear(in_features=768, out_features=768, bias=True)
                    (key): Linear(in_feat

  _warn_prf(average, modifier, msg_start, len(result))


saving best model
2020-06-11 21:15:53,968 ----------------------------------------------------------------------------------------------------
2020-06-11 21:15:54,846 epoch 2 - iter 1/6 - loss 1.21896505 - samples/sec: 63.88
2020-06-11 21:16:08,170 epoch 2 - iter 2/6 - loss 1.34578860 - samples/sec: 80.09
2020-06-11 21:16:19,948 epoch 2 - iter 3/6 - loss 1.22393487 - samples/sec: 73.39
2020-06-11 21:16:31,931 epoch 2 - iter 4/6 - loss 1.12664030 - samples/sec: 86.30
2020-06-11 21:16:43,396 epoch 2 - iter 5/6 - loss 1.10699414 - samples/sec: 78.26
2020-06-11 21:16:54,503 epoch 2 - iter 6/6 - loss 1.14799018 - samples/sec: 331.43
2020-06-11 21:17:05,857 ----------------------------------------------------------------------------------------------------
2020-06-11 21:17:05,858 EPOCH 2 done: loss 1.1480 - lr 0.1000000
2020-06-11 21:17:06,526 DEV : loss 1.2450100183486938 - score 0.7353
2020-06-11 21:17:06,559 BAD EPOCHS (no improvement): 0
saving best model
2020-06-11 21:17:08,425 --------

  _warn_prf(average, modifier, msg_start, len(result))


{'dev_loss_history': [1.3760476112365723,
  1.2450100183486938,
  1.7610528469085693,
  0.8523023128509521,
  0.8766542077064514,
  0.8038918375968933,
  1.3781912326812744,
  2.0921192169189453,
  1.024451732635498,
  1.8005850315093994],
 'dev_score_history': [0.6944,
  0.7353,
  0.7692,
  0.8333,
  0.8621,
  0.8772,
  0.7692,
  0.7353,
  0.8333,
  0.8197],
 'test_score': 0.8772,
 'train_loss_history': [1.5251025954882305,
  1.1479901770750682,
  0.7269907395044962,
  0.9134433368841807,
  0.5661698778470358,
  0.675675223271052,
  0.46617567042509717,
  0.4531128679712613,
  0.41995643575986225,
  0.16242523243029913]}

### Making predictions using finetuned classifier


In [6]:
# Load the second-stage checkpoint stored as best-model.pt in new_data_folder.
finetuned_classifier = TextClassifier.load(new_data_folder + 'best-model.pt')

2020-06-23 21:29:23,007 loading file ./drive/My Drive/Colab Notebooks/capstone/data/phase_2_mortgage_rate_oversampled/best-model.pt


In [7]:
# Bloomberg (BBG): predict on the unannotated mortgage-rate articles and export the results.
input_file_path = './drive/My Drive/Colab Notebooks/capstone/data/unannotated_for_predictions/predictions_dataset_mortgagerates_Bloomberg.csv'
output_file_path = './drive/My Drive/Colab Notebooks/capstone/data/predictions_output/unannotated_mortgagerates_Bloomberg_predictions.csv'
finetuned_model_predictions(input_file_path, finetuned_classifier, output_file_path)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


All 100 rows done prediction! 
Done export!


 0    70
-1    28
 1     2
Name: best_label, dtype: int64

In [8]:
# CBC: predict on the unannotated mortgage-rate articles and export the results.
# Note: input_file_path / output_file_path are reused from the Bloomberg cell and overwritten here.
input_file_path = './drive/My Drive/Colab Notebooks/capstone/data/unannotated_for_predictions/predictions_dataset_mortgagerates_cbc.csv'
output_file_path = './drive/My Drive/Colab Notebooks/capstone/data/predictions_output/unannotated_mortgagerates_CBC_predictions.csv'
finetuned_model_predictions(input_file_path, finetuned_classifier, output_file_path)

All 94 rows done prediction! 
Done export!


 0    70
-1    23
 1     1
Name: best_label, dtype: int64

### Error analysis - two news source prediction files

In [9]:

# Error analysis - Bloomberg News

# Reload the exported Bloomberg predictions from disk.
unannotated_mortgage_Bloomberg_predictions = pd.read_csv("./drive/My Drive/Colab Notebooks/capstone/data/predictions_output/unannotated_mortgagerates_Bloomberg_predictions.csv")

# Column subset for review: text plus the two most likely labels and their confidences.
# Despite the name, no sorting is applied here.
BBG_sorted = unannotated_mortgage_Bloomberg_predictions[['title_desc','best_label','best_confidence','second_likely','second_confidence']]

BBG_sorted

# Several questionable predictions - the positive predictions in particular look inaccurate?

Unnamed: 0,title_desc,best_label,best_confidence,second_likely,second_confidence
0,Fed warns of severe economic impact from pande...,-1,0.813356,0,0.182111
1,'Never seen anything as catastrophic': Malls i...,-1,0.543916,1,0.310926
2,"Once seen as safer than gold, Canadian real es...",0,0.475323,-1,0.471138
3,Bank of Canada set for bigger buying spree — h...,0,0.556337,-1,0.420379
4,One of Canada's largest private lenders halts ...,0,0.631627,-1,0.328775
...,...,...,...,...,...
95,It’s not just Vancouver: Property market slump...,1,0.701772,0,0.191667
96,"Toronto condo party to sober up in 2019, devel...",0,0.771178,-1,0.162955
97,Mortgage rules now 'overkill' that are hitting...,0,0.535101,-1,0.427269
98,Bank of Canada expands balance sheet list to m...,0,0.958254,1,0.030828


In [11]:
## Error analysis - Bloomberg News: positive-class predictions, least confident first


BBG_sorted.query('best_label == 1').sort_values('best_confidence')

# Only two positive predictions in the Bloomberg mortgage rate data - not enough positive examples in phase 2 training data?

# Less confident prediction - positive language used about an individual, not a wider economic implication
# (0.51) "RBC’s $100-billion woman buys more stocks — just not in the U.S.. Sarah Riopelle, senior portfolio manager at RBC Global Asset Management, boosted her equity allocation by 2 percentage points to 59 per cent over the past month"

# Unreasonably wrong - negative sentiment towards the housing market, but the use of "rising borrowing costs" and "increased government regulation" may skew towards positive language
# (0.70) "It’s not just Vancouver: Property market slump goes global, jolting cities from Hong Kong to Sydney. Rising borrowing costs, increased government regulation and volatile stock markets playing a role, along with dwindling demand from Chinese buyers"


Unnamed: 0,title_desc,best_label,best_confidence,second_likely,second_confidence
31,RBC’s $100-billion woman buys more stocks — ju...,1,0.507935,-1,0.2673
95,It’s not just Vancouver: Property market slump...,1,0.701772,0,0.191667


In [12]:
## Error analysis - Bloomberg News

## Less confident versus confident predictions of the negative class

BBG_sorted.query('best_label == -1').sort_values('best_confidence')

# Less confident prediction - mentions both a decline and growth in housing sales
# (0.59) "Canada’s home sales rise for third month, easing concerns about correction. CREA hikes its forecast from 1.6% decline to 1.2% growth"

# Confident in the negative label, but the explicit mention of rising mortgage rates suggests that the model has not been adequately trained on identifying positive classifications
# (0.80) "Mortgage rates are rising in Canada even as central bank cuts. Canada’s mortgage rates are creeping up — even though the country’s central bank has slashed borrowing costs to combat the COVID-19 pandemic."

Unnamed: 0,title_desc,best_label,best_confidence,second_likely,second_confidence
69,A closer look at the taxes that sent Vancouver...,-1,0.420038,0,0.365742
80,CIBC sees Canadian dollar falling to 15-year l...,-1,0.484138,0,0.43163
55,The bond floodgates open in Canada with at lea...,-1,0.491884,0,0.482687
39,Election pledges risk re-igniting Canadian hom...,-1,0.501516,0,0.441718
89,Interest rates cast pall on Canadian home sale...,-1,0.517787,0,0.462936
1,'Never seen anything as catastrophic': Malls i...,-1,0.543916,1,0.310926
94,The year higher borrowing costs and stricter m...,-1,0.558703,1,0.266926
23,Virus-driven interest rate cut could add keros...,-1,0.566852,0,0.373036
56,"Canada’s home sales rise for third month, easi...",-1,0.594216,0,0.38286
40,Liberals face tough battle over wallets in key...,-1,0.612068,0,0.363434


In [10]:
# Error analysis - CBC

# Reload the exported CBC predictions from disk.
unannotated_mortgage_CBC_predictions = pd.read_csv("./drive/My Drive/Colab Notebooks/capstone/data/predictions_output/unannotated_mortgagerates_CBC_predictions.csv")

# Column subset for review: text plus the two most likely labels and their confidences.
# Despite the name, no sorting is applied here.
CBC_sorted = unannotated_mortgage_CBC_predictions[['title_desc','best_label','best_confidence','second_likely','second_confidence']]

CBC_sorted  # these predictions look more accurate (presumably compared with the Bloomberg output)

Unnamed: 0,title_desc,best_label,best_confidence,second_likely,second_confidence
0,Mortgage arrears rate could spike to double wh...,-1,0.693398,0,0.286308
1,Interest rates are plunging — so why aren't mo...,0,0.793304,-1,0.179028
2,Why worries about the coronavirus are pushing ...,-1,0.718187,0,0.260812
3,U.S. Fed chair rules out negative interest rat...,-1,0.738995,0,0.246673
4,'Pretty cheap money': Canadian mortgage rates ...,-1,0.583257,0,0.389637
...,...,...,...,...,...
89,'Rents have just gone sky high': Cardigan cand...,0,0.738062,-1,0.211249
90,Forget Toronto. Buying in P.E.I. increasingly ...,0,0.917664,1,0.043547
91,"Toronto area housing sales up 24.3% in July, p...",0,0.804369,1,0.157610
92,'I've never felt shame like this in my life': ...,0,0.777915,1,0.111696


In [42]:
## Error analysis - CBC source

## Less confident versus confident predictions of the positive class

CBC_sorted.query('best_label == 1').sort_values('best_confidence')

# Only one positive prediction in the CBC mortgage rate data - not enough positive examples in phase 2 training data?
# "45 per cent of Hamilton renters living in unaffordable housing, new report says. Average rent in the downtown core, mountain has risen 40 per cent in 8 years"
# is a correct prediction, but the model is likely missing other positives due to training data limitations

Unnamed: 0,title_desc,best_label,best_confidence,second_likely,second_confidence
41,45 per cent of Hamilton renters living in unaf...,1,0.393134,-1,0.308323


In [43]:
## Error analysis - CBC source

## Less confident versus confident predictions of the negative class

CBC_sorted.query('best_label == -1').sort_values('best_confidence')

## Below: confident and wrong, but reasonable given that the wordings balance each other out:
# (0.94) "St. John's housing prices will lead country in growth, according to Moody's. Economist says below-trend prices, drop in unemployment rate leading to predicted increase"
# (0.84) "As the U.S. economy tilts toward concern, Canada's is on the upswing: Don Pittis. Federal Reserve chair Jerome Powell holds interest rates steady, but opens the door to future cuts"

## Less reasonable - positive language towards the housing market:
# (0.56) "CMHC reports annual pace of housing starts climbed 1.9% in August. Housing starts rose to 226,639 units in August, up from 222,467 units in July"

# Good predictions on the negative class, but overclassification of the neutral tag

Unnamed: 0,title_desc,best_label,best_confidence,second_likely,second_confidence
47,Bank of Canada's Poloz says global growth to r...,-1,0.498649,0,0.469633
37,Tenants left homeless after apartment fire str...,-1,0.503742,0,0.48349
79,Province not prepared to ease policies as hous...,-1,0.503884,0,0.444265
76,Yellowknife's housing market slows as economic...,-1,0.513553,0,0.455827
48,"Faced with rental housing crisis, Rosemont—La ...",-1,0.553428,0,0.41569
62,CMHC reports annual pace of housing starts cli...,-1,0.559098,0,0.227104
4,'Pretty cheap money': Canadian mortgage rates ...,-1,0.583257,0,0.389637
19,Average rent in Calgary climbs by 1.7% as popu...,-1,0.636779,0,0.338492
7,Canada's big banks cut credit card interest ra...,-1,0.648855,0,0.337319
45,More Fed cuts expected to push Canadian intere...,-1,0.659583,0,0.312544


In [None]:
# Takeaways from error analysis: the model overfits to the neutral tag despite the oversampled training data, so positive text is likely to be misclassified