In [180]:
import pandas as pd
import nltk
from nltk.classify import naivebayes

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from collections import Counter

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report

In [181]:
df = pd.read_csv('CRM_reviews_2.csv')
df.head(5)

Unnamed: 0,Index,Date,Review,Rating,Product,District
0,1,01/01/2015,"Reliable, not bad fast transactions wide range...",5,Mortgage,6
1,2,01/01/2015,Worried About Fraud But fast transactions wide...,5,Online Services,44
2,3,01/01/2015,Helped Me In A Case Of Fraud competitive rates...,5,Online Services,24
3,4,01/01/2015,Stable Interest Rate competitive rates easy on...,5,Online Services,65
4,5,01/01/2015,Where I Lost My Pin helpful customer service h...,5,Online Services,2


In [182]:
online = df[df['Product'] == 'Online Services'].copy()
mortgage = df[df['Product'] == 'Mortgage'].copy()

## Online Services Prep

In [183]:
OS_rating1 = online[online['Rating'] == 1].copy()
OS_rating5 = online[online['Rating'] == 5].copy()

In [184]:
print('number of 1-star reviews {}'.format(len(OS_rating1)))
print('number of 5-star reviews {}'.format(len(OS_rating5)))

number of 1-star reviews 7093
number of 5-star reviews 12027


In [185]:
#Standardise the data so there are the same sized samples of 1 and 5 rated reviews
#Sample appropriate sizes from 1's and 5's 

OS_rating1_sample = OS_rating1.sample(n=6000, random_state=42)
OS_rating5_sample = OS_rating5.sample(n=6000, random_state=42)
print('number of 1-star reviews {}'.format(len(OS_rating1_sample)))
print('number of 5-star reviews {}'.format(len(OS_rating5_sample)))

number of 1-star reviews 6000
number of 5-star reviews 6000


In [186]:
#concat your equally sized samples, filter to two columns, reset index
OS_sample_full = pd.concat([OS_rating1_sample, OS_rating5_sample])
OS_sample_full1 = OS_sample_full[['Rating', 'Review']]
OS_sample_full1 = OS_sample_full1.reset_index(drop=True)
OS_sample_full1

Unnamed: 0,Rating,Review
0,1,Rude Customer Service complicated to figure ou...
1,1,Why!? frustrating soo frustrating!strange erro...
2,1,Complicated Online Banking poor interest rates...
3,1,To Frustrate Me complicated to figure out bad ...
4,1,Soo Frustrating!Strange slow transactions slow...
...,...,...
11995,5,Efficient waved fees helpful customer service ...
11996,5,Helped Me In A Case Of Fraud friendly staff st...
11997,5,Worried About Fraud But friendly staff helpful...
11998,5,Secure From Fraud secure from fraud secure fro...


In [187]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def remove_stopwords_and_lemmatize(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words and lemmatize the tokens
    filtered_tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens if word.lower() not in stop_words and word.isalpha()]
    # Reconstruct the text without stop words
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [188]:
# Now when you run the function to remove stopwords and lemmatize, it should not give you the warning
OS_sample_full1['Review'] = OS_sample_full1['Review'].apply(remove_stopwords_and_lemmatize)
#tokenise each
OS_sample_full1['Tokens'] = OS_sample_full1['Review'].apply(word_tokenize)

In [189]:
OS_sample_full1.Tokens[1]

['frustrating',
 'soo',
 'frustrating',
 'strange',
 'error',
 'kept',
 'cropping',
 'poor',
 'interest',
 'rate',
 'additional',
 'charge',
 'frustrating']

In [190]:
OS_sample_full1

Unnamed: 0,Rating,Review,Tokens
0,1,rude customer service complicated figure soo f...,"[rude, customer, service, complicated, figure,..."
1,1,frustrating soo frustrating strange error kept...,"[frustrating, soo, frustrating, strange, error..."
2,1,complicated online banking poor interest rate ...,"[complicated, online, banking, poor, interest,..."
3,1,frustrate complicated figure bad advice error ...,"[frustrate, complicated, figure, bad, advice, ..."
4,1,soo frustrating strange slow transaction slow ...,"[soo, frustrating, strange, slow, transaction,..."
...,...,...,...
11995,5,efficient waved fee helpful customer service f...,"[efficient, waved, fee, helpful, customer, ser..."
11996,5,helped case fraud friendly staff stable intere...,"[helped, case, fraud, friendly, staff, stable,..."
11997,5,worried fraud friendly staff helpful customer ...,"[worried, fraud, friendly, staff, helpful, cus..."
11998,5,secure fraud secure fraud secure fraud fast tr...,"[secure, fraud, secure, fraud, secure, fraud, ..."


## Mortgage Prep

In [191]:
mort_rating1 = mortgage[mortgage['Rating'] == 1].copy()
mort_rating5 = mortgage[mortgage['Rating'] == 5].copy()

In [192]:
print('number of 1-star reviews {}'.format(len(mort_rating1)))
print('number of 5-star reviews {}'.format(len(mort_rating5)))

number of 1-star reviews 1027
number of 5-star reviews 806


In [193]:
#Standardise the data so there are the same sized samples of 1 and 5 rated reviews
#Sample appropriate sizes from 1's and 5's 

mort_rating1_sample = mort_rating1.sample(n=750, random_state=42)
mort_rating5_sample = mort_rating5.sample(n=750, random_state=42)
print('number of 1-star reviews {}'.format(len(mort_rating1_sample)))
print('number of 5-star reviews {}'.format(len(mort_rating5_sample)))

number of 1-star reviews 750
number of 5-star reviews 750


In [194]:
#concat your equally sized samples, filter to two columns, reset index
mort_sample_full = pd.concat([mort_rating1_sample, mort_rating5_sample])
mort_sample_full1 = mort_sample_full[['Rating', 'Review']]
mort_sample_full1 = mort_sample_full1.reset_index(drop=True)
mort_sample_full1

Unnamed: 0,Rating,Review
0,1,Insecure additional charges limited services i...
1,1,What Happened what happened fees rude customer...
2,1,Poor Interest Rates rude customer service addi...
3,1,What Happened what happened rude customer serv...
4,1,High Fees what happened i hope youre proud of ...
...,...,...
1495,5,Stressed At First Tho Thank you! Thank you!.
1496,5,Not Bad wide range of services Thank you! help...
1497,5,Thank You! very good with ease not bad excelle...
1498,5,Stressed At First Tho worried but got really h...


In [195]:
# Now when you run the function to remove stopwords and lemmatize, it should not give you the warning
mort_sample_full1['Review'] = mort_sample_full1['Review'].apply(remove_stopwords_and_lemmatize)
#tokenise each
mort_sample_full1['Tokens'] = mort_sample_full1['Review'].apply(word_tokenize)

In [196]:
mort_sample_full1.Tokens[1]

['happened', 'happened', 'fee', 'rude', 'customer', 'service']

In [197]:
mort_sample_full1

Unnamed: 0,Rating,Review,Tokens
0,1,insecure additional charge limited service hop...,"[insecure, additional, charge, limited, servic..."
1,1,happened happened fee rude customer service,"[happened, happened, fee, rude, customer, serv..."
2,1,poor interest rate rude customer service addit...,"[poor, interest, rate, rude, customer, service..."
3,1,happened happened rude customer service bad fi...,"[happened, happened, rude, customer, service, ..."
4,1,high fee happened hope youre proud rude custom...,"[high, fee, happened, hope, youre, proud, rude..."
...,...,...,...
1495,5,stressed first tho thank thank,"[stressed, first, tho, thank, thank]"
1496,5,bad wide range service thank helpful customer ...,"[bad, wide, range, service, thank, helpful, cu..."
1497,5,thank good ease bad excellent financial advice,"[thank, good, ease, bad, excellent, financial,..."
1498,5,stressed first tho worried got really helpful ...,"[stressed, first, tho, worried, got, really, h..."


## Model Building

### Online Services

In [198]:
# Prepare the data
def document_features(token_list):
    return {token: True for token in token_list}

OS_data = [(document_features(tokens), str(score)) for tokens, score in zip(OS_sample_full1['Tokens'], OS_sample_full1['Rating'])]

In [199]:
OS_data

[({'rude': True,
   'customer': True,
   'service': True,
   'complicated': True,
   'figure': True,
   'soo': True,
   'frustrating': True,
   'strange': True,
   'slow': True,
   'transaction': True,
   'error': True,
   'long': True,
   'wait': True,
   'time': True,
   'lost': True,
   'detail': True,
   'staff': True,
   'unhelpful': True},
  '1'),
 ({'frustrating': True,
   'soo': True,
   'strange': True,
   'error': True,
   'kept': True,
   'cropping': True,
   'poor': True,
   'interest': True,
   'rate': True,
   'additional': True,
   'charge': True},
  '1'),
 ({'complicated': True,
   'online': True,
   'banking': True,
   'poor': True,
   'interest': True,
   'rate': True,
   'frustrate': True},
  '1'),
 ({'frustrate': True,
   'complicated': True,
   'figure': True,
   'bad': True,
   'advice': True,
   'error': True,
   'kept': True,
   'cropping': True},
  '1'),
 ({'soo': True,
   'frustrating': True,
   'strange': True,
   'slow': True,
   'transaction': True,
   'com

In [200]:
import nltk.classify
# Split the data
OS_train_data, OS_test_data = train_test_split(OS_data, test_size=0.25, random_state=42)

# Train the classifier
OS_clf = NaiveBayesClassifier.train(OS_train_data)

print("Classifier accuracy percent:",(nltk.classify.accuracy(OS_clf, OS_test_data))*100)

Classifier accuracy percent: 100.0


In [201]:
from sklearn.metrics import confusion_matrix

OS_test_actual = [label for _, label in OS_test_data]
OS_test_pred = [OS_clf.classify(features) for features, _ in OS_test_data]

# Generate the confusion matrix using sklearn
OS_cm = confusion_matrix(OS_test_actual, OS_test_pred)


# You might need to explicitly define your labels
OSlabels = sorted(set(OS_test_actual))  # Sort labels if necessary

OS_cm_df = pd.DataFrame(OS_cm, index=OSlabels, columns=OSlabels)

In [202]:
OS_cm_df

Unnamed: 0,1,5
1,1527,0
5,0,1473


In [203]:
OS_accuracy = accuracy_score(OS_test_actual, OS_test_pred)
print(f"Accuracy: {OS_accuracy:.2f}")



OS_report = classification_report(OS_test_actual, OS_test_pred)
print(OS_report)

Accuracy: 1.00
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1527
           5       1.00      1.00      1.00      1473

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000



In [204]:
# Show the most informative features
informative_features =OS_clf.show_most_informative_features(100)

Most Informative Features
                   fraud = None                1 : 5      =      3.6 : 1.0
                    rate = True                5 : 1      =      2.1 : 1.0
                   error = None                5 : 1      =      1.9 : 1.0
             complicated = None                5 : 1      =      1.8 : 1.0
             frustrating = None                5 : 1      =      1.8 : 1.0
                    rate = None                1 : 5      =      1.7 : 1.0
                   quick = None                1 : 5      =      1.5 : 1.0
               responded = None                1 : 5      =      1.5 : 1.0
                  secure = None                1 : 5      =      1.5 : 1.0
             competitive = None                1 : 5      =      1.5 : 1.0
                     fee = None                1 : 5      =      1.5 : 1.0
                   waved = None                1 : 5      =      1.5 : 1.0
                    case = None                1 : 5      =      1.5 : 1.0

### Mortgages

In [205]:
# Prepare the data
def document_features(token_list):
    return {token: True for token in token_list}

mort_data = [(document_features(tokens), str(score)) for tokens, score in zip(mort_sample_full1['Tokens'], mort_sample_full1['Rating'])]

In [206]:
mort_data

[({'insecure': True,
   'additional': True,
   'charge': True,
   'limited': True,
   'service': True,
   'hope': True,
   'youre': True,
   'proud': True},
  '1'),
 ({'happened': True,
   'fee': True,
   'rude': True,
   'customer': True,
   'service': True},
  '1'),
 ({'poor': True,
   'interest': True,
   'rate': True,
   'rude': True,
   'customer': True,
   'service': True,
   'additional': True,
   'charge': True,
   'limited': True},
  '1'),
 ({'happened': True,
   'rude': True,
   'customer': True,
   'service': True,
   'bad': True,
   'financial': True,
   'advice': True,
   'distressed': True,
   'limited': True,
   'high': True,
   'fee': True},
  '1'),
 ({'high': True,
   'fee': True,
   'happened': True,
   'hope': True,
   'youre': True,
   'proud': True,
   'rude': True,
   'customer': True,
   'service': True,
   'additional': True,
   'charge': True},
  '1'),
 ({'great': True,
   'mum': True,
   'hope': True,
   'youre': True,
   'proud': True,
   'bad': True,
   'fin

In [207]:
import nltk.classify
# Split the data
mort_train_data, mort_test_data = train_test_split(mort_data, test_size=0.25, random_state=42)

# Train the classifier
mort_clf = NaiveBayesClassifier.train(mort_train_data)

print("Classifier accuracy percent:",(nltk.classify.accuracy(mort_clf, mort_test_data))*100)

Classifier accuracy percent: 100.0


In [208]:
from sklearn.metrics import confusion_matrix

mort_test_actual = [label for _, label in mort_test_data]
mort_test_pred = [mort_clf.classify(features) for features, _ in mort_test_data]

# Generate the confusion matrix using sklearn
mort_cm = confusion_matrix(mort_test_actual, mort_test_pred)


# You might need to explicitly define your labels
mortlabels = sorted(set(mort_test_actual))  # Sort labels if necessary

mort_cm_df = pd.DataFrame(mort_cm, index=mortlabels, columns=mortlabels)

In [209]:
mort_cm_df

Unnamed: 0,1,5
1,182,0
5,0,193


In [210]:
mort_accuracy = accuracy_score(mort_test_actual, mort_test_pred)
print(f"Accuracy: {mort_accuracy:.2f}")



mort_report = classification_report(mort_test_actual, mort_test_pred)
print(mort_report)

Accuracy: 1.00
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       182
           5       1.00      1.00      1.00       193

    accuracy                           1.00       375
   macro avg       1.00      1.00      1.00       375
weighted avg       1.00      1.00      1.00       375



In [211]:
# Show the most informative features
mort_informative_features =mort_clf.show_most_informative_features(100)

Most Informative Features
                 helpful = None                1 : 5      =      2.0 : 1.0
                    rate = True                5 : 1      =      1.9 : 1.0
                     fee = True                1 : 5      =      1.6 : 1.0
                    rate = None                1 : 5      =      1.5 : 1.0
                   range = None                1 : 5      =      1.4 : 1.0
                    wide = None                1 : 5      =      1.4 : 1.0
                    fast = None                1 : 5      =      1.4 : 1.0
             competitive = None                1 : 5      =      1.4 : 1.0
                    good = None                1 : 5      =      1.4 : 1.0
                   froze = None                1 : 5      =      1.4 : 1.0
                     mum = None                5 : 1      =      1.4 : 1.0
                   thank = None                1 : 5      =      1.4 : 1.0
                reliable = None                1 : 5      =      1.4 : 1.0

## Testing the Model

In [212]:
def simple_preprocess_and_features(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())  # Lowercasing and tokenizing
    # Convert tokens to a feature dictionary
    features = {word: True for word in tokens}
    return features

### Mortgage

In [213]:
# Positive Review - Mortgage
new_text = "Smooth mortgage process with excellent customer service, making my dream home purchase a reality."

# Preprocess the text and extract features
features = simple_preprocess_and_features(new_text)

# Use the classifier to predict
predicted_label = mort_clf.classify(features)
print(f"Predicted label: {predicted_label}")

# Get the probability distribution
prob_dist = mort_clf.prob_classify(features)
print(f"Probabilities:")
for label in mort_clf.labels():
    print(f"{label}: {prob_dist.prob(label):.4f}")

Predicted label: 5
Probabilities:
5: 0.9971
1: 0.0029


In [214]:
# Positive Review - Mortgage
new_text = "Competitive interest rates and flexible repayment options, allowing me to save significantly on my home loan."

# Preprocess the text and extract features
features = simple_preprocess_and_features(new_text)

# Use the classifier to predict
predicted_label = mort_clf.classify(features)
print(f"Predicted label: {predicted_label}")

# Get the probability distribution
prob_dist = mort_clf.prob_classify(features)
print(f"Probabilities:")
for label in mort_clf.labels():
    print(f"{label}: {prob_dist.prob(label):.4f}")

Predicted label: 5
Probabilities:
5: 0.9973
1: 0.0027


In [235]:
# Negative Review - Mortgage
new_text = "Constant delays and miscommunication throughout the mortgage application, causing frustration and anxiety."

# Preprocess the text and extract features
features = simple_preprocess_and_features(new_text)

# Use the classifier to predict
predicted_label = mort_clf.classify(features)
print(f"Predicted label: {predicted_label}")

# Get the probability distribution
prob_dist = mort_clf.prob_classify(features)
print(f"Probabilities:")
for label in mort_clf.labels():
    print(f"{label}: {prob_dist.prob(label):.4f}")

Predicted label: 1
Probabilities:
5: 0.4951
1: 0.5049


In [216]:
# Negative Review - Mortgage
new_text = "Hidden fees and unexpected charges added to the mortgage agreement, leading to unexpected financial burden."

# Preprocess the text and extract features
features = simple_preprocess_and_features(new_text)

# Use the classifier to predict
predicted_label = mort_clf.classify(features)
print(f"Predicted label: {predicted_label}")

# Get the probability distribution
prob_dist = mort_clf.prob_classify(features)
print(f"Probabilities:")
for label in mort_clf.labels():
    print(f"{label}: {prob_dist.prob(label):.4f}")

Predicted label: 1
Probabilities:
5: 0.4734
1: 0.5266


In [217]:
# Complex Postive Review - Mortgage
new_text = "Despite initial apprehensions, the mortgage process proved surprisingly efficient with personalised guidance, though the lack of transparency regarding fluctuating interest rates posed challenges for long-term financial planning."

# Preprocess the text and extract features
features = simple_preprocess_and_features(new_text)

# Use the classifier to predict
predicted_label = mort_clf.classify(features)
print(f"Predicted label: {predicted_label}")

# Get the probability distribution
prob_dist = mort_clf.prob_classify(features)
print(f"Probabilities:")
for label in mort_clf.labels():
    print(f"{label}: {prob_dist.prob(label):.4f}")

Predicted label: 1
Probabilities:
5: 0.4970
1: 0.5030


In [218]:
# Neutral Review - Mortgage
new_text = "The mortgage application process was standard, with satisfactory rates and terms, although the absence of comprehensive financial education resources left room for improvement in empowering borrowers to make informed decisions."

# Preprocess the text and extract features
features = simple_preprocess_and_features(new_text)

# Use the classifier to predict
predicted_label = mort_clf.classify(features)
print(f"Predicted label: {predicted_label}")

# Get the probability distribution
prob_dist = mort_clf.prob_classify(features)
print(f"Probabilities:")
for label in mort_clf.labels():
    print(f"{label}: {prob_dist.prob(label):.4f}")

Predicted label: 1
Probabilities:
5: 0.4734
1: 0.5266


### Online Services

In [219]:
# Positive Review - Online Services
new_text = "Convenient online banking platform with user-friendly interface, making managing finances hassle-free."

# Preprocess the text and extract features
features = simple_preprocess_and_features(new_text)

# Use the classifier to predict
predicted_label = OS_clf.classify(features)
print(f"Predicted label: {predicted_label}")

# Get the probability distribution
prob_dist = OS_clf.prob_classify(features)
print(f"Probabilities:")
for label in OS_clf.labels():
    print(f"{label}: {prob_dist.prob(label):.4f}")

Predicted label: 5
Probabilities:
5: 0.6197
1: 0.3803


In [220]:
# Negative Review - Online Services
new_text = "Frequent technical glitches and slow response times on the mobile app, hindering banking transactions."

# Preprocess the text and extract features
features = simple_preprocess_and_features(new_text)

# Use the classifier to predict
predicted_label = OS_clf.classify(features)
print(f"Predicted label: {predicted_label}")

# Get the probability distribution
prob_dist = OS_clf.prob_classify(features)
print(f"Probabilities:")
for label in OS_clf.labels():
    print(f"{label}: {prob_dist.prob(label):.4f}")

Predicted label: 1
Probabilities:
5: 0.0006
1: 0.9994


In [221]:
# Complex Negative Review - Online Services
new_text = "While the online banking platform boasted advanced security features, the complex authentication procedures and limited customer support channels made resolving issues a cumbersome ordeal, overshadowing its convenience."

# Preprocess the text and extract features
features = simple_preprocess_and_features(new_text)

# Use the classifier to predict
predicted_label = OS_clf.classify(features)
print(f"Predicted label: {predicted_label}")

# Get the probability distribution
prob_dist = OS_clf.prob_classify(features)
print(f"Probabilities:")
for label in OS_clf.labels():
    print(f"{label}: {prob_dist.prob(label):.4f}")

Predicted label: 1
Probabilities:
5: 0.0009
1: 0.9991


In [222]:
# Neutral Review - Online Services
new_text = "The online banking experience offered a balance of convenience and security, yet the interface design lacked innovation, presenting a functional but unremarkable user journey for everyday transactions."

# Preprocess the text and extract features
features = simple_preprocess_and_features(new_text)

# Use the classifier to predict
predicted_label = OS_clf.classify(features)
print(f"Predicted label: {predicted_label}")

# Get the probability distribution
prob_dist = OS_clf.prob_classify(features)
print(f"Probabilities:")
for label in OS_clf.labels():
    print(f"{label}: {prob_dist.prob(label):.4f}")

Predicted label: 5
Probabilities:
5: 0.6197
1: 0.3803


## Attempting to create a filter

In [238]:
def simple_preprocess_and_features(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())  # Lowercasing and tokenizing
    # Convert tokens to a feature dictionary
    features = {word: True for word in tokens}
    return features

features = simple_preprocess_and_features(text)

In [242]:
online_model = OS_clf
mortgage_model = mort_clf

def classify_post(text, online_model, mortgage_model):
     # Use the appropriate classifier to predict
    if 'online' in text.lower():
        predicted_label = online_model.classify(features)
        prob_dist = online_model.prob_classify(features)
        pred_source = 'Online'
    elif 'mortgage' in text.lower() or 'home' in text.lower():
        predicted_label = mortgage_model.classify(features)
        prob_dist = mortgage_model.prob_classify(features)
        pred_source = 'Mortgage'
    else:
        return 'Unknown'

    # Return the predicted label and probability distribution
    return predicted_label, prob_dist, pred_source

text = 'I had a great experience with the online service.'

predicted_label, prob_dist, pred_source = classify_post(text, online_model, mortgage_model)

# Output the results
print('Type: ', pred_source)
print('Predicted label:', predicted_label)
print('Probability distribution:')
for label in prob_dist.samples():
    print(f"{label}: {prob_dist.prob(label):.4f}")

Type:  Online
Predicted label: 5
Probability distribution:
5: 0.9997
1: 0.0003


In [243]:
online_model = OS_clf
mortgage_model = mort_clf

def classify_post(text, online_model, mortgage_model):
     # Use the appropriate classifier to predict
    if 'online' in text.lower():
        predicted_label = online_model.classify(features)
        prob_dist = online_model.prob_classify(features)
        pred_source = 'Online'
    elif 'mortgage' in text.lower() or 'home' in text.lower():
        predicted_label = mortgage_model.classify(features)
        prob_dist = mortgage_model.prob_classify(features)
        pred_source = 'Mortgage'
    else:
        return 'Unknown'

    # Return the predicted label and probability distribution
    return predicted_label, prob_dist, pred_source

text = 'Using online services for financial matters has been disappointing lately, encountering frequent technical issues and sluggish response times.'
 
predicted_label, prob_dist, pred_source = classify_post(text, online_model, mortgage_model)

# Output the results
print('Type: ', pred_source)
print('Predicted label:', predicted_label)
print('Probability distribution:')
for label in prob_dist.samples():
    print(f"{label}: {prob_dist.prob(label):.4f}")

Type:  Online
Predicted label: 5
Probability distribution:
5: 0.9997
1: 0.0003


In [244]:
online_model = OS_clf
mortgage_model = mort_clf

def classify_post(text, online_model, mortgage_model):
     # Use the appropriate classifier to predict
    if 'online' in text.lower():
        predicted_label = online_model.classify(features)
        prob_dist = online_model.prob_classify(features)
        pred_source = 'Online'
    elif 'mortgage' in text.lower() or 'home' in text.lower():
        predicted_label = mortgage_model.classify(features)
        prob_dist = mortgage_model.prob_classify(features)
        pred_source = 'Mortgage'
    else:
        return 'Unknown'

    # Return the predicted label and probability distribution
    return predicted_label, prob_dist, pred_source

text = 'Competitive interest rates and flexible repayment options, allowing me to save significantly on my home loan.'
 
online_model = OS_clf
mortgage_model = mort_clf
predicted_label, prob_dist, pred_source = classify_post(text, online_model, mortgage_model)

# Output the results
print('Type: ', pred_source)
print('Predicted label:', predicted_label)
print('Probability distribution:')
for label in prob_dist.samples():
    print(f"{label}: {prob_dist.prob(label):.4f}")



Type:  Mortgage
Predicted label: 5
Probability distribution:
5: 0.9973
1: 0.0027
