## Dataset loading

In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv("/content/drive/MyDrive/complaints.csv")

In [None]:
df = df[["Product", "Consumer complaint narrative"]]

In [None]:
df = df.dropna()

In [None]:
# Collapse the raw `Product` values into seven coarse classes.
# Entries are grouped by the class they map to.
class_dict = {
  # loan
  'Vehicle loan or lease': 'loan',
  'Payday loan, title loan, or personal loan': 'loan',
  'Student loan': 'loan',
  'Consumer Loan': 'loan',
  'Payday loan': 'loan',

  # credit_report
  'Credit reporting, credit repair services, or other personal consumer reports': 'credit_report',
  'Credit reporting': 'credit_report',
  'Other financial service': 'credit_report',

  # card
  'Credit card or prepaid card': 'card',
  'Credit card': 'card',
  'Prepaid card': 'card',

  # money_transfer
  'Money transfer, virtual currency, or money service': 'money_transfer',
  'Money transfers': 'money_transfer',
  'Virtual currency': 'money_transfer',

  # classes that keep their original name
  'Mortgage': 'Mortgage',
  'Debt collection': 'Debt collection',

  # account
  'Checking or savings account': 'account',
  'Bank account or service': 'account',
}

In [None]:
df.replace({'Product': class_dict}, inplace=True)

In [None]:
# Build a balanced dataset: the same number of complaints from every product class.
SAMPLES_PER_CLASS = 1000
SAMPLE_SEED = 42  # fixed seed so a fresh kernel re-creates the same dataset

# Collect one sample per class and concatenate once. Each class must be added
# exactly once: adding it twice duplicates every row, and identical rows then
# land on both sides of the later train/test split.
# `.sample` raises ValueError if a class has fewer than SAMPLES_PER_CLASS rows.
class_samples = [
  df[df['Product'] == product].sample(SAMPLES_PER_CLASS, random_state=SAMPLE_SEED)
  for product in df['Product'].unique()
]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
sampled_df = pd.concat(class_samples, ignore_index=True)


In [None]:
import re

In [None]:
def text_cleaning(text):
  """Normalise a complaint narrative for classification.

  The text is lower-cased, then three kinds of characters are deleted in
  order: punctuation, digits, and runs of two or more 'x'. Whitespace is
  left untouched, so gaps remain where tokens were removed.

  Args:
    text: Raw narrative string.

  Returns:
    The cleaned string.
  """
  removal_patterns = (
    r'[^\w\s]',   # punctuation (anything that is neither a word char nor whitespace)
    r'[0-9]',     # digits
    r'[x]{2,}',   # runs of 2+ 'x' -- presumably the XXXX redaction masks, already lower-cased
  )

  cleaned = text.lower()
  for pattern in removal_patterns:
    cleaned = re.sub(pattern, '', cleaned)

  return cleaned

In [None]:
sampled_df['Consumer complaint narrative'] = sampled_df['Consumer complaint narrative'].apply(text_cleaning)

In [None]:
# Integer id for every coarse product class; the position in this list is the id.
class_names = [
  'loan',
  'credit_report',
  'card',
  'money_transfer',
  'Mortgage',
  'Debt collection',
  'account',
]
classes = {name: idx for idx, name in enumerate(class_names)}

# Swap the class names in `Product` for their integer ids.
sampled_df = sampled_df.replace({'Product': classes})

In [None]:
sampled_df.columns = ['label', 'text']

In [None]:
sampled_df.to_csv("complaint_dataset.csv", index=False)

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [None]:
from datasets import load_dataset

In [None]:
data = load_dataset('csv', data_files='complaint_dataset.csv')

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
train_test = data['train'].train_test_split(test_size=0.2, seed=42)

In [None]:
train_test['test']

Dataset({
    features: ['label', 'text'],
    num_rows: 1400
})

## Load fine-tuned model

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

In [None]:
model_checkpoint = "/content/drive/MyDrive/output_dir/checkpoint-1000"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

In [None]:
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [None]:
train_test['test']['text'][:1]

['good day my name is   and i currently have a mortgage loan with citi mortgage in the city of  my property taxes were lower by  by the city of  documentation was sent over to citi mortgage and only to find out my mortgage will only be lower by  for the year i am current with my mortgage so i do not understand their calculation i called the company and they are lying about escrow differences i was told the same thing by citi staff last year when they raised my mortgage from  to  i really need your help to correct this matter because i know i will lose my home with citi mortgage scheme thank you']

In [None]:
train_test['test']['label'][:1]

[4]

In [None]:
predz = classifier(train_test['test']['text'][:100], padding=True, truncation=True)

In [None]:
predz

In [None]:
# Pipeline labels look like 'LABEL_<id>'. Parse everything after the last
# underscore so multi-digit ids are read in full; taking only the final
# character would turn 'LABEL_10' into 0.
y_pred = [int(pred['label'].rsplit('_', 1)[-1]) for pred in predz]

In [None]:
y_pred

In [None]:
# Print the numeric class id of each prediction. The id is everything after the
# last underscore of 'LABEL_<id>', so multi-digit ids are read in full.
for label in predz:
  print(int(label['label'].rsplit('_', 1)[-1]))

In [None]:
predz

In [None]:
y_actual = train_test['test']['label'][:100]

In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
accuracy_score(y_actual, y_pred)

0.8

In [None]:
f1_score(y_actual, y_pred, average='macro')

0.802880645238878

In [None]:
## Business impact

In [None]:
train_test['test']['label'][:1]

[4]

In [None]:
complaint = train_test['test']['text'][:1]

prediction = classifier(complaint, padding=True, truncation=True)

prediction

[{'label': 'LABEL_4', 'score': 0.9914877414703369}]

In [None]:
prediction[0]['label']

'LABEL_4'

In [None]:
def get_receiver_email(prediction):
  """Return the mailbox that handles the class predicted for a complaint.

  Args:
    prediction: Classifier output, a sequence of dicts. Only the 'label'
      entry of the first dict is read.

  Returns:
    The receiver e-mail address configured for that label.

  Raises:
    ValueError: If the predicted label has no configured receiver.
  """
  # Routing table: one mailbox per label id.
  receivers = {
    'LABEL_0': 'XXXXXXX@gmail.com',
    'LABEL_1': 'XXXXXXX@yahoo.com',
    'LABEL_2': 'XXXXXXX@gmail.com',
    'LABEL_3': 'XXXXXXX@gmail.com',
    'LABEL_4': 'XXXXXXX@gmail.com',
    'LABEL_5': 'XXXXXXX@gmail.com',
    'LABEL_6': 'XXXXXXX@gmail.com',
  }

  label = prediction[0]['label']

  # Fail with a clear message instead of returning an unbound local when the
  # model emits a label this table does not know about.
  if label not in receivers:
    raise ValueError(f'no receiver configured for label {label!r}')

  return receivers[label]

## Sending mail

In [None]:
from google.colab import userdata

sender_password = userdata.get('EMAIL_PASSWORD')

In [None]:
import smtplib, ssl


def send_mail(receiver_email, message):
  """Send `message` to `receiver_email` via Gmail's SMTP-over-SSL endpoint.

  Logs in as the fixed sender account using the module-level
  `sender_password`, sends the message, and prints 'mail sent' on success.

  Args:
    receiver_email: Destination address, passed to `sendmail`.
    message: Message text, passed to `sendmail` unchanged.
  """
  port = 465
  smtp_server = 'smtp.gmail.com'

  sender_email = 'ugaledatta41@gmail.com'

  # The default context verifies the server certificate and hostname; it only
  # takes effect when it is actually handed to SMTP_SSL.
  context = ssl.create_default_context()

  # The with-block closes the connection even if login or sending raises.
  with smtplib.SMTP_SSL(smtp_server, port, context=context) as server:
    server.login(sender_email, sender_password)
    server.sendmail(sender_email, receiver_email, message)

  print('mail sent')

In [None]:
def forward_complaint(complaint):
  """Classify a complaint and e-mail it to the mailbox for its predicted class.

  Args:
    complaint: The complaint text, either a single string or a list of
      strings. For a list, the first prediction selects the receiver and
      the first element is used as the mail body.

  Returns:
    {"message": "complaint forwarded"} after `send_mail` has returned.
  """
  # A bare string would be indexed character-wise below (complaint[0] would be
  # its first letter), so wrap it in a list first.
  if isinstance(complaint, str):
    complaint = [complaint]

  prediction = classifier(complaint, padding=True, truncation=True)
  print(prediction)

  receiver_email = get_receiver_email(prediction)
  print(receiver_email)

  send_mail(receiver_email, complaint[0])

  return {"message" : "complaint forwarded"}



complaint = train_test['test']['text'][100:101]

forward_complaint(complaint)