# Zero Shot Learning and Natural Language Inference for Text Classification - Model Predicting

**Author:** Airc Miao
**Date:** 2024-02-17
**Rule:**
1. If any value is empty, return "low"
2. Otherwise, use the model to predict the label

In [1]:
import pandas as pd
import re
import nltk
import joblib
from sklearn.feature_extraction.text import CountVectorizer

# Give an example here. Correct output is "medium".

In [9]:
# Example complaint category fed to the classifier.
general_cap_classification = 'UNPROFESSIONAL CONDUCT'

# Example complaint narrative; adjacent literals are concatenated into one string.
summary = (
    "According to the complainant, on 11-6-22 at 8:30 PM, while in the "
    "confines of the 24th District, they were treated unprofessionally by "
    "an unknown officer. Police stopped them. The complainant states that "
    "the officer attempted to hit them with their patrol car, after "
    "accusing them of trying to go around other vehicles. The complainant "
    "states that the officer disrespected them, went over their loud "
    "speaker and made an inappropriate comment."
)

In [10]:
# Strip surrounding whitespace from both inputs so that whitespace-only
# values count as empty in the check below.
general_cap_classification = general_cap_classification.strip()
summary = summary.strip()

# Rule 1: when either input is empty, report "low".
if '' in (general_cap_classification, summary):
    print('low')

In [11]:
def cleanAndTokenizeText(txt):
    """Turn a raw complaint text into a space-separated string of stemmed tokens.

    Processing order:
      1. Drop the literal "According to the complainant," boilerplate.
      2. Remove date, time and "<N>th District" mentions using
         case-insensitive regular expressions.
      3. Tokenize with NLTK and lowercase every token.
      4. Discard English stopwords and tokens that are not purely alphanumeric.
      5. Porter-stem the survivors, then discard all-digit and
         one-character results.

    Args:
        txt: The raw text to clean.

    Returns:
        The remaining stemmed tokens joined by single spaces.
    """
    # Applied in this order; each match is replaced by the empty string.
    scrub_patterns = (
        r'(?:\,\s)*(?:on\s)*\d+-\d+-\d+[\s]*[\,]*',  # dates, e.g. "on 11-6-22"
        r'(at)?\s?\d+:\d+\s?(AM|PM)?\,?\s?',         # times, e.g. "at 8:30 PM,"
        r'(the\s)?\d+\w+\s?District',                # e.g. "the 24th District"
    )

    cleaned = txt.replace('According to the complainant,', '')
    for pattern in scrub_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)

    # Remove what is left of the location phrase once the district is gone.
    # NOTE(review): str.replace is case-sensitive and this literal starts with
    # a capital "W", so a lowercase "while in the confines of ," is left in
    # place. Presumably the fitted vectorizer/model saw the same behavior at
    # training time - confirm before changing it.
    cleaned = cleaned.replace("While in the confines of ,", '')

    raw_tokens = nltk.tokenize.word_tokenize(cleaned)
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    porter = nltk.stem.PorterStemmer()

    kept = []
    for raw_token in raw_tokens:
        token = raw_token.lower()
        # Skip stopwords and anything containing punctuation/symbols.
        if token in english_stopwords or not token.isalnum():
            continue
        stem = porter.stem(token)
        # Skip pure numbers and single-character leftovers.
        if stem.isdigit() or len(stem) <= 1:
            continue
        kept.append(stem)

    return ' '.join(kept)

In [17]:
# Build a one-row dataframe whose 'text' column combines the
# general_cap_classification and the summary, separated by '. '.
df = pd.DataFrame(
    {'text': [f'{general_cap_classification}. {summary}']}
)

# Normalize the combined text (boilerplate removal, tokenizing, stemming).
df['text'] = df['text'].apply(cleanAndTokenizeText)

# Load the previously saved CountVectorizer from disk.
count_vectorizer = joblib.load('vectorizer.pkl')

# Convert the cleaned text into the feature matrix expected by the model.
X = count_vectorizer.transform(df['text'])

In [18]:
# Load the model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model artifacts that come from a trusted source.
model = joblib.load('model_lr_cv.pkl')

# Apply the notebook's rule when choosing the final label:
#   1. If the classification or the summary is empty, the label is "low".
#   2. Otherwise the label is the model's prediction.
# Previously the prediction was always used, even for empty inputs.
if not general_cap_classification.strip() or not summary.strip():
    label = 'low'
else:
    # predict() returns one label per row; X holds a single row.
    label = model.predict(X)[0]

# Return the label
label



'medium'