In [2]:
import re

import pandas as pd

import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score

import math

import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')

from sklearn.feature_extraction.text import CountVectorizer

from collections import defaultdict

from textblob import TextBlob

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aniketarahane/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/aniketarahane/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
dataset = pd.read_csv('~/Downloads/dataset - Sheet2 (1).csv')
dataset

Unnamed: 0,text,categories
0,LETTER OF RESPONSE 1/3/2015 Marcus Clinton 3 ...,roommate/family problems
1,-- As a solution to the law soilt we ask to s...,money/ job problems
2,From: Ally Newman Fax: (253) 891-5440 To: Fax...,landlord/rental agreement issues
3,From: Ally Newman Fax: (253) 891-5440 To: Fax...,landlord/rental agreement issues
4,From: Ally Newman Fax: (253) 891-5440 To: Fax...,landlord/rental agreement issues
5,"Eviction Summons Madrona Ridge Residential, L...",landlord/rental agreement issues
6,Dec. 22. 2014. 3:40 PM ORD PLLC FUCKtilY ROX ...,money/ job problems
7,age : 2. of 2 01/7/2015 13:14 PM PHONE #30380...,money/ job problems
8,"Kimani Kironji Emily Coates E, Jessie Lewis J...",landlord/rental agreement issues
9,"black mold, In attempt to have the landlord h...",landlord/rental agreement issues


In [4]:
def remove_tags(string):
    result = re.findall("[a-zA-Z]+",string)
    result = (" ".join(result))
    result = result.lower()
    new_doc = TextBlob(result)
    #result = ("".join(result))
    result = new_doc.correct()
    result = ("".join(result))
    #result = re.sub('','',string)          #remove HTML tags; not really necessary
    #result = re.sub('https://.*','',result)   #remove URLs; not really necessary
    #result = re.sub(r'[^w'+removelist+']', ' ',result)    #remove non-alphanumeric characters; for some reason this makes everything a w which is annoying because this is really the only necessary one for this dataset
    return result
dataset['text'] = dataset['text'].apply(lambda cw : remove_tags(cw))

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) #gets rid of meaningless words
dataset['text'] = dataset['text'].apply(lambda x: ' '. join([word for word in x.split() if word not in (stop_words)]))

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
dataset
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
           st = st + lemmatizer.lemmatize(w) + " "
    return st
dataset['text'] = dataset.text.apply(lemmatize_text)
dataset['text'] 
#I wanted to do a spelling correcting since a lot of the words were spelt wrong but it was taking way too long to run; i will try to find a more efficient method

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aniketarahane/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0     letter response marcus clinton correct graham ...
1     solution law soil ask set payment roman comedi...
2     ally german fax fax page law office keen knuts...
3     ally german fax fax page found property th ave...
4     ally german fax fax page know going people loo...
5     election summons matrena ridge residential pla...
6     dec pm ord luckily box p dec may concern respo...
7     age pm phone bent wright caesar ra tillman dat...
8     kimani kironji emily coat e sessile lewis jan ...
9     black mold attempt landlord handle matter agre...
10    mail plus copy olympic dr te h gig harbor wa g...
11    page r id attorney aug mark hue vasili plainti...
12    fro jan pm est page laurel blunt hannah pierce...
13    jan p hour hand p dec hour hand p h st sarcoma...
14    unit limb page pocket ty bedford january th b ...
15    unit limb page und sent fee court cast whateve...
16    ace page please fax rickets response confine d...
17    ace page following attempt deliver notice 

In [5]:
reviews = dataset['text'].values
labels = dataset['categories'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)#pretty self explanatory just encodes values to the labels and text

In [6]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify = encoded_labels)

In [7]:
vec = CountVectorizer(max_features = 3000)
X = vec.fit_transform(train_sentences)
vocab = vec.get_feature_names_out()
X = X.toarray()
word_counts = {}
for l in range(4):
    word_counts[l] = defaultdict(lambda: 0)
for i in range(X.shape[0]):
    l = train_labels[i]
    for j in range(len(vocab)):
        word_counts[l][vocab[j]] += X[i][j]
#so now we're starting to build a naive bayes classifier; couldve used a variant from sklearn library but just to get a better understanding of how it works we'll break up the parts;
#vectorized(words to real numbers) and put in dictionary  which is word_counts and vocab is unique words
#also so there is obviously gonna be words in test set that are not in training set so to make sure that that one word probability isn't 0 we are gonn do laplace smoothing which is really just adding 1
def laplace_smoothing(n_label_items, vocab, word_counts, word, text_label):
    a = word_counts[text_label][word] + 1
    b = n_label_items[text_label] + len(vocab)
    return math.log(a/b)
def group_by_label(x, y, labels):
    data = {}
    for l in labels:
        data[l] = x[np.where(y == l)]
    return data
def fit(x, y, labels):
    n_label_items = {}
    log_label_priors = {}
    n = len(x)
    grouped_data = group_by_label(x, y, labels)
    for l, data in grouped_data.items():
        n_label_items[l] = len(data)
        log_label_priors[l] = math.log(n_label_items[l] / n)
    return n_label_items, log_label_priors
def predict(n_label_items, vocab, word_counts, log_label_priors, labels, x):
    result = []
    for text in x:
        label_scores = {l: log_label_priors[l] for l in labels}
        words = set(w_tokenizer.tokenize(text))
        for word in words:
            if word not in vocab: continue
            for l in labels:
                log_w_given_l = laplace_smoothing(n_label_items, vocab, word_counts, word, l)
                label_scores[l] += log_w_given_l
        result.append(max(label_scores, key=label_scores.get))
    return result

In [8]:
labels = [0,1,2,3]
n_label_items, log_label_priors = fit(train_sentences,train_labels,labels)
pred = predict(n_label_items, vocab, word_counts, log_label_priors, labels, test_sentences)

print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred))
pred

Accuracy of prediction on test set :  0.7272727272727273


[1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1]