# Logistic Regression

In [1]:
import numpy as np
import os
import time
import pandas as pd
import re
import json
import keras

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix

Using TensorFlow backend.


In [2]:
# Build the corpus and sequences
labelToName = { 0 : 'Rejected', 1 : 'Allowed' }
namesInLabelOrder = ['Rejected', 'Allowed']
os.chdir('D:\\PhD\\Dataset')
df=pd.read_csv('CriminalBailApplication.csv')
X=df.text.tolist()
labels=df.loc[:,['label']]


def preprocess(temp):
    temp=re.sub("\s\s+"," ",temp)    #Replacing multiple spaces with one
    temp=temp.replace(u'\xa0', ' ').encode('utf-8')  #Replacing non-breaking space    
    temp=temp.decode().split(' ')    #Converting a string into list of words seperated by space charcter
    return temp

#print(X[1])
X = [preprocess(t) for t in X]  
#print("\n\n\n After Preprocess:\n\n",X[1])

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1).split(X, labels)
train_indices, test_indices = next(sss)

def seqlengths(l):
    if isinstance(l,list):
        yield len(l)
        for y in l:
            yield from seqlengths(y)

maxsequenceLength=max(seqlengths(X))
print("Max sequence length:",maxsequenceLength)

Max sequence length: 1503


In [3]:
# Encode the documents
kTokenizer = keras.preprocessing.text.Tokenizer() 
kTokenizer.fit_on_texts(X)
Xencoded = np.array([np.array(xi) for xi in kTokenizer.texts_to_sequences(X)])                

print("Shape Before Padding:",np.shape(Xencoded[22]))
Xencoded=keras.preprocessing.sequence.pad_sequences(Xencoded, maxlen=maxsequenceLength, truncating='post')
print("\nShape After Padding:",np.shape(Xencoded[22]))

labels = np.array(labels)

nWords=len(kTokenizer.word_index)
print("\n nWords:",nWords)

Shape Before Padding: (724,)

Shape After Padding: (1503,)

 nWords: 10203


In [4]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=1).fit(X)
Xencoded=vectorizer.transform(X)

from sklearn.linear_model import LogisticRegression
results = {}
model = LogisticRegression()
train_x = Xencoded[train_indices]
test_x = Xencoded[test_indices]
train_labels = labels[train_indices]
test_labels = labels[test_indices]
model.fit(train_x, train_labels)
predicted_labels = model.predict(test_x)
results['confusion_matrix'] = confusion_matrix(labels[test_indices], predicted_labels).tolist()
results['classification_report'] = classification_report(labels[test_indices], predicted_labels, digits=4, target_names=namesInLabelOrder, output_dict=True)
print ("\n\nConfusion Matrix:\n",confusion_matrix(labels[test_indices], predicted_labels))
print ("\n\nClassification Report:\n\n",classification_report(labels[test_indices], predicted_labels, digits=4, target_names=namesInLabelOrder))


os.chdir('D:\\PhD\\Dataset\\Results\\Paper2')
filename = 'TFIDF-LR-'+time.strftime("%d-%m-%Y-%H-%M-%S")+'.json'
f = open (filename,'w')
out = json.dumps(results, ensure_ascii=True, indent=4)
f.write(out)
f.close()



Confusion Matrix:
 [[61  0]
 [17  0]]


Classification Report:

               precision    recall  f1-score   support

    Rejected     0.7821    1.0000    0.8777        61
     Allowed     0.0000    0.0000    0.0000        17

    accuracy                         0.7821        78
   macro avg     0.3910    0.5000    0.4388        78
weighted avg     0.6116    0.7821    0.6864        78



  return f(**kwargs)
  _warn_prf(average, modifier, msg_start, len(result))
