In [32]:
# imports to run Baselines
# pip install nltk pandas numpy scikit-learn
# Python 3.9
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter



In [33]:
# Run once
#import nltk
#nltk.download('stopwords')

In [34]:
#Pre-Processing Code
def lower(text):
    return text.lower()

def rem_punc(text):
    return re.sub(r'[^\w\s]', ' ', text)

def rem_stopwords(list):
    stop_words = stopwords.words('english')
    return [word for word in list if word not in stop_words]

def lem(list):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in list]

def stem(list):
    ps = PorterStemmer()
    return [ps.stem(word) for word in list]

def preprocessing(text, stemy=False):
    text = lower(text)
    text = rem_punc(text)
    text = re.sub(r'\d+', '', text)
    list = text.split()
    list = lem(list)
    if stemy: list = stem(list)
    list = rem_stopwords(list)
    return " ".join(list)

In [35]:
# Logical Regression Baseline-Model
train_data = pd.read_csv("training_data.csv")
validation_data = pd.read_csv("validation_data.csv")

y_train = train_data['labels'].values
train_corpus = []
for i in range(0, len(train_data['labels'])):
    train_corpus.append(preprocessing(train_data['file_content'][i]))

y_test = validation_data['labels'].values
test_corpus = []
for i in range(0, len(validation_data['labels'])):
    test_corpus.append(preprocessing(validation_data['file_content'][i]))

cv = CountVectorizer()
x_train = cv.fit_transform(train_corpus).toarray()
x_test = cv.transform(test_corpus).toarray()

classifier = LogisticRegression(max_iter=500)
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)

''' If you want to see the failing cases
correct = 0
total = 0
for i in range(len(y_test)):
    if y_test[i] == y_pred[i]:
        correct += 1
        total += 1
        print("Passed ----> ", end="")
    else:
        total += 1
        print("Failed ----> ", end="")
    print("actual: " + y_test[i] + ", prediction: " + y_pred[i])
'''
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report BOW Logical Regression:")
print(report)

Accuracy: 0.5333

Classification Report BOW Logical Regression:
                 precision    recall  f1-score   support

Fire_Department       0.75      0.50      0.60         6
Law_Enforcement       0.79      0.55      0.65        42
  Non_Emergency       0.21      1.00      0.34         7
     Paramedics       0.88      0.35      0.50        20

       accuracy                           0.53        75
      macro avg       0.66      0.60      0.52        75
   weighted avg       0.76      0.53      0.58        75



In [None]:
# Majority-Baseline Model
most_common_label = max(set(y_train), key=list(y_train).count)
majority_prediction = np.array([most_common_label] * len(y_test))


majority_baseline = accuracy_score(y_test, majority_prediction)
print(f"Majority Baseline Accuracy: {majority_baseline:.4f}")
print("\nReport")
print(classification_report(y_test, majority_prediction))

Majority Baseline Accuracy: 0.5600

Report
                 precision    recall  f1-score   support

Fire_Department       0.00      0.00      0.00         6
Law_Enforcement       0.56      1.00      0.72        42
  Non_Emergency       0.00      0.00      0.00         7
     Paramedics       0.00      0.00      0.00        20

       accuracy                           0.56        75
      macro avg       0.14      0.25      0.18        75
   weighted avg       0.31      0.56      0.40        75



In [None]:
# submission.csv format
testing_data = pd.read_csv("testing_data.csv")
test_corpus = []
for i in range(0, len(testing_data['id'])):
    test_corpus.append(preprocessing(testing_data['file_content'][i]))
x_testing = cv.transform(test_corpus).toarray()


testing_predictions = classifier.predict(x_testing)
testing_data['labels'] = testing_predictions
testing_data.to_csv("submission.csv", index=False)

# This is the format of submission expected.