# Classification with softmax and SVM

In [1]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import sklearn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

## Check the data

In [2]:
# Load the labelled sentences; the cells below read its 'label' and 'sentence' columns.
df = pd.read_csv('dataset.csv')

In [3]:
# Preview only the rows whose label is "building".
df.loc[df['label'] == "building"]

Unnamed: 0,label,sentence
0,building,The earthquake led to the collapse of 72 build...
1,building,Collapsed reinforced concrete buildings were l...
2,building,They show photos of three collapsed buildings.
3,building,"Specifically, it shows the photo of a 6-story ..."
4,building,There is no particular plan or elevation irreg...
...,...,...
62,building,"In that regard, research should continue to de..."
63,building,A 10-story school building in Manila (Emilio A...
64,building,Soil liquefaction underneath the building was ...
65,building,This tilting demonstrates the effect of far fi...


## Clean, Tokenize & Lemmatize

In [4]:
def clean(txt_lst):
    """Normalise and tokenise a collection of raw sentences.

    Each sentence is lower-cased, stripped of URLs, a few HTML remnants and
    punctuation, optionally filtered for English stop words, and finally
    split into tokens with NLTK's ``WordPunctTokenizer``.

    Parameters
    ----------
    txt_lst : iterable of str
        Raw sentences (for example a pandas Series of strings).

    Returns
    -------
    list of list of str
        One token list per input sentence, in input order.
    """
    # Build these once per call instead of once per sentence.
    stops = set(stopwords.words("english"))
    tokenizer = nltk.WordPunctTokenizer()

    def clean_text(text, remove_stopwords=True):
        """Clean a single sentence and return its tokens."""
        text = text.lower()
        # Removes everything from the URL to the end of the line (greedy `.*`).
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        text = re.sub(r'\<a href', ' ', text)
        # Must run before the punctuation pass: that pass replaces '/', after
        # which the literal '<br />' can no longer be matched.
        text = re.sub(r'<br />', ' ', text)
        # Likewise handled before '&' and ';' are stripped individually.
        text = re.sub(r'&amp;', '', text)
        text = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'\'', ' ', text)
        if remove_stopwords:
            text = " ".join(w for w in text.split() if w not in stops)
        return tokenizer.tokenize(text)

    return list(map(clean_text, txt_lst))

def lemmatize(txt_lst):
    """Lemmatize every token of every tokenised sentence.

    Parameters
    ----------
    txt_lst : iterable of iterable of str
        Token sequences, one per sentence.

    Returns
    -------
    list of list of str
        Same nesting as the input, with each token passed through
        ``WordNetLemmatizer.lemmatize`` using its default arguments.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [[lemmatizer.lemmatize(token) for token in tokens]
            for tokens in txt_lst]

# Tokenise the raw sentences first, then lemmatise the resulting token lists.
# Both intermediate results are stored as new columns on the shared frame.
df['cleaned'] = clean(df['sentence'])
df['lemmatized'] = lemmatize(df['cleaned'])

## Split into training/validation sets and vectorize (bag of words)

In [5]:
# Hold out 35% of the rows for validation; the fixed seed keeps the split
# reproducible. `train_test_split` is imported explicitly rather than reached
# through the `sklearn.model_selection` attribute, which only exists if some
# other import happens to load that submodule.
train_data, validate_data = train_test_split(df, train_size=0.65, random_state=42)

In [6]:
# Targets for both partitions.
y_train = train_data["label"]
y_validate = validate_data["label"]

# The documents are already token lists, so the tokenizer simply hands them
# back and lower-casing is switched off.
bow_converter = CountVectorizer(
    lowercase=False,
    tokenizer=lambda tokens: tokens,
)

# The vocabulary is learned from the training rows only; validation rows are
# projected onto that same vocabulary.
x_train = bow_converter.fit_transform(train_data['lemmatized'])
x_validate = bow_converter.transform(validate_data['lemmatized'])

## Softmax Regression for Classification

In [24]:
# Fit the logistic-regression classifier (C is the inverse regularisation
# strength) on the bag-of-words features.
model_lr = LogisticRegression(C=1)
model_lr.fit(x_train, y_train)

# Mean accuracy on the training rows and on the held-out validation rows.
train_score = model_lr.score(x_train, y_train)
test_score = model_lr.score(x_validate, y_validate)
print('Train Score: ', train_score)
print('Test Score: ', test_score)

# Best-score tracker for the disabled elastic-net search further down.
max_test_score = 0


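# w/ Elastic Penalty (disabled experiment: search l1_ratio for the best validation score)
# NOTE(review): the outer loop variable C is never passed to the model (C=2 is
# hard-coded) and the grid starts at C=0 -- confirm the intended grid before re-enabling.

# for C in np.linspace(0, 3, 20):
#     for r in np.linspace(0, 1, 20):
#         model = LogisticRegression(penalty='elasticnet', 
#                                    C=2, solver='saga', l1_ratio=r).fit(x_train, y_train)
#         test_score = model.score(x_validate, y_validate)
#         if test_score > max_test_score:
#             max_test_score = test_score

# print('Train Score: ', train_score)
# print('Test Score: ', max_test_score)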

Train Score:  1.0
Test Score:  0.717391304347826


### Confusion Matrix (Softmax)

In [8]:
# Rows are the true labels, columns the labels predicted by logistic regression.
y_pred = model_lr.predict(x_validate)
confusion_matrix(y_true=y_validate, y_pred=y_pred)

array([[22,  0,  4],
       [ 4,  3,  3],
       [ 2,  0,  8]])

## SVM

In [9]:
# Linear SVM on the same bag-of-words features. The solver's random number
# generator is seeded (same seed as the train/validation split) so that the
# reported scores are reproducible from a fresh kernel.
model_svm = svm.LinearSVC(random_state=42)
model_svm.fit(x_train, y_train)

# Mean accuracy on the training rows and on the held-out validation rows.
train_score = model_svm.score(x_train, y_train)
test_score = model_svm.score(x_validate, y_validate)
print('Train Score: ', train_score)
print('Test Score: ', test_score)

# Reset of the best-score tracker; nothing in this cell reads it.
max_test_score = 0

Train Score:  1.0
Test Score:  0.7391304347826086


### Confusion Matrix (SVM)

In [11]:
# Rows are the true labels, columns the labels predicted by the linear SVM.
y_pred = model_svm.predict(x_validate)
confusion_matrix(y_true=y_validate, y_pred=y_pred)

array([[21,  0,  5],
       [ 2,  5,  3],
       [ 2,  0,  8]])

In [13]:
# Apply the trained models to the unlabeled Albania sentences.
df_albania = pd.read_csv('Albania.csv')

# Same preprocessing as the training data (note the capitalised column name).
df_albania['cleaned'] = clean(df_albania['Sentence'])
df_albania['lemmatized'] = lemmatize(df_albania['cleaned'])

# transform() only: the vocabulary fitted on the training rows is reused.
x_test = bow_converter.transform(df_albania['lemmatized'])

# One prediction column per model.
df_albania["SVM"] = model_svm.predict(x_test)
df_albania["Logistic Regression"] = model_lr.predict(x_test)

In [16]:
# Keyword queries for the rule-based baseline, one list per category.
# A plain string is a single keyword; a nested list is a combined query that
# counts once, and only when all of its words are present.
queries_building = ["building", "house", "apartment", "hotel",
                    "school", "damage",
                    ["hospital", "damage"],
                    ["school", "damage"],   # previously listed twice, which double-counted it
                    ["hospital", "collapse"],
                    ["school", "failure"],
                    ["school", "collapse"]]

queries_infra = ["bridge", "highway", "road", "dam", "refinery", "airport",
                 "power plant", "rail", "tunnel", "port", "substation",
                 "subway",
                 ["transmission", "tower"],
                 ["cell", "tower"],
                 ["pipeline", "damage"],
                 ["pipeline", "failure"],
                 ["pipeline", "collapse"]]

queries_resil = ["economic", "economy", "population", "casualty",
                 "injury", "electricity", "water", "telecommunication",
                 "phone", "power outage", "transportation", "service",
                 "services", "internet", "displaced", "homeless",
                 "building",   # was misspelled "builidng" and could never match
                 "school", "gas"]

def query_count(sent, queries, whole_word=False):
    """Count how many keyword queries are satisfied by a sentence.

    Matching is case-insensitive, so a capitalised word such as "Schools"
    at the start of a sentence still matches the query "school".

    Parameters
    ----------
    sent : str or iterable of str
        Raw sentence text, or an already tokenised sentence. For text the
        search is a substring search; for tokens it is exact membership.
    queries : iterable
        Each query is either a single string or a list of strings. A list
        counts once, and only when all of its words are found.
    whole_word : bool, default False
        Only used when ``sent`` is text. When True a query must match on
        word boundaries, so "dam" no longer matches "damage" and "port" no
        longer matches "report". The default keeps substring matching, which
        also lets "building" match "buildings".

    Returns
    -------
    int
        Number of satisfied queries.
    """
    if isinstance(sent, str):
        text = sent.lower()
        if whole_word:
            def found(term):
                pattern = r"\b" + re.escape(term) + r"\b"
                return re.search(pattern, text) is not None
        else:
            def found(term):
                return term in text
    else:
        tokens = {str(token).lower() for token in sent}

        def found(term):
            return term in tokens

    count = 0
    for query in queries:
        terms = query if isinstance(query, list) else [query]
        if all(found(term.lower()) for term in terms):
            count += 1
    return count

def model_query(sentences):
    """Label each sentence by whichever keyword category matches most.

    Parameters
    ----------
    sentences : iterable
        Sentences to label; each one is passed to ``query_count`` as-is.

    Returns
    -------
    list of str
        One label per sentence: "building", "infrastructure" or
        "resilience", or "other" when no query of any category matches.
        On a tie the category listed first in that order wins.
    """
    categories = (
        ("building", queries_building),
        ("infrastructure", queries_infra),
        ("resilience", queries_resil),
    )
    predictions = []
    for sentence in sentences:
        scores = [query_count(sentence, queries) for _, queries in categories]
        best = int(np.argmax(scores))  # first maximum, so ties favour earlier categories
        predictions.append(categories[best][0] if scores[best] > 0 else "other")
    return predictions

# Rule-based baseline: label the raw (uncleaned) sentences by keyword counts.
df_albania["Keyword Search"] = model_query(df_albania['Sentence'])
    

In [29]:
# Remove the intermediate token columns before export. errors='ignore' keeps
# the cell re-runnable: without it a second execution raises KeyError because
# the columns are already gone.
df_albania = df_albania.drop(columns=['cleaned', 'lemmatized'], errors='ignore')

In [31]:
# Write the Albania sentences with their prediction columns to disk.
# index=False is not passed, so the row index is written as the first column.
df_albania.to_csv("dataset_albania.csv")