# Classification with softmax and SVM

In [13]:
import matplotlib.pyplot as plt
import sklearn
import nltk
import numpy as np
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

## Check the data

In [2]:
df = pd.read_csv('dataset.csv')

In [3]:
df.head()

Unnamed: 0,label,sentence
0,building,The earthquake led to the collapse of 72 build...
1,building,Collapsed reinforced concrete buildings were l...
2,building,They show photos of three collapsed buildings.
3,building,"Specifically, it shows the photo of a 6-story ..."
4,building,There is no particular plan or elevation irreg...


In [1]:
df[df['label'] == "building"]

NameError: name 'df' is not defined

## Clean & Lemmatize (~Tokenize)

In [5]:
def clean(txt_lst):   
    def clean_text(text, remove_stopwords = True):
        text = text.lower()
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        text = re.sub(r'\<a href', ' ', text)
        text = re.sub(r'&amp;', '', text) 
        text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'\'', ' ', text)
        if remove_stopwords:
            text = text.split()
            stops = set(stopwords.words("english"))
            text = [w for w in text if not w in stops]
            text = " ".join(text)
        return nltk.WordPunctTokenizer().tokenize(text)
    return list(map(clean_text, txt_lst))

def lemmatize(txt_lst):
    lemm = nltk.stem.WordNetLemmatizer()
    return list(map(lambda word: list(map(lemm.lemmatize, word)),
                    txt_lst))

df['cleaned'] = clean(df['sentence'])
df['lemmatized'] = lemmatize(df['cleaned'])  

In [6]:
df.head

<bound method NDFrame.head of           label                                           sentence  \
0      building  The earthquake led to the collapse of 72 build...   
1      building  Collapsed reinforced concrete buildings were l...   
2      building     They show photos of three collapsed buildings.   
3      building  Specifically, it shows the photo of a 6-story ...   
4      building  There is no particular plan or elevation irreg...   
..          ...                                                ...   
126  resilience  Regarding the consequences of the earthquake, ...   
127  resilience  Regarding economic losses, this USGS tool also...   
128  resilience  Still, this event is another reminder of the p...   
129  resilience  Similar to the Mw 6.5 Idaho earthquake, had th...   
130  resilience  A potential complication in that case would be...   

                                               cleaned  \
0    [earthquake, led, collapse, 72, buildings, inc...   
1    [collaps

## Vectorize BOW and Split for training/testing

In [8]:
train_data, test_data = sklearn.model_selection.train_test_split(df, train_size = 0.65, random_state=42)

In [11]:
bow_converter = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)

x_train = bow_converter.fit_transform(train_data['cleaned'])
x_test = bow_converter.transform(test_data['cleaned'])

y_train = train_data["label"]
y_test = test_data["label"]

## Softmax Regressen for Classification

In [50]:
model = LogisticRegression(C=1).fit(x_train, y_train)
train_score = model.score(x_train, y_train)
test_score = model.score(x_test, y_test)
print('Train Score: ', train_score)
print('Test Score: ', test_score)

Train Score:  1.0
Test Score:  0.7391304347826086


## Softmax Regression w/ Elastic Penalty

In [48]:
max_test_score = 0

for C in np.linspace(0, 3, 20):
    for r in np.linspace(0, 1, 20):
        model = LogisticRegression(penalty='elasticnet', 
                                   C=2, solver='saga', l1_ratio=r).fit(x_train, y_train)
        test_score = model.score(x_test, y_test)
        if test_score > max_test_score:
            max_test_score = test_score

print('Test Score: ', train_score)
print('Test Score: ', max_test_score)





















Test Score:  1.0
Test Score:  0.717391304347826


