## Disaster or not: Text Classification using TF-IDF and Logistic Regression

In [None]:
!wget https://github.com/ravi-ilango/odsc2020_nlp/blob/main/lab1/disaster_data.zip?raw=true -O disaster_data.zip

!unzip disaster_data.zip

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

### Load data

In [None]:
# Peek at the first five rows of the training file to see its columns.
pd.read_csv('./disaster_data/train.csv').head(n=5)

In [None]:
#
# Parse the training CSV once and pull out the two columns used downstream:
#   query_text - raw text of each example (the `text` column)
#   labels     - classification target of each example (the `target` column)
#
train_df = pd.read_csv('./disaster_data/train.csv')
query_text = train_df['text'].values
labels = train_df['target'].values

# Number of examples loaded.
query_text.shape

In [None]:
# Histogram of the target values, with one x tick per distinct class.
class_ids = np.unique(labels)

plt.hist(labels)
plt.title('target distribution')
plt.xlabel('target')
plt.ylabel('count')
plt.xticks(np.arange(class_ids.size));

### Train and Test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples for testing; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(
    query_text,
    labels,
    test_size=0.2,
    random_state=13,
)
query_train, query_test, y_train, y_test = split_parts

### Vectorize the text documents

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use both single words and two-word sequences as features.
ngram_range = (1, 2)

# English stop words are removed and the vocabulary is capped at 150 features.
vectorizer = TfidfVectorizer(
    max_features=150,
    stop_words='english',
    ngram_range=ngram_range,
)

# Fit on the training split only, then apply the fitted vectorizer to the
# test split.
train_tfidf = vectorizer.fit_transform(query_train)
test_tfidf = vectorizer.transform(query_test)

# Convert the results to dense arrays for the classifier cells below.
X_train = train_tfidf.toarray()
X_test = test_tfidf.toarray()

### Fit a classifier using the vectors

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier (default settings) on the TF-IDF
# training vectors; `fit` returns the estimator itself.
clf = LogisticRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell output.
clf

### Calculate Test Accuracy

In [None]:
# Predict on the held-out vectors and report the fraction of predictions that
# match the true labels.
y_pred = clf.predict(X_test)
is_correct = y_pred == y_test
test_accuracy = is_correct.mean()
print(f"Test Accuracy = {test_accuracy}")

### Check Model Prediction

In [None]:
def predict(model, query_txt: str, text_vectorizer=None) -> None:
    """Classify one piece of text and print the verdict.

    Args:
        model: Fitted classifier exposing ``predict(X)``.
        query_txt: The raw text to classify.
        text_vectorizer: Optional fitted vectorizer exposing ``transform``.
            When omitted, the notebook-level ``vectorizer`` is used, which
            matches the original behavior of this helper.

    Returns:
        None. Prints "Disaster" when the predicted label is 1, otherwise
        "Not a disaster".
    """
    # Only touch the global `vectorizer` when the caller did not supply one,
    # so the helper can also be used with an explicitly passed vectorizer.
    active_vectorizer = vectorizer if text_vectorizer is None else text_vectorizer

    # The vectorizer expects an iterable of documents, hence the one-item list.
    x = active_vectorizer.transform([query_txt]).toarray()
    pred = model.predict(x)

    if pred[0] == 1:
        print("Disaster")
    else:
        print("Not a disaster")

In [None]:
# Sample text that mentions a forest fire.
predict(model=clf, query_txt="Forest fire near La Ronge Sask. Canada")

In [None]:
# Sample text with no disaster-related wording.
predict(model=clf, query_txt="The weather is awesome")
