## Preliminary Set Up

In [29]:
# import general packages
import pandas as pd
import numpy as np

In [30]:
# load in data
training = pd.read_csv('../data/train.csv')
testing = pd.read_csv('../data/test.csv')

## Descriptive Statistics / Data Exploration

In [31]:
# training data size
training.count()

id          7613
keyword     7552
location    5080
text        7613
target      7613
dtype: int64

In [32]:
# testing data size
testing.count()

id          3263
keyword     3237
location    2158
text        3263
dtype: int64

In [33]:
# count number of observations in each class
training['target'].value_counts() # 1 = disaster, 0 = not

0    4342
1    3271
Name: target, dtype: int64

## Basic NLP

In [34]:
# split into initial training & testing (Kaggle test data is unlabeled)
from sklearn.model_selection import train_test_split
training_i, testing_i = train_test_split(training, test_size=0.2)

### Naive Bayes

In [35]:
# imported needed functions for Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [36]:
# learn basic text classifier using bag of words & naive bayes
textClassNB = Pipeline([('wordVector', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('classify', MultinomialNB()),
                    ])
textClassNB = textClassNB.fit(training_i['text'], training_i['target'])

In [37]:
# test performance of naive bayes classifier
predicted = textClassNB.predict(testing_i['text'])
np.mean(predicted == testing_i['target'])

0.7951411687458962

### SVM

In [38]:
# learn basic text classifier using bag of words & SVM
from sklearn.linear_model import SGDClassifier
textClassSVM = Pipeline([('wordVector',CountVectorizer(stop_words='english')),
                        ('tfidf', TfidfTransformer()),
                        ('classifySVM', 
                         SGDClassifier(loss='hinge',
                                       penalty='l2',
                                       max_iter=15,
                                       tol=1e-3,
                                      alpha=1e-3))])
textClassSVM = textClassSVM.fit(training_i['text'], training_i['target'])

In [39]:
# test performance of SVM
predictedSVM = textClassSVM.predict(testing_i['text'])
np.mean(predictedSVM == testing_i['target'])

0.757715036112935