### Importing Libraries and Packages 

In [1]:
import pandas as pd
import numpy as np 


from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

pd.set_option('display.max_colwidth', -1)


In [4]:
dataset = pd.read_csv('final_data_sets/cleaned_tweet_train_data.csv')
print(dataset.shape)
dataset.head()

(10008, 3)


Unnamed: 0.1,Unnamed: 0,Tweet Text,Informativeness
0,0,I've got enough candles to supply a Mexican family,off-topic
1,1,Sandy be soooo mad that she be shattering our doors and shiet #HurricaneSandy,on-topic
2,2,@ibexgirl thankfully Hurricane Waugh played it cool and waited this one out. Ready to go at any moment tho.,off-topic
3,3,@taos you never got that magnificent case of Burgundy I sent you to thank you for your tweets?,off-topic
4,4,"I'm at Mad River Bar &amp; Grille (New York, NY) http://t.co/VSiZrzKP",off-topic


In [5]:
dataset = dataset.drop(columns=['Unnamed: 0'])

In [6]:
dataset.head()

Unnamed: 0,Tweet Text,Informativeness
0,I've got enough candles to supply a Mexican family,off-topic
1,Sandy be soooo mad that she be shattering our doors and shiet #HurricaneSandy,on-topic
2,@ibexgirl thankfully Hurricane Waugh played it cool and waited this one out. Ready to go at any moment tho.,off-topic
3,@taos you never got that magnificent case of Burgundy I sent you to thank you for your tweets?,off-topic
4,"I'm at Mad River Bar &amp; Grille (New York, NY) http://t.co/VSiZrzKP",off-topic


### Cleaning the Data

In [7]:
dataset.columns = ['tweet_text', 'label']

In [8]:
dataset.head()

Unnamed: 0,tweet_text,label
0,I've got enough candles to supply a Mexican family,off-topic
1,Sandy be soooo mad that she be shattering our doors and shiet #HurricaneSandy,on-topic
2,@ibexgirl thankfully Hurricane Waugh played it cool and waited this one out. Ready to go at any moment tho.,off-topic
3,@taos you never got that magnificent case of Burgundy I sent you to thank you for your tweets?,off-topic
4,"I'm at Mad River Bar &amp; Grille (New York, NY) http://t.co/VSiZrzKP",off-topic


In [9]:
dataset['tweet_text'] = dataset['tweet_text'].str.replace('[^\w\s]','')
dataset['tweet_text'] = dataset['tweet_text'].str.replace('http\S+|www.\S+', '', case=False)
dataset['tweet_text'] = dataset['tweet_text'].str.replace('\\n', ' ', case=False)
dataset['tweet_text'] = dataset['tweet_text'].str.lower()

In [10]:
dataset.head()

Unnamed: 0,tweet_text,label
0,ive got enough candles to supply a mexican family,off-topic
1,sandy be soooo mad that she be shattering our doors and shiet hurricanesandy,on-topic
2,ibexgirl thankfully hurricane waugh played it cool and waited this one out ready to go at any moment tho,off-topic
3,taos you never got that magnificent case of burgundy i sent you to thank you for your tweets,off-topic
4,im at mad river bar amp grille new york ny,off-topic


In [11]:
dataset.tail()

Unnamed: 0,tweet_text,label
10003,sandy is a weak name for a hurricane,on-topic
10004,seaoftime so freaking excited d and i dont knowi have no plans because of the hurricane,on-topic
10005,rt czd123 i dont find these hurricane jokes funny itsnotajoke,on-topic
10006,best wishes to our friends in the northeast stay safe hurricane sandy,on-topic
10007,update 7threat of hurricane sandy grows as it targets us east coast reuters economic timesupdate 7threat of,on-topic


In [12]:
dataset['label'].value_counts()

on-topic     6138
off-topic    3870
Name: label, dtype: int64

#### Changing label to numeric for classification 

In [13]:
label = {'on-topic':1, 
         'off-topic':0}

In [14]:
dataset['label'] = dataset['label'].map(label)

In [16]:
dataset.head()

Unnamed: 0,tweet_text,label
0,ive got enough candles to supply a mexican family,0
1,sandy be soooo mad that she be shattering our doors and shiet hurricanesandy,1
2,ibexgirl thankfully hurricane waugh played it cool and waited this one out ready to go at any moment tho,0
3,taos you never got that magnificent case of burgundy i sent you to thank you for your tweets,0
4,im at mad river bar amp grille new york ny,0


In [17]:
dataset['label'].value_counts()

1    6138
0    3870
Name: label, dtype: int64

### Stemming the tweet_text

In [18]:
def stemming(tweets):
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = PorterStemmer()
    tweet_token = tokenizer.tokenize(tweets)
    return ' '.join(map(lambda x: stemmer.stem(x), tweet_token))

In [19]:
dataset['stemmed_tweet'] = dataset['tweet_text'].apply(stemming)

In [20]:
dataset.head()

Unnamed: 0,tweet_text,label,stemmed_tweet
0,ive got enough candles to supply a mexican family,0,ive got enough candl to suppli a mexican famili
1,sandy be soooo mad that she be shattering our doors and shiet hurricanesandy,1,sandi be soooo mad that she be shatter our door and shiet hurricanesandi
2,ibexgirl thankfully hurricane waugh played it cool and waited this one out ready to go at any moment tho,0,ibexgirl thank hurrican waugh play it cool and wait thi one out readi to go at ani moment tho
3,taos you never got that magnificent case of burgundy i sent you to thank you for your tweets,0,tao you never got that magnific case of burgundi i sent you to thank you for your tweet
4,im at mad river bar amp grille new york ny,0,im at mad river bar amp grill new york ny


We tested models with stemmed tweet and without, our model produced better results without using stemming function 


## Modeling

#### 1. Logistic Regression 
#### 2. SVM
#### 3. Random forest 
#### 4. Mutlinomal Naive Bayes

In [21]:
X = dataset['tweet_text']
y = dataset['label']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                   random_state = 42,
                                                   stratify = y)

### Baseline Score 

In [23]:
dataset['label'].value_counts(normalize = True)

1    0.613309
0    0.386691
Name: label, dtype: float64

We know our baseline score is about 61.33 %

### Logistic Regression with countvectorize 

In [25]:
pipe_lr_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
])

params = {
    'cvec__min_df':[2, 4, 6],                   
    'cvec__ngram_range':[(1,1),(1,2),(1,3)],
    'cvec__stop_words':[None, 'english'],
}

gs_lr_cvec = GridSearchCV(pipe_lr_cvec, 
                          params, 
                          cv=5,
                          n_jobs=2,
                          verbose=1)
                         

gs_lr_cvec.fit(X_train, y_train)

print('Best Params: ',gs_lr_cvec.best_params_)
print('Train Score: ', gs_lr_cvec.best_estimator_.score(X_train, y_train))
print('Test Score: ', gs_lr_cvec.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    8.8s
[Parallel(n_jobs=2)]: Done  90 out of  90 | elapsed:   16.5s finished


Best Params:  {'cvec__min_df': 6, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
Train Score:  0.9511057820410338
Test Score:  0.9276578737010391


### Logistic Regression with TfidVectorizer

In [26]:
pipe_lr_tf = Pipeline([
    ('tf', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

params_lr_tf = {
    'tf__stop_words':   [None, 'english'],
    'tf__max_features': [500, 1000, 2000], 
    'tf__ngram_range':  [(1, 1), (1, 2)], 
    'tf__min_df' :      [1,2],
    'lr__penalty':      ['l2', 'l1'],
    'lr__C':            [.5, .01, 1],
    'lr__random_state': [42] 
}
gs_lr_tf = GridSearchCV(pipe_lr_tf, 
                        param_grid=params_lr_tf,
                        cv=5,
                        n_jobs=2,
                        verbose=1)

gs_lr_tf.fit(X_train, y_train)


print('Best Params:', gs_lr_tf.best_params_)
print('Train Score:', gs_lr_tf.score(X_train, y_train))
print('Test Score:', gs_lr_tf.score(X_test, y_test))

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   24.6s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   53.5s
[Parallel(n_jobs=2)]: Done 720 out of 720 | elapsed:  1.4min finished


Best Params: {'lr__C': 1, 'lr__penalty': 'l1', 'lr__random_state': 42, 'tf__max_features': 500, 'tf__min_df': 1, 'tf__ngram_range': (1, 1), 'tf__stop_words': None}
Train Score: 0.929656274980016
Test Score: 0.9220623501199041


    Logistic Regression with CountVectorize : 
    Train score is 95.11 %
    Test score is 92.76 %
        
    Logistic Regression with TfidVectorizer : 
    Train score is 92.96 %
    Test score is 92.20 %
    


### SVM model With CountVectorizer 

In [27]:
pipe_svm_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('svm', svm.SVC())
])

params = {
    'cvec__min_df':[2,4,6],
    'cvec__stop_words':[None ,'english'],
}

gs_svm_cvec = GridSearchCV(pipe_svm_cvec,
                  params,
                  cv=5, 
                  verbose=2,
                  n_jobs=-1)

gs_svm_cvec.fit(X_train, y_train)

print('Best Params: ',gs_svm_cvec.best_params_)
print('Train Score: ', gs_svm_cvec.best_estimator_.score(X_train, y_train))
print('Test Score: ', gs_svm_cvec.best_estimator_.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   41.6s finished


Best Params:  {'cvec__min_df': 2, 'cvec__stop_words': 'english'}
Train Score:  0.8960831334932055
Test Score:  0.8968824940047961


### SVM Model with TfidVectorizer

In [28]:
pipe_svm_tf = Pipeline([
    ('tf', TfidfVectorizer()),
    ('svm', svm.SVC())
])

params_svm_tf = {
    'tf__stop_words':   [None, 'english'],
    'tf__max_features': [500, 1000, 2000,], 
    'tf__ngram_range':  [(1, 1), (1, 2), (1, 3)], 
}
gs_svm_tf = GridSearchCV(pipe_svm_tf, 
                        param_grid=params_svm_tf,
                        cv=5,
                        n_jobs=2,
                        verbose=1)

gs_svm_tf.fit(X_train, y_train)


print('Best Params: ', gs_svm_tf.best_params_)
print('Train Score:', gs_svm_tf.score(X_train, y_train))
print('Test Score:', gs_svm_tf.score(X_test, y_test))

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  1.3min
[Parallel(n_jobs=2)]: Done  90 out of  90 | elapsed:  2.6min finished


Best Params:  {'tf__max_features': 500, 'tf__ngram_range': (1, 1), 'tf__stop_words': None}
Train Score: 0.6133759658939515
Test Score: 0.6131095123900879


    SVM with CountVectorize : 
    Train score is 89.60 %
    Test score is 89.68 %

    SVM with TfidVectorizer : 
    Train score is 61.33 %
    Test score is 61.31 %
    


### Random Forest With Count Vectorizer

In [29]:
pipe_rf_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier() )
])

params_rf_cvec = {
    'cvec__min_df':[2,4],
    'cvec__stop_words':[None ,'english'],
    'cvec__ngram_range':[(1,2),(1,3)],
    'rf__n_estimators':[75, 200],
    'rf__max_depth':[25, 75],
    'rf__min_samples_split':[2,4]
}

gs_rf_cvec = GridSearchCV(pipe_rf_cvec,
                          params_rf_cvec,
                          cv=5,
                          verbose=2,
                          n_jobs=-1)

gs_rf_cvec.fit(X_train, y_train)


print('Best Params:' , gs_rf_cvec.best_params_)
print('Train Score:', gs_rf_cvec.score(X_train, y_train))
print('Test Score:', gs_rf_cvec.score(X_test, y_test))

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:  3.9min finished


Best Params: {'cvec__min_df': 4, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english', 'rf__max_depth': 75, 'rf__min_samples_split': 4, 'rf__n_estimators': 75}
Train Score: 0.9634958699706901
Test Score: 0.9212629896083133


### Random Forest with Tfidvectorizer

In [30]:
pipe_rf_tf = Pipeline([
    ('tf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

params_rf_tf = {
    'tf__stop_words':   [None, 'english'],
    'tf__max_features': [500, 1000, 2000,], 
    'tf__ngram_range':  [(1, 1), (1, 2), (1, 3)], 
}
gs_rf_tf = GridSearchCV(pipe_rf_tf, 
                        param_grid=params_svm_tf,
                        cv=5,
                        n_jobs=2,
                        verbose=1)

gs_rf_tf.fit(X_train, y_train)


print('Best Params:', gs_rf_tf.best_params_)
print('Train Score:', gs_rf_tf.score(X_train, y_train))
print('Test Score:', gs_rf_tf.score(X_test, y_test))

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   10.9s
[Parallel(n_jobs=2)]: Done  90 out of  90 | elapsed:   22.1s finished


Best Params: {'tf__max_features': 2000, 'tf__ngram_range': (1, 1), 'tf__stop_words': 'english'}
Train Score: 0.991207034372502
Test Score: 0.9172661870503597


    Random Forest with CountVectorize : 
    Train score is 96.34 %
    Test score is 92.12 %

    Random Forest with TfidVectorizer : 
    Train score is 99.12 %
    Test score is 91.72 %

### Mutlinomal Naive Bayes Model with CountVectorizer

In [31]:
pipe_nb_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

params = {
    'cvec__min_df':[1,2,4],
    'cvec__stop_words':[None, 'english'],
    'nb__alpha': [0.1,1,2]
}

gs_nb_cvec = GridSearchCV(pipe_nb_cvec, 
                  params, 
                  cv=5,
                  verbose=2,
                  n_jobs=-1)

gs_nb_cvec.fit(X_train, y_train)

print('Best Params: ', gs_nb_cvec.best_params_)
print('Score Train: ', gs_nb_cvec.score(X_train, y_train))
print('Score Test: ', gs_nb_cvec.score(X_test, y_test))

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    6.5s finished


Best Params:  {'cvec__min_df': 4, 'cvec__stop_words': None, 'nb__alpha': 2}
Score Train:  0.8990141220357047
Score Test:  0.8705035971223022


### Mutlinomal Naive Bayes Model with Tfidvectorizer 

In [32]:
pipe_nb_tf = Pipeline([
    ('tf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

params = {
    'tf__stop_words':   [None, 'english'],
    'tf__max_features': [1000, 2000],
    'tf__ngram_range':  [(1,1), (1, 2)],
    'nb__alpha': [0.1,1]
}

gs_nb_tf = GridSearchCV(pipe_nb_tf, 
                  params, 
                  cv=5,
                  verbose=2,
                  n_jobs=-1)

gs_nb_tf.fit(X_train, y_train)

print('Best Params:' , gs_nb_tf.best_params_)
print('Score Train: ', gs_nb_tf.score(X_train, y_train))
print('Score Test: ', gs_nb_tf.score(X_test, y_test))

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    8.3s finished


Best Params: {'nb__alpha': 0.1, 'tf__max_features': 2000, 'tf__ngram_range': (1, 2), 'tf__stop_words': None}
Score Train:  0.8878230748734346
Score Test:  0.8481215027977618


    Naive Bayes with CountVectorize : 
    Train score is 89.90 %
    Test score is 87.05 %

    Naive bayes with TfidVectorizer : 
    Train score is 88.78 %
    Test score is 84.88 %

Out of all the model, Logistic regression with Tfidvectorizer has the best score. We will make predictions using that model. 

    Logistic Regression with TfidVectorizer : 
    Train score is 92.96 %
    Test score is 92.20 %

In [33]:
dataset_test = pd.read_csv('final_data_sets/cleaned_tweet_test_data.csv')

In [34]:
dataset_test.head()

Unnamed: 0.1,Unnamed: 0,created_at,text,retweet_count,lang,place_name,coordinates_longitude,coordinates_latitude
0,0,Mon Oct 22 05:00:00 +0000 2012,I suppose she has an appropriate costume for every activity... #ilovemaggiesmith #downtonseasonthree,0,en,"West Long Branch, NJ",-74.037008,40.272289
1,1,Mon Oct 22 05:00:00 +0000 2012,@NOT_savinHOES Not r yu upp,0,en,"Bressler-Enhaut-Oberlin, PA",-76.831479,40.22417
2,2,Mon Oct 22 05:00:00 +0000 2012,Hit and Run is so sad..,0,en,"South Carolina, USA",-83.353955,32.04683
3,3,Mon Oct 22 05:00:00 +0000 2012,Who's up?,0,en,"Malden, MA",-71.089522,42.412466
4,4,Mon Oct 22 05:00:00 +0000 2012,@augustushazel idk I'm just ugly or annoying or something,0,en,"Erie, PA",-80.239991,42.018414


In [35]:
del dataset_test['Unnamed: 0']

In [36]:
dataset_test.head()

Unnamed: 0,created_at,text,retweet_count,lang,place_name,coordinates_longitude,coordinates_latitude
0,Mon Oct 22 05:00:00 +0000 2012,I suppose she has an appropriate costume for every activity... #ilovemaggiesmith #downtonseasonthree,0,en,"West Long Branch, NJ",-74.037008,40.272289
1,Mon Oct 22 05:00:00 +0000 2012,@NOT_savinHOES Not r yu upp,0,en,"Bressler-Enhaut-Oberlin, PA",-76.831479,40.22417
2,Mon Oct 22 05:00:00 +0000 2012,Hit and Run is so sad..,0,en,"South Carolina, USA",-83.353955,32.04683
3,Mon Oct 22 05:00:00 +0000 2012,Who's up?,0,en,"Malden, MA",-71.089522,42.412466
4,Mon Oct 22 05:00:00 +0000 2012,@augustushazel idk I'm just ugly or annoying or something,0,en,"Erie, PA",-80.239991,42.018414


In [37]:
test_text = dataset_test['text']

In [38]:
type(test_text)

pandas.core.series.Series

In [39]:
preds = gs_lr_tf.predict(test_text)

In [40]:
new_preds = pd.DataFrame({'Tweet': dataset_test['text'], 'Prediction' : preds , 
                          'longitude':dataset_test['coordinates_longitude'],
                         'latitude':dataset_test['coordinates_latitude']})

In [41]:
new_preds.head()

Unnamed: 0,Tweet,Prediction,longitude,latitude
0,I suppose she has an appropriate costume for every activity... #ilovemaggiesmith #downtonseasonthree,0,-74.037008,40.272289
1,@NOT_savinHOES Not r yu upp,0,-76.831479,40.22417
2,Hit and Run is so sad..,0,-83.353955,32.04683
3,Who's up?,0,-71.089522,42.412466
4,@augustushazel idk I'm just ugly or annoying or something,0,-80.239991,42.018414


In [42]:
new_preds.shape

(102254, 4)

In [43]:
new_preds['Prediction'].value_counts()

0    101727
1    527   
Name: Prediction, dtype: int64

Out of 102254 tweets that was collected from all over the united states during the time of hurrican sandy, our model predicted that 527 were disaster realted tweets. 


In [47]:
Final_preds = pd.DataFrame(new_preds[new_preds['Prediction'] == 1])

In [48]:
Final_preds.shape

(527, 4)

In [49]:
Final_preds.to_csv('final_pred.csv')

We took our final 527 prediction tweets csv file with its geo location and ploted on the map using tableau public. As you can see most the tweets are coming from states like New york, New jersery, virgnia, basically all the sates along the coast that were impacted the most. Since our model is not 100 percent accurate, we do have noise coming from different states. 

![](prediction_location.png)