In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Load the tweet dataset from disk and preview its first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column that mirrors the row
# position -- presumably an index that was written out with the CSV; confirm
# before dropping it or reading with index_col=0.
df = pd.read_csv(filepath_or_buffer='data/sandy.csv')
df.head(5)

Unnamed: 0,tweet id,tweet,label
0,262596552399396864,I've got enough candles to supply a Mexican fa...,0
1,263044104500420609,Sandy be soooo mad that she be shattering our ...,1
2,263309629973491712,@ibexgirl thankfully Hurricane Waugh played it...,0
3,263422851133079552,@taos you never got that magnificent case of B...,0
4,262404311223504896,"I'm at Mad River Bar &amp; Grille (New York, N...",0


In [3]:
# Features are the raw tweet text; the target is the 0/1 label column.
X, y = df['tweet'], df['label']

# Hold out a test partition using scikit-learn's default split size; the fixed
# seed keeps the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=50,
)

In [4]:
# English stop-word list from NLTK. The corpus is not bundled with the nltk
# package, so on a fresh environment the lookup raises LookupError; download
# the corpus once in that case and retry so the notebook survives
# Restart & Run All.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

## Baseline Model

In [7]:
# Class proportions of the held-out labels. Always predicting the most frequent
# class would score the larger of these two fractions, so that value is the
# accuracy any fitted model has to beat.
y_test.value_counts(normalize=True)

1    0.597116
0    0.402884
Name: label, dtype: float64

## Logistic Regression

### With CountVectorizer

In [8]:
# Bag-of-words token counts feeding a liblinear logistic regression.
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(solver='liblinear')),
])

# Vectorizer settings explored by the grid search; each key is addressed as
# '<step name>__<parameter>'.
pipe_params = dict(
    cvec__max_features=[None, 500, 1000, 5000],
    cvec__min_df=[2, 5],
    cvec__max_df=[0.1, 0.9, 0.95],
    cvec__ngram_range=[(1, 1), (1, 2)],
    cvec__stop_words=['english'],
)

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split.
gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=5)
gs.fit(X_train, y_train)

# The winning pipeline, refit by GridSearchCV on the full training split.
gs_model = gs.best_estimator_

# Printed one per line: training accuracy, test accuracy, and the mean 5-fold
# cross-validated accuracy of the selected pipeline on the training split.
print(
    gs_model.score(X_train, y_train),
    gs_model.score(X_test, y_test),
    cross_val_score(gs_model, X_train, y_train, cv=5).mean(),
    sep='\n',
)

0.9389226636505019
0.9198473282442748
0.9261950778578744


### With TfidfVectorizer

In [10]:
# TF-IDF weighted token features feeding a liblinear logistic regression.
tfidf_step = ('tvec', TfidfVectorizer())
logreg_step = ('lr', LogisticRegression(solver='liblinear'))
pipe = Pipeline([tfidf_step, logreg_step])

# Vectorizer settings explored by the grid search; each key is addressed as
# '<step name>__<parameter>'.
pipe_params = {
    'tvec__max_features': [None, 500, 1000, 5000],
    'tvec__min_df': [2, 5],
    'tvec__max_df': [0.1, 0.9, 0.95],
    'tvec__ngram_range': [(1, 1), (1, 2)],
    'tvec__stop_words': ['english'],
}

# Exhaustive search over the grid, scored with 5-fold cross-validation on the
# training split.
gs = GridSearchCV(pipe, pipe_params, cv=5)
gs.fit(X_train, y_train)

# Keep the best pipeline (refit by GridSearchCV on the full training split).
gs_model = gs.best_estimator_

# Training accuracy, then test accuracy.
train_accuracy = gs_model.score(X_train, y_train)
print(train_accuracy)
test_accuracy = gs_model.score(X_test, y_test)
print(test_accuracy)

# Mean 5-fold cross-validated accuracy of the selected pipeline on the
# training split.
cv_accuracies = cross_val_score(gs_model, X_train, y_train, cv=5)
print(cv_accuracies.mean())

0.9310052311607522
0.9147582697201018
0.9209645947910872


## Random Forest Classifier

In [15]:
# Bag-of-words token counts feeding a random forest.
# A random forest draws bootstrap samples and candidate features at random, so
# without a seed every fit grows different trees: the grid-search winner and
# all three scores printed below would change from run to run. Fixing
# random_state makes the search and the reported numbers reproducible.
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier(bootstrap=True, random_state=50))
])

# Vectorizer settings explored by the grid search; each key is addressed as
# '<step name>__<parameter>'. Only the vectorizer is tuned here; the forest
# keeps its default hyperparameters.
pipe_params = {
    'cvec__max_features': [None, 500, 1000, 5000],
    'cvec__min_df': [2, 5],
    'cvec__max_df': [0.1, .9, .95],
    'cvec__ngram_range': [(1, 1), (1, 2)],
    'cvec__stop_words': ['english']
}

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split.
gs = GridSearchCV(pipe,
                  param_grid=pipe_params,
                  cv=5)

gs.fit(X_train, y_train)

# The winning pipeline, refit by GridSearchCV on the full training split.
gs_model = gs.best_estimator_

# Training accuracy, test accuracy, and the mean 5-fold cross-validated
# accuracy of the selected pipeline on the training split.
print(gs_model.score(X_train, y_train))
print(gs_model.score(X_test, y_test))
print(cross_val_score(gs_model, X_train, y_train, cv=5).mean())

0.9929308638484378
0.9105173876166243
0.9171493391221753
