In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the pre-cleaned dataset and separate the text column from its labels.
df = pd.read_csv('data/clean_data.csv')
X, y = df['text'], df['target_col']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [3]:
# Unigram TF-IDF features: NLTK English stop words removed, accents folded to
# ASCII, vocabulary size left uncapped (max_features=None).
tf = TfidfVectorizer(stop_words=stopwords.words("english"),
                     max_features=None,
                     ngram_range=(1,1),
                     strip_accents='ascii' 
                    )

# Fit the vocabulary/IDF weights on the training split only, then reuse the
# fitted vectorizer for the test split so nothing from the test set leaks in.
# .toarray() returns a plain ndarray; the previous .todense() returned an
# np.matrix, which recent scikit-learn releases reject as estimator input.
# The dense form is kept because GaussianNB below does not accept sparse input.
X_train_tf = tf.fit_transform(X_train).toarray()
X_test_tf = tf.transform(X_test).toarray()

In [4]:
# Multinomial naive Bayes on the TF-IDF features (fit() returns the estimator).
mn = MultinomialNB().fit(X_train_tf, y_train)

# Cell output: (train score, test score).
mn.score(X_train_tf, y_train), mn.score(X_test_tf, y_test)

(0.780375, 0.7065)

In [5]:
# Gaussian naive Bayes with a variance-smoothing term of 0.1.
gnb = GaussianNB(var_smoothing=1e-1)
gnb.fit(X_train_tf, y_train)

# Cell output: (train score, test score), evaluated in that order.
tuple(
    gnb.score(features, labels)
    for features, labels in ((X_train_tf, y_train), (X_test_tf, y_test))
)

(0.839875, 0.6995)

In [6]:
# Single decision tree with library-default hyperparameters and a fixed seed.
dt = DecisionTreeClassifier(random_state=42).fit(X_train_tf, y_train)

# Cell output: (train score, test score).
dt.score(X_train_tf, y_train), dt.score(X_test_tf, y_test)

(0.992, 0.717)

In [7]:
# Random forest with a fixed seed and otherwise default settings.
# NOTE(review): n_estimators is left at the library default, which differs
# between scikit-learn releases (10 before 0.22, 100 from 0.22 on), so the
# scores shown depend on the installed version -- consider pinning it.
rf = RandomForestClassifier(random_state=42).fit(X_train_tf, y_train)

# Cell output: (train score, test score).
rf.score(X_train_tf, y_train), rf.score(X_test_tf, y_test)



(0.983, 0.726)

In [None]:
# AdaBoost over decision trees.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4); the first positional argument works on both old and new releases.
# random_state is also set on the ensemble itself: seeding only the inner tree
# does not make the boosting run reproducible.
# NOTE(review): the base tree is unpruned (default max_depth), unlike the
# depth-1 stump AdaBoost uses by default -- confirm this is intended.
ada = AdaBoostClassifier(DecisionTreeClassifier(random_state=42),
                         random_state=42)

ada.fit(X_train_tf, y_train)
# Cell output: (train score, test score).
(ada.score(X_train_tf, y_train), ada.score(X_test_tf, y_test))

In [None]:
# Gradient-boosted trees with default hyperparameters.
# random_state is fixed so repeated runs give the same model, matching the
# seeding used for the other tree-based models in this notebook.
grad = GradientBoostingClassifier(random_state=42)

grad.fit(X_train_tf, y_train)
# Cell output: (train score, test score).
(grad.score(X_train_tf, y_train), grad.score(X_test_tf, y_test))

In [None]:
# Bagging ensemble of 1001 estimators (base estimator left at the
# BaggingClassifier default), seeded, with n_jobs=-1 to use all processors.
bag = BaggingClassifier(
    n_estimators=1001, random_state=42, n_jobs=-1
).fit(X_train_tf, y_train)

# Cell output: (train score, test score).
bag.score(X_train_tf, y_train), bag.score(X_test_tf, y_test)