In [13]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

# Neural network (Keras) imports
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Naive Bayes models
import sklearn.naive_bayes as nb

# ensemble Decision Trees
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Support Vector Machines
from sklearn.svm import SVC

# Boosting methods
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

### Functions used for the notebook

In [4]:
# function to quickly call the accuracy of the model
def model_acc(model, train_X=None, train_y=None, test_X=None, test_y=None):
    """Print the training and testing scores of a fitted model as percentages.

    Parameters
    ----------
    model : object exposing ``score(X, y)``
        The value returned by ``score`` is multiplied by 100 and rounded to
        four decimal places before printing.
    train_X, train_y, test_X, test_y : optional
        Data to score against. Any argument left as ``None`` falls back to the
        notebook-level ``X_train`` / ``y_train`` / ``X_test`` / ``y_test``, so
        existing ``model_acc(model)`` calls keep working unchanged.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    # Resolve the globals lazily so that passing explicit data never touches
    # (or requires) the notebook-level variables.
    train_X = X_train if train_X is None else train_X
    train_y = y_train if train_y is None else train_y
    test_X = X_test if test_X is None else test_X
    test_y = y_test if test_y is None else test_y

    print(f'training score: {round(model.score(train_X, train_y) * 100, 4)}%')
    print(f'testing score: {round(model.score(test_X, test_y) * 100, 4)}%')
    
# turn the network's raw outputs into True/False predictions
def predictions(cnn, x):
    """Return an element-wise ``> 0.5`` comparison of the model's batch output.

    Parameters
    ----------
    cnn : object exposing ``predict_on_batch(x)``
        The model whose outputs are thresholded.
    x : input passed straight through to ``cnn.predict_on_batch``.

    Returns
    -------
    The result of comparing ``cnn.predict_on_batch(x)`` against 0.5
    (True where the output is strictly greater than 0.5).
    """
    threshold = 0.5
    raw_scores = cnn.predict_on_batch(x)
    return raw_scores > threshold

# Preprocessing
-----------------

In [7]:
# reading in the data
train = pd.read_csv("../Data/Train & Test/eq_tweets_train.csv")
test = pd.read_csv("../Data/Train & Test/eq_tweets_test.csv")

# target vectors used to fit and score the models
# NOTE(review): the train target is read from "label" but the test target from
# "relevant" -- confirm both columns encode the same thing.
y_train = train["label"]
y_test = test["relevant"]

# fitting the TF-IDF vectorizer on the training text only; the test text is
# transformed with that same fitted vocabulary
tvec = TfidfVectorizer(max_features=5000, 
                       ngram_range=(1,2), 
                       stop_words="english")

# X_train / X_test go straight from the raw text column to the dense feature
# matrix, so these names never hold raw text in between.
# .toarray() yields a plain ndarray; .todense() yields np.matrix, which recent
# scikit-learn releases refuse as estimator input.
X_train = tvec.fit_transform(train["tweet_text"]).toarray()
X_test = tvec.transform(test["tweet_text"]).toarray()

# Baseline Score
________________________

### 50% accuracy

# Modeling
___________________
### Neural Network



In [17]:
# instantiate the model: a fully connected network with two hidden ReLU layers
# of 1000 units each and a single sigmoid output unit
classifier = Sequential()

# The input width is taken from the feature matrix itself rather than
# hard-coded: the vectorizer's max_features is an upper bound, so the actual
# number of columns can be smaller than that setting.
classifier.add(Dense(units = 1000,
                     kernel_initializer = 'uniform', 
                     activation = 'relu',
                     input_dim=X_train.shape[1])
              )

classifier.add(Dense(units = 1000,
                     kernel_initializer = 'uniform', 
                     activation = 'relu'
                    )
              )

classifier.add(Dense(units = 1,
                     kernel_initializer = 'uniform', 
                     activation = 'sigmoid'
                    )
              )

classifier.compile(optimizer='adam', 
                   loss = 'binary_crossentropy', 
                   metrics = ['accuracy'])
# fit the model with train
classifier.fit(X_train, 
               y_train, 
               batch_size = 10, 
               epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x20241faa2e8>

In [18]:
# score the network on both splits using its thresholded predictions
y_hat_train = predictions(classifier, X_train)
y_hat_test = predictions(classifier, X_test)

train_accuracy = accuracy_score(y_train, y_hat_train)
print(f'training score: {round(train_accuracy * 100, 4)}%')

test_accuracy = accuracy_score(y_test, y_hat_test)
print(f'testing score: {round(test_accuracy * 100, 4)}%')

training score: 99.6175%
testing score: 97.7119%


### Logistic Regression
______________________________

In [19]:
# build the logistic regression with its default settings and fit it on the
# training data in one step (fit returns the estimator itself)
logr = LogisticRegression().fit(X_train, y_train)

# report training / testing scores
model_acc(logr)



training score: 96.4451%
testing score: 97.9661%


### Naive Bayes
________________

In [20]:
# Multinomial Naive Bayes
# build the classifier with its default settings and fit it on the training
# data in one step (fit returns the estimator itself)
mnnb = nb.MultinomialNB().fit(X_train, y_train)

# report training / testing scores
model_acc(mnnb)

training score: 95.0402%
testing score: 97.3729%


In [21]:
# Gaussian Naive Bayes
# build the classifier with its default settings and fit it on the training
# data in one step (fit returns the estimator itself)
gaus = nb.GaussianNB().fit(X_train, y_train)

# report training / testing scores
model_acc(gaus)

training score: 93.8596%
testing score: 93.8983%


### Random Forest
_________________

In [8]:
# instantiate the model
# random_state is pinned (same value as the boosting models in this notebook)
# so the forest -- and the scores printed for it -- are reproducible across
# re-runs.
rfclass = RandomForestClassifier(n_estimators=100, random_state=42)

# fit the model
rfclass.fit(X_train, y_train)

# report training / testing scores
model_acc(rfclass)

training score: 99.9275%
testing score: 98.5593%


### Extra Trees Classifier
________________

In [23]:
# instantiate the model
# random_state is pinned (same value as the boosting models in this notebook)
# so the fitted trees and the printed scores are reproducible across re-runs.
etclass = ExtraTreesClassifier(n_estimators=100, random_state=42)

# fit the model
etclass.fit(X_train, y_train)

# report training / testing scores
model_acc(etclass)

training score: 99.9275%
testing score: 97.8814%


### Support Vector Classifier
____________

In [None]:
# NOTE(review): this SVC experiment is commented out and has no recorded
# output; the reason is not stated here -- TODO confirm whether it should be
# re-enabled or deleted.
# # instantiate the model
# svc = SVC(kernel="rbf", gamma="scale")

# # fit the model
# svc.fit(X_train, y_train)

# model_acc(svc)

### AdaBoost Classifier
_____________________________

In [25]:
# build the AdaBoost classifier with a fixed seed and fit it on the training
# data in one step (fit returns the estimator itself)
ada_boost = AdaBoostClassifier(random_state=42).fit(X_train, y_train)

# report training / testing scores
model_acc(ada_boost)

training score: 95.2579%
testing score: 98.8136%


### Gradient Boosting Classifier
___________

In [26]:
# build the gradient boosting classifier with a fixed seed and fit it on the
# training data in one step (fit returns the estimator itself)
gradient_boost = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# report training / testing scores
model_acc(gradient_boost)

training score: 94.7039%
testing score: 98.8983%


# Saving model
__________________

In [9]:
import pickle

# persist the fitted random forest; the context manager guarantees the file
# is flushed and closed even if pickling raises
with open('rndm_forest.sav', 'wb') as model_file:
    pickle.dump(rfclass, model_file)

In [12]:
# testing the save and if it works
# NOTE: unpickling can execute arbitrary code -- only load files that this
# notebook (or another trusted source) produced.
with open('rndm_forest.sav', 'rb') as model_file:
    load = pickle.load(model_file)
model_acc(load)

training score: 99.9275%
testing score: 98.5593%


In [14]:
# persist the fitted TF-IDF vectorizer next to the model; the context manager
# guarantees the file is flushed and closed even if pickling raises
with open('tvec.sav', 'wb') as vectorizer_file:
    pickle.dump(tvec, vectorizer_file)