# Data exploration

#### This notebook inspects the data, then trains, evaluates, and saves a text-classification model

In [19]:
# Display the dataset dimensions as (rows, columns).
data.shape

(44898, 2)

In [20]:
# Iterating over `data` yields its column names; print one per line.
for column_name in data:
    print(column_name)

text
label


In [21]:
from sklearn import preprocessing
# Fit a label encoder on the raw `text` column.
# NOTE(review): neither `le` nor `test` is referenced by any later cell visible
# here, and the modelling pipeline vectorizes the text on its own — confirm
# whether this cell is still needed.
le = preprocessing.LabelEncoder()
le.fit(data.text)
test = le  # `fit` returns the encoder itself, so `test` is the same object as `le`

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=0.3,
    random_state=0,
)

#### Building the model

In [24]:
from sklearn.linear_model     import LogisticRegression
from sklearn.ensemble         import RandomForestClassifier
from sklearn.ensemble         import VotingClassifier
from sklearn.model_selection  import cross_val_score

In [25]:
# Base learners, both seeded so repeated runs give the same fitted models.
rdf_clf = RandomForestClassifier(random_state=0, n_estimators=50)
lgr_clf = LogisticRegression(random_state=0)

# Ensemble of the two base learners, combined with soft voting.
es_clf = VotingClassifier(
    voting="soft",
    estimators=[
        ("lgr", lgr_clf),
        ("rdf", rdf_clf),
    ],
)

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# End-to-end pipeline: raw text -> token counts -> TF-IDF weights -> voting ensemble.
pipe = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", es_clf),
    ]
)


In [27]:
# Fit every pipeline step on the training split; the fitted pipeline is displayed as the cell output.
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 VotingClassifier(estimators=[('lgr',
                                               LogisticRegression(random_state=0)),
                                              ('rdf',
                                               RandomForestClassifier(n_estimators=50,
                                                                      random_state=0))],
                                  voting='soft'))])

In [28]:
# Accuracy
from sklearn.metrics import accuracy_score

# Predict labels for the held-out split and report accuracy as a percentage
# rounded to two decimal places.
prediction = pipe.predict(X_test)
accuracy_pct = round(accuracy_score(y_test, prediction) * 100, 2)
print("Accuracy: {}%".format(accuracy_pct))

Accuracy: 99.3%


#### Saving the model as **ensModel.sav**

In [29]:
import _pickle as pickle

In [30]:
# Serialize the fitted pipeline (vectorizer, TF-IDF transformer and ensemble) to disk.
# The `with` block guarantees the file handle is closed even if `pickle.dump`
# raises part-way through, which the manual open()/close() pair did not.
with open('ensModel.sav', 'wb') as model_file:
    pickle.dump(pipe, model_file)