# Loading the Modules

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Loading the Data

In [2]:
active = pd.read_csv("active19-clean.csv", 
                     usecols=['Text','Date','Name','Location','search term','clean_text'])
active20 = pd.read_csv("active20-clean.csv", 
                     usecols=['Text','Date','Name','Location','search term','clean_text'])
active = active.append(active20)
active['type'] = 'active'
active.head()

Unnamed: 0,Text,Date,Name,Location,search term,clean_text,type
0,New #knife post on our forum! » New Opinel #6 ...,2019-03-24,mtjsblog,,outdoor,new knife post forum new opinel httpstcoprbmwh...,active
1,New #shtf discussion on our forum! » https://t...,2019-03-18,mtjsblog,,outdoor,new shtf discussion forum httpstcowzretpdwb,active
2,Good time this weekend with our @twistedtea @c...,2019-04-01,tydillon,"Welcome, North Carolina",outdoor,good time weekend twistedtea chevy texas call_...,active
3,Proud of GEICO’s dedication to NASCAR! https:/...,2019-03-25,tydillon,"Welcome, North Carolina",outdoor,proud geico dedication nascar httpstcouqxaszkrv,active
4,Summer is in the air! Call us to get started o...,2019-04-10,JSQimprovements,"Huntington Beach, CA",outdoor,summer air call us get started custom swimming...,active


In [3]:
lazy = pd.read_csv("lazy19-clean.csv", 
                     usecols=['Text','Date','Name','Location','search term','clean_text'])
lazy20 = pd.read_csv("lazy20-clean.csv", 
                     usecols=['Text','Date','Name','Location','search term','clean_text'])
lazy = lazy.append(lazy20)
lazy['type']='lazy'

In [4]:
df = active.append(lazy)

In [5]:
df.head()

Unnamed: 0,Text,Date,Name,Location,search term,clean_text,type
0,New #knife post on our forum! » New Opinel #6 ...,2019-03-24,mtjsblog,,outdoor,new knife post forum new opinel httpstcoprbmwh...,active
1,New #shtf discussion on our forum! » https://t...,2019-03-18,mtjsblog,,outdoor,new shtf discussion forum httpstcowzretpdwb,active
2,Good time this weekend with our @twistedtea @c...,2019-04-01,tydillon,"Welcome, North Carolina",outdoor,good time weekend twistedtea chevy texas call_...,active
3,Proud of GEICO’s dedication to NASCAR! https:/...,2019-03-25,tydillon,"Welcome, North Carolina",outdoor,proud geico dedication nascar httpstcouqxaszkrv,active
4,Summer is in the air! Call us to get started o...,2019-04-10,JSQimprovements,"Huntington Beach, CA",outdoor,summer air call us get started custom swimming...,active


# Naive Bayes Model

We decided to build a prediction of whether the user is active or lazy based on the words used for active and lazy users. We will use a Naive Bayes Multinomial Model to predict the type of the user. We want to predict a category with labelled data and over 100K samples of text data, so it is appropriate to use Naive Bayes as a model.

In [8]:
# Store tweet dataset into feature matrix and response vector
X_words = df['clean_text']
y_words = df['type']

# Instantiate CountVectorizer and TfidfVectorizer
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2)) 
tfidf_vect = TfidfVectorizer(min_df=1, ngram_range=(1, 2))


# Apply CountVectorizer 
X_count = count_vect.fit_transform(df['clean_text'].apply(str))
X_count = X_count.tocsc() 

# Apply TfidfVectorizer
X_tfidf = tfidf_vect.fit_transform(df['clean_text'].apply(str))
X_tfidf = X_tfidf.tocsc()


# Split train/test data for all data
Xtrain_count, Xtest_count, ytrain_count, ytest_count = train_test_split(X_count, y_words, random_state=17)
Xtrain_tfidf, Xtest_tfidf, ytrain_tfidf, ytest_tfidf = train_test_split(X_tfidf, y_words, random_state=17)

In [13]:
def evaluate_model(xtest, ytest, clf):
    """ 
    This function evaluates the effectiveness of a ML model and outputs F1 Scores, AUC score and Confusion Matrix
    """
    # Make predictions for Xtest
    y_pred = clf.predict(xtest)
    
    # Confusion matrix
    cm = metrics.confusion_matrix(ytest, y_pred)
    
    print(classification_report(ytest, y_pred))
    print('\nConfusion Matrix:\n', cm)

In [14]:
# Instantiate multinomialNB()
nb_words_count = MultinomialNB(alpha=1, fit_prior=True)
nb_words_tfidf = MultinomialNB(alpha=1, fit_prior=True)

# Train model
nb_words_count.fit(Xtrain_count, ytrain_count)
nb_words_tfidf.fit(Xtrain_tfidf, ytrain_tfidf)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [15]:
evaluate_model(Xtest_count, ytest_count, nb_words_count)

              precision    recall  f1-score   support

      active       0.83      0.71      0.77     15523
        lazy       0.70      0.82      0.76     12657

    accuracy                           0.76     28180
   macro avg       0.76      0.77      0.76     28180
weighted avg       0.77      0.76      0.76     28180


Confusion Matrix:
 [[11080  4443]
 [ 2285 10372]]


The F1-score is 77%. The number of False Positives (2,285) is low compared to True Positives (11,080). True Negatives number (10,372) is also much higher than False Negatives (4,443).

In [17]:
evaluate_model(Xtest_tfidf, ytest_tfidf, nb_words_tfidf)

              precision    recall  f1-score   support

      active       0.71      0.90      0.79     15523
        lazy       0.81      0.54      0.65     12657

    accuracy                           0.74     28180
   macro avg       0.76      0.72      0.72     28180
weighted avg       0.75      0.74      0.73     28180


Confusion Matrix:
 [[13911  1612]
 [ 5789  6868]]


The F-1 score is 77%. The proportion for False Positives to True Positives and False Negatives to True Negatives stay around the same.

Overall we can conclude that the model is good, but we will try to tune in the hyperparameter alpha to make the predictions even better.

In [21]:
from sklearn.model_selection import GridSearchCV
param_grid = {'alpha': (1,0.1,0.01,0.001,2,5,10,25)} 

multinom_cv = GridSearchCV(nb_words_count, param_grid, cv=5, scoring='roc_auc') 
multinom_cv.fit(Xtrain_count, ytrain_count) 
multinom_cv.best_params_

{'alpha': 1}

In [22]:
multinom_cv.best_score_

0.8421094523447299

In [24]:
from sklearn.model_selection import GridSearchCV
param_grid = {'alpha': (1,0.1,0.01,0.001,2,5,10,25)} 

tdidf_multinom_cv = GridSearchCV(nb_words_tfidf, param_grid, cv=5, scoring='roc_auc') 
tdidf_multinom_cv.fit(Xtrain_tfidf, ytrain_tfidf) 
tdidf_multinom_cv.best_params_

{'alpha': 0.1}

In [25]:
tdidf_multinom_cv.best_score_

0.8480343803693777

The ROC-AUC score has improved to 84% for both models.

## Feature Importances

Let's review top 20 most important words for predicting the active users on twitter by using sklearn feature_importances_

In [28]:
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

nb_words_count = 
nb_words_count.fit(Xtest_count, ytest_count) 
zipped = zip(df.clean_text,nb_words_count.feature_importances_)
res = sorted(zipped, key = lambda x: x[1], reverse=True)
_ = plt.figure(figsize=(20,10))
for i in res[0:20]:
    _ = plt.bar(i[0],i[1], color='blue')
    _ = plt.title('Top 20 Most Important Words to Predict Active Users on Twitter')
    _ = plt.xlabel('Features')
    _ = plt.xticks(rotation=45)
    _ = plt.ylabel('Feature Importance Score')

AttributeError: 'MultinomialNB' object has no attribute 'feature_importances_'

In [36]:
NB_optimal = MultinomialNB(alpha=1, fit_prior=True)
NBtdidf_optimal = MultinomialNB(alpha=0.1, fit_prior=True)

#neg_class_prob_sorted = NB_optimal.coef_[0, :]
#pos_class_prob_sorted = NB_optimal.coef_[1, :].argsort()

#print(np.take(count_vect.get_feature_names(), neg_class_prob_sorted[:10]))
#print(np.take(count_vect.get_feature_names(), pos_class_prob_sorted[:10]))

def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [37]:
show_most_informative_features(count_vect, NB_optimal)

AttributeError: 'MultinomialNB' object has no attribute 'classes_'