<img src="../images/GA-logo.png" style="float: left; margin: 20px; height: 55px">

# Project 3: Web APIs and NLP

**Primary Objectives:**

1. Scrape tweets from two Twitter accounts
2. Use NLP to train a classifier to predict the account a given tweet comes from (i.e. binary classification)


In [480]:
import ast

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from scipy.stats import uniform

import dill

%matplotlib inline

# TODO: remember to remove the model imports if this notebook ends up covering only preprocessing / feature engineering

----

## Data Import and Split

In [186]:
# read the cleaned tweets CSV into a DataFrame
# (the path is relative, so it resolves against the current working directory)
tweets = pd.read_csv('../datasets/tweets_cleaned.csv')

In [312]:
# select the predictor columns and the target, then hold out 20% of the rows for testing
feature_columns = [
    'cleanedContent', 'content_length', 'hashtags', 'sentiment_score',
    'num_links_TextLink', 'num_media_Video', 'num_media_Photo', 'num_media_Gif',
    'likeCount', 'quoteCount', 'part_of_convo',
]
X = tweets[feature_columns]
account = tweets['account']
# encode the target as integers: 'pap' -> 1, every other value (i.e. wp) -> 0
y = pd.Series([int(label == 'pap') for label in account], name=account.name)
# stratify on y so both splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

----
## Feature Engineering

Taking into account the results from the EDA, we will engineer the features as follows:

| Feature | Input Variables | Pre-Processing | Output Variables |
|--------|--------|--------|--------|
| Messaging | `cleanedContent` | Vectorise with TFIDF | `content_vec` |
|  | `content_length` | No further processing | `content_length` |
|  | `hashtags` | Vectorise with TFIDF | `hashtags_vec` |
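| Tone | `sentiment_score` | No further processing (split into non-negative positive and negative parts for the Naive Bayes model only) | `sentiment_score` (`sentiment_score_pos`, `sentiment_score_neg` for Naive Bayes) |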
| Content Type | `num_links_TextLink`, `num_media_Video`, `num_media_Photo`, `num_media_Gif` | No further processing | `num_links_TextLink`, `num_media_Video`, `num_media_Photo`, `num_media_Gif` |
| Level of Engagement | `likeCount`, `quoteCount`, `part_of_convo` | No further processing | `likeCount`, `quoteCount`, `part_of_convo` |

*TODO: consider removing the "Output Variables" column from the table above.*

### Messaging: vectorise `cleanedContent`

In [189]:
def fill_na_with_blank(df):
    """Return a copy of ``df`` with every missing value replaced by ``''``.

    Parameters
    ----------
    df : pandas Series or DataFrame
        Any pandas object exposing ``fillna``.

    Returns
    -------
    Same type as ``df``, with missing entries turned into empty strings.
    """
    blank = ''
    return df.fillna(value=blank)

In [292]:
def lemmatize_column(df):
    """Lemmatize every text entry of a column.

    Each entry is tokenized with NLTK's ``word_tokenize``, every token is
    passed through ``WordNetLemmatizer.lemmatize`` and the results are
    re-joined with single spaces.

    Parameters
    ----------
    df : pandas Series of str
        Iterable of documents; its ``name`` is carried over to the result.
        Entries must already be strings (missing values are expected to have
        been filled beforehand).

    Returns
    -------
    pandas Series
        The lemmatized documents, named like the input and carrying a fresh
        default integer index.
    """
    wnl = WordNetLemmatizer()

    def _lemmatize(text):
        # tokenize first, then lemmatize token by token
        return ' '.join(wnl.lemmatize(tok) for tok in word_tokenize(text))

    return pd.Series([_lemmatize(text) for text in df], name=df.name)
    

In [None]:
# debugging aid: show the Python type and the text of every cleaned tweet
for content in tweets['cleanedContent']:
    print(type(content), content, sep='\n')

In [293]:
# create pipeline for vectorising `cleanedContent`
tvec_content_pipe = Pipeline(
    steps=[
        # replace missing tweets with '' so the tokenizer only ever receives strings
        ('filler', FunctionTransformer(fill_na_with_blank, feature_names_out='one-to-one')),
        # pass the function itself: wrapping it in a lambda added nothing and
        # lambdas cannot be serialised by the standard-library pickle
        ('lemmatizer', FunctionTransformer(lemmatize_column, feature_names_out='one-to-one')),
        # turn the lemmatized text into a sparse TF-IDF matrix
        ('tvec_content', TfidfVectorizer())
    ]
)

### Messaging: vectorise `hashtags`

In [83]:
# custom function to flatten each row's list of hashtags into one space-separated string
def convert_list_to_string(col):
    """Join the list of items stored in each cell into a single string.

    Parameters
    ----------
    col : pandas Series
        Each cell may hold a real list/tuple, the string representation of a
        list (e.g. ``"['a', 'b']"``), a blank string, or a missing value.

    Returns
    -------
    pandas Series of str
        Items joined with single spaces; blank strings, missing values and
        any other type become ``''``.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank string is not a valid Python literal.
    """

    def _join(value):
        # already a list (e.g. not round-tripped through CSV): join directly
        if isinstance(value, (list, tuple)):
            return ' '.join(value)
        # non-blank string: parse the literal safely (no eval) and join;
        # blank strings are skipped because literal_eval('') raises SyntaxError
        if isinstance(value, str) and value.strip():
            return ' '.join(ast.literal_eval(value))
        # NaN / None / blank string
        return ''

    return col.apply(_join)

In [219]:
# pipeline that vectorises the `hashtags` column
# each row of `hashtags` holds that tweet's list of hashtags
# (presumably in its string form once read back from CSV - TODO confirm)
tvec_hashtag_pipe = Pipeline([
    # missing hashtag entries become '' so the vectoriser only receives strings
    ('filler', FunctionTransformer(fill_na_with_blank, feature_names_out='one-to-one')),
    # TF-IDF over the hashtag text; the output stays sparse (no joining or densifying step)
    ('tvec_hashtag', TfidfVectorizer()),
])

### Tone: `sentiment_score`

`sentiment_score` ranges from -1.0 to 1.0, which raises an error in the Multinomial Naive Bayes classifier because it only accepts non-negative feature values. As such, we need to transform `sentiment_score` into two non-negative variables - one for the strength of positive sentiment and one for the strength of negative sentiment.

In [496]:
# create function to transform sentiment_score into positive and negative scores
def transform_scores(df):
    """Split a signed score column into two non-negative columns.

    Parameters
    ----------
    df : pandas Series
        Signed scores; its ``name`` is used as the prefix of the output
        column names.

    Returns
    -------
    pandas DataFrame
        Two columns sharing the input's index:
        ``<name>_pos`` = ``max(score, 0)`` and ``<name>_neg`` = ``max(-score, 0)``.
        An empty input yields an empty two-column frame.
    """
    # vectorised clipping instead of building one Series per element
    positive_part = df.clip(lower=0)
    negative_part = (-df).clip(lower=0)

    return pd.DataFrame({
        f'{df.name}_pos': positive_part,
        f'{df.name}_neg': negative_part,
    })

### Pre-processor for all predictor variables

In [518]:
# putting all the preprocessing steps together in one pipeline for Multinomial Naive Bayes classifier which cannot take in negative values
def _sentiment_feature_names(transformer, input_features):
    """Name the two output columns (``<col>_pos``, ``<col>_neg``) produced for each input column."""
    return [f'{name}{suffix}' for name in input_features for suffix in ('_pos', '_neg')]


preprocessor_NB = ColumnTransformer(
    transformers=[
        # transform_scores turns one column into two, so 'one-to-one' naming does not
        # apply (and the previous value 'one-to_one' was not a valid option at all);
        # a callable supplies the output names instead
        ('sent_transformer', FunctionTransformer(transform_scores, feature_names_out=_sentiment_feature_names), 'sentiment_score'),
        ('content_transformer', tvec_content_pipe, 'cleanedContent'),
        ('hashtag_transformer', tvec_hashtag_pipe, 'hashtags')
    ],
    # every column not listed above is passed through unchanged
    remainder='passthrough', verbose_feature_names_out=True
)

In [498]:
# combined preprocessing for the classifiers that accept negative feature values:
# only the two text columns are vectorised, every other column is passed through as-is
preprocessor = ColumnTransformer(
    [
        ('content_transformer', tvec_content_pipe, 'cleanedContent'),
        ('hashtag_transformer', tvec_hashtag_pipe, 'hashtags'),
    ],
    remainder='passthrough',
    verbose_feature_names_out=True,
)

----

## Model fitting and evaluation

For model evaluation, there are many metrics that can be used. For this problem statement, we will use F1 score to provide a balance between precision and recall. This is because:
1. The dataset is quite balanced between the 2 Twitter accounts
2. Classifying the tweet wrongly either way is equally detrimental

Hence, we will not prioritise precision or recall but aim to get the best F1 score instead.

### Baseline Model

The baseline model uses a most-frequent-class strategy: it always predicts the majority account. We check the F1 score for this.

In [315]:
# majority-class baseline: the constant prediction is learned from the TRAINING labels only,
# so no information from the test labels leaks into the baseline
y_pred = y_train.value_counts().idxmax()

In [319]:
# broadcast the single baseline label to one prediction per test row, then score it
baseline_predictions = y_pred * np.ones(y_test.shape)
baseline_f1 = f1_score(y_test, baseline_predictions)
print(f'F1 Score for baseline model is {baseline_f1:.3f}.')

F1 Score for baseline model is 0.674.


### Model A: Naive Bayes

We test out a Naive Bayes model and check its F1 score.

In [519]:
# model A: Naive-Bayes-specific preprocessing followed by a Multinomial Naive Bayes classifier
model_NB = Pipeline([
    ('preproc', preprocessor_NB),
    ('model', MultinomialNB()),
])

In [520]:
# search space for the randomized hyperparameter search of model A
# (keys follow the pipeline path <step>__<transformer>__<step>__<parameter>)
# max_features for the content vectoriser and ngram_range for the hashtag
# vectoriser are deliberately left at their defaults
model_NB_params = dict(
    # content TF-IDF
    preproc__content_transformer__tvec_content__max_df=uniform(loc=0, scale=1),
    preproc__content_transformer__tvec_content__stop_words=[None, 'english'],
    preproc__content_transformer__tvec_content__ngram_range=[(1, 1), (1, 3), (2, 3)],
    # hashtag TF-IDF
    preproc__hashtag_transformer__tvec_hashtag__max_df=uniform(loc=0, scale=1),
    preproc__hashtag_transformer__tvec_hashtag__stop_words=[None, 'english'],
    # smoothing strength of the Naive Bayes classifier
    model__alpha=uniform(loc=0, scale=1),
)

In [521]:
# randomized search over model A's hyperparameters:
# 20 sampled candidates, 5-fold CV, optimising F1, using all available cores
model_NB_randsearch = RandomizedSearchCV(
    estimator=model_NB,
    param_distributions=model_NB_params,
    n_iter=20,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    random_state=200,
)

In [None]:
%%time
# fit the model and tune its hyperparameters on the training split
# (%%time reports how long the whole search takes)
model_NB_randsearch.fit(X_train, y_train)

In [523]:
# metrics for the tuned model: F1 on the training split and the best candidate's mean CV F1
model_NB_train_pred = model_NB_randsearch.predict(X_train)
model_NB_cv_score = model_NB_randsearch.best_score_
model_NB_train_score = f1_score(y_train, model_NB_train_pred)

print(f'Train F1 Score: \t{model_NB_train_score:.3f}')
print(f'5-Fold CV F1 Score: \t{model_NB_cv_score:.3f}')

NotFittedError: This RandomizedSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
# confusion matrix of the tuned model on the training split
# labels are fixed to [0, 1] so rows/columns are ordered wp (0) then pap (1),
# matching the target encoding (pap=1, wp=0); the display labels follow that order
train_cm = confusion_matrix(y_train, model_NB_randsearch.predict(X_train), labels=[0, 1])
cm_plot = ConfusionMatrixDisplay(train_cm, display_labels=['wp', 'pap'])

cm_plot.plot(cmap='Blues')

### Model A: Naive Bayes

We test out a Naive Bayes model and check its F1 score.

## Later fitting

In [None]:
# Two-step pipeline: TF-IDF vectorisation followed by a Multinomial Naive Bayes classifier

pipe_tvec = Pipeline(
    steps=[
        ('tvec', TfidfVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [None]:
# Hyperparameter grid for the TF-IDF step:
# - vocabulary size capped at 2000, 3000, 4000 or 5000 features
# - with and without English stop-word removal
# - unigrams only, or unigrams plus bigrams

pipe_tvec_params = dict(
    tvec__max_features=[2000, 3000, 4000, 5000],
    tvec__stop_words=[None, 'english'],
    tvec__ngram_range=[(1, 1), (1, 2)],
)

In [None]:
# Instantiate GridSearchCV.
# scoring='f1' keeps this search consistent with the evaluation metric chosen for
# this notebook (and with the randomized search for model A); without it the
# search would rank candidates by the classifier's default accuracy score

gs_tvec = GridSearchCV(pipe_tvec, # what object are we optimizing?
                        param_grid = pipe_tvec_params, # what parameters values are we searching?
                        scoring='f1', # rank candidates by F1 score
                        cv=5) # 5-fold cross-validation.

In [None]:
# Fit GridSearch to training data.
# pipe_tvec starts with a TfidfVectorizer, which expects an iterable of text documents,
# so pass only the tweet text column (not the whole feature DataFrame) and replace
# missing tweets with '' because the vectoriser rejects NaN documents
gs_tvec.fit(X_train['cleanedContent'].fillna(''), y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        no