In [47]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier

import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords, wordnet

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, mean_absolute_error, mean_squared_error, accuracy_score, precision_score, recall_score

import pickle

In [2]:
# Timestamp suffix identifying which raw extract file to load.
most_recent_extract = '2023-06-11 16:25'
# Raw scraped posts; the file name embeds the extract timestamp chosen above.
df = pd.read_csv(f'data/reddit_posts_raw_{most_recent_extract}.csv')

In [3]:
# Sanity check: (rows, columns) of the loaded extract.
df.shape

(1971, 6)

In [4]:
# Peek at a single row to confirm the column layout.
df.head(1)

Unnamed: 0,subreddit,id,created_utc,title,selftext,top_comment_text
0,dating,1471ube,2023-06-11 18:49:33,Am I Clueless?,So there is this girl I’ve known my whole life...,


### Self-text only

In [6]:
# Features: the post body text. Posts without a body are read from the CSV as NaN,
# which the text vectorizers reject, so treat them as empty documents instead.
# df['selftext'] is already a Series, so no pd.Series(...) wrapper is needed.
X = df['selftext'].fillna('')

# Target: binary label per post (0 = r/dating, 1 = r/datingoverthirty).
y = df['subreddit'].map({'dating': 0,
                         'datingoverthirty': 1})

### Self Text and Top Comment - Alternative Path

In [7]:
# df['self_text_and_comment'] = df['selftext'].astype(str) + df['top_comment_text'].astype(str)
# X = pd.Series(df['self_text_and_comment'])
# y = df['subreddit'].map({'dating': 0,
#                    'datingoverthirty':1})

### Train-Test Split

In [20]:
# Hold out a quarter of the posts for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Persist every piece of the split so other notebooks can reload exactly the same data.
X_train.to_pickle('./pickled_models/X_train.pkl')
y_train.to_pickle('./pickled_models/y_train.pkl')
X_test.to_pickle('./pickled_models/X_test.pkl')
y_test.to_pickle('./pickled_models/y_test.pkl')

#### Baseline

> The majority class holds 50.63% of responses.  This is the baseline score to beat.

> The classes are nearly evenly distributed, so a simple 75/25 train-test split is workable.

In [34]:
# Naive baseline: a DummyClassifier with default settings, fitted on the training split.
dummy = DummyClassifier()
dummy.fit(X_train, y_train)
# Predict from the held-out *features*. The original passed y_test (the labels) here,
# which is the wrong argument for predict() even if the dummy model never reads it.
dummy_preds = dummy.predict(X_test)

# mean_absolute_error, mean_squared_error, accuracy_score, precision_score, recall_score

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [5]:
# Share of posts per subreddit; the larger share is the majority-class baseline quoted above.
df['subreddit'].value_counts(normalize = True)

datingoverthirty    0.506342
dating              0.493658
Name: subreddit, dtype: float64

## Baseline Investigation with Standard Vectorizers

#### CountVectorizer

In [22]:
# Baseline bag-of-words vectorizer with all default settings, fitted on the training text only.
cvec0 = CountVectorizer() #standard CountVectorizer
cvec0.fit(X_train)
# Write the fitted vectorizer inside a context manager so the file handle is
# flushed and closed even if pickling fails (the bare open() call leaked it).
with open('./pickled_models/cvec0_baseline', 'wb') as cvec0_file:
    pickle.dump(cvec0, cvec0_file)

> See Model Investigation for Investigations

#### Tf-Idf Vectorizer

In [23]:
# Baseline TF-IDF vectorizer with all default settings.
tvec0 = TfidfVectorizer()
# Fit on the training text before saving, mirroring the CountVectorizer baseline;
# previously an unfitted vectorizer was pickled.
tvec0.fit(X_train)
# Context manager guarantees the pickle file is closed (the bare open() call leaked it).
with open('./pickled_models/tvec0_baseline', 'wb') as tvec0_file:
    pickle.dump(tvec0, tvec0_file)

> See Model Investigation for Investigations

## Next

#### Stemming and Lemmatizing

In [24]:
p_stemmer = PorterStemmer()
def stem_post(post):
    """Return `post` with every whitespace-separated word replaced by its Porter stem.

    Used as a `preprocessor` option for the TF-IDF vectorizer in the search below.

    Parameters
    ----------
    post : str
        Raw text of a single post.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    # split() with no argument breaks on any run of whitespace (spaces, tabs, newlines).
    # split(' ') left words on either side of a line break glued together as one
    # "word" and produced empty tokens for repeated spaces.
    return ' '.join(p_stemmer.stem(word) for word in post.split())
#cite 6/9 Breakfast Hour

In [75]:
lemmatizer = WordNetLemmatizer()
# cite: Lesson 504 NLP 1 - Modified to handle complete words.
def lemmatize_post(post):
    """Return a lowercased copy of `post` with each word lemmatized by part of speech.

    Words are POS-tagged with `nltk.pos_tag`; adjectives, verbs, nouns and adverbs
    are lemmatized with the matching WordNet POS, and every other word is passed
    through unchanged (apart from lowercasing).

    Parameters
    ----------
    post : str
        Raw text of a single post.

    Returns
    -------
    str
        The processed words joined by single spaces, all lowercase.
    """
    # First letter of the Penn Treebank tag -> WordNet part-of-speech constant.
    mapper = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV
    }
    # split() (no argument) breaks on any whitespace and never yields empty tokens;
    # split(' ') kept newline-joined words together and fed '' to the tagger.
    words = post.split()
    post_lem = []
    # Tag the original-cased words, since capitalisation can inform the tagger.
    for token, tag in nltk.pos_tag(words):
        pos = mapper.get(tag[:1])
        # Lowercase *before* lemmatizing: WordNet lemmas are looked up in lowercase,
        # so a capitalised token such as "Dogs" would otherwise come back unchanged.
        word = token.lower()
        post_lem.append(lemmatizer.lemmatize(word, pos) if pos is not None else word)
    return ' '.join(post_lem).lower()

## RandomizedSearchCV over Multiple Model Types with Tfidf Vectorization

In [42]:
# I want to evaluate multiple classifiers in the same RandomizedSearchCV, trying different combinations of Tfidf / CountVectorizer and LogisticRegression() / MultinomialNB
# Inspiration: Wrapper Class (https://stackoverflow.com/questions/50285973/pipeline-multiple-classifiers).  Content: DSI Lesson 507 on OOP (https://git.generalassemb.ly/bobadams1/507-lesson-object-oriented-programming)
'''
Notes from Inspiration above (no copy-paste):
1. Need BaseEstimator() as the base class for all sklearn estimators - as a stand in for the estimator being selected
2. The class only really needs to have self and the estimator as objects in the class.
3. The methods you would normally call for the estimator should be defined as functions within the model (don't forget to pass self every time!)
'''
from sklearn.base import BaseEstimator

class Multi_Classifier(BaseEstimator):
    """Pass-through wrapper so a search can swap the classifier inside one pipeline step.

    The wrapped model is exposed as the `estimator` parameter, so a parameter grid
    can set `<step>__estimator` to choose the model and `<step>__estimator__<param>`
    to tune it. All work is delegated to the wrapped estimator.

    Parameters
    ----------
    estimator : estimator object, default=MultinomialNB()
        The classifier to delegate to. It is fitted in place.
    """

    def __init__(self, estimator = MultinomialNB()): # MultinomialNB as default
        # NOTE: the default instance is created once, when the class is defined, and is
        # therefore shared by every Multi_Classifier() built without an explicit estimator.
        self.estimator = estimator

    def fit(self, X, y): # interested in LogisticRegression, NB... both take primarily X,y
        """Fit the wrapped estimator on (X, y) and return this wrapper.

        Returning `self` (rather than the inner estimator) follows the scikit-learn
        estimator convention and allows `Multi_Classifier(...).fit(X, y).predict(X)`.
        """
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped estimator's class predictions for X."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's class probabilities for X."""
        return self.estimator.predict_proba(X)

    def score(self, X,y):
        """Return the wrapped estimator's own score on (X, y)."""
        return self.estimator.score(X,y)


### Pipeline

In [96]:
# Two-step text pipeline: TF-IDF features feeding the swappable classifier wrapper.
pipe0 = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('cls', Multi_Classifier()),
])

# Vectorizer options searched for every classifier.
# (max_features and min_df are deliberately left at their defaults.)
_shared_tvec_params = {
    'tvec__preprocessor': [None, stem_post, lemmatize_post],
    'tvec__max_df': [1.0, 0.9],
    'tvec__ngram_range': [(1, 1), (1, 2)],
    'tvec__stop_words': [None, 'english'],
}

# One parameter space per estimator, passed to the search as a list.
# Cite: Tim Office Hours
params0 = [
    # Logistic Regression: additionally searches the regularisation strength C
    # (max_iter and penalty stay at their defaults).
    {
        **_shared_tvec_params,
        'cls__estimator': [LogisticRegression()],
        'cls__estimator__C': np.linspace(0.00001, 1, 10),
    },
    # Multinomial Naive Bayes: vectorizer options only.
    {
        **_shared_tvec_params,
        'cls__estimator': [MultinomialNB()],
    },
]

In [97]:
pipe0.get_params() ## Default pipeline parameters; the wrapped estimator listed is the MultinomialNB default, not LogisticRegression

{'memory': None,
 'steps': [('tvec', TfidfVectorizer()), ('cls', Multi_Classifier())],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'cls': Multi_Classifier(),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None,
 'cls__estimator__alpha': 1.0,
 'cls__estimator__class_prior': None,
 'cls__estimator__fit_prior': True,
 'cls__estimator__force_alpha': 'warn',
 'cls__estimator': MultinomialNB()}

In [98]:
pipe0.get_params() ## for Multinomial Naive Bayes (the default estimator of Multi_Classifier)

{'memory': None,
 'steps': [('tvec', TfidfVectorizer()), ('cls', Multi_Classifier())],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'cls': Multi_Classifier(),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None,
 'cls__estimator__alpha': 1.0,
 'cls__estimator__class_prior': None,
 'cls__estimator__fit_prior': True,
 'cls__estimator__force_alpha': 'warn',
 'cls__estimator': MultinomialNB()}

In [99]:
# Randomized search over both estimator parameter spaces with 5-fold cross-validation.
# random_state pins which candidates are sampled so the search is reproducible
# between runs; 42 matches the seed used for the train/test split.
rs0 = RandomizedSearchCV(estimator=pipe0,
                        param_distributions=params0,
                        cv = 5,
                        random_state=42
                       )

In [None]:
# Run the search on the training split only; the test split stays untouched for final evaluation.
rs0.fit(X_train, y_train)