## **Import Libraries**

In [8]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split, GridSearchCV
from bs4 import BeautifulSoup       
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

## **Import Dataframes**

In [9]:
# Read the scraped posts of each subreddit into its own DataFrame.
data_ai, data_ml = (
    pd.read_csv(csv_path)
    for csv_path in ('../project_3-master/data/data_ai.csv',
                     '../project_3-master/data/data_ml.csv')
)

In [12]:
# Preview the first row of the r/artificial data to check the loaded columns.
data_ai.head(1)

Unnamed: 0,subreddit,title,selftext
0,artificial,Artificial Intelligence Easily Beats Human Fig...,


In [13]:
# Preview the first row of the r/MachineLearning data to check the loaded columns.
data_ml.head(1)

Unnamed: 0,subreddit,title,selftext
0,MachineLearning,Predicting NBA upsets with GANs,


## **Merge the Data**

In [14]:
# Stack the two subreddit frames. DataFrame.append was deprecated in pandas 1.4
# and removed in 2.0, so pd.concat is used instead. reset_index() renumbers the
# rows and keeps the old labels in an 'index' column, which the next cell drops.
df = pd.concat([data_ai, data_ml]).reset_index()

In [15]:
# Discard the leftover 'index' column produced by reset_index().
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns='index', errors='ignore')

In [17]:
# Display the merged frame (pandas renders a truncated view of the rows).
df

Unnamed: 0,subreddit,title,selftext
0,artificial,Artificial Intelligence Easily Beats Human Fig...,
1,artificial,Foiling illicit cryptocurrency mining with art...,
2,artificial,A.I. taking over?,
3,artificial,Reviewing recent advancements in the developme...,
4,artificial,Argonne scientists use artificial intelligence...,
...,...,...,...
49143,MachineLearning,[D] 2nd Order Approximation in XGboost's Objec...,"Hi all,\n\nI have a quick question regarding X..."
49144,MachineLearning,Neural Network Based Optimal Control: Resilien...,
49145,MachineLearning,Random noise on weights is L2 regularisation?,[removed]
49146,MachineLearning,[Research] Rotated Mask RCNN,# The Problem With MaskRCNN (and Bounding Boxe...


In [18]:
# Count missing values per column.
df.isnull().sum()

subreddit        0
title            0
selftext     24347
dtype: int64

**Let's see what a title might look like:**

In [19]:
# Look up the title stored under index label 0 and show it as a raw string.
df['title'][0]

'Artificial Intelligence Easily Beats Human Fighter Pilot in DARPA Trial'

## **Train/Test Split**

In [20]:
# Features: the post title, kept as a one-column DataFrame.
# Target: the subreddit each post belongs to.
X, y = df[['title']], df['subreddit']

In [21]:
# Peek at the first row of the feature frame.
X.head(1)

Unnamed: 0,title
0,Artificial Intelligence Easily Beats Human Fig...


In [22]:
# Feature frame dimensions: (rows, columns).
X.shape

(49148, 1)

In [23]:
# Target length; its row count should equal that of X.
y.shape

(49148,)

In [24]:
# Hold-out split, stratified on the subreddit label and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [25]:
# One shape per line: X_train, X_test, y_train, y_test.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")


(36861, 1)
(12287, 1)
(36861,)
(12287,)


In [26]:
# Inspect the training titles (the Series that the cleaning loop iterates over).
X_train['title']

38049                    Language decision for productions
6212     Trading Awesomemeorabilia items at Awesomemeor...
28794    [D] How to select the categorical encoding met...
5174                                 Invisible AI keyboard
28488                        [D] Question about MOCO paper
                               ...                        
1065     Here is a link to a project on my github that ...
23850    Inspiring Loft Conversion Ideas for Trendy Loo...
22161    Tesla’s Autopilot Trouble, a Mercedes Hypercar...
43154    can I use transformer for name-entity-recognit...
48307    [D][P] Tutorial: Cloning your voice using Micr...
Name: title, Length: 36861, dtype: object

## **Function for Cleaning**

In [27]:
def review_to_words(raw_review, stops=None):
    """Convert a raw post title into a cleaned, space-separated string of words.

    Processing steps:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lower-case the text and split it on whitespace.
      3. Drop stop words.
      4. Re-join the remaining words with single spaces.

    No HTML stripping is performed; the input is used as-is.

    Parameters
    ----------
    raw_review : str or None or float NaN
        The raw text to clean. ``None`` and ``NaN`` (pandas' missing-value
        marker) are treated as empty text.
    stops : collection of str, optional
        Stop words to remove. When omitted, NLTK's English stop-word list is
        loaded once and cached on the function, instead of being rebuilt on
        every call.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing survives cleaning.
    """
    # Missing values would make re.sub raise TypeError; map them to empty text.
    # (NaN is the only float that is not equal to itself.)
    if raw_review is None or (isinstance(raw_review, float) and raw_review != raw_review):
        return ""

    if stops is None:
        # Build the default stop-word set only on first use. Membership tests on
        # a set are much faster than on the list NLTK returns.
        stops = getattr(review_to_words, "_english_stops", None)
        if stops is None:
            stops = frozenset(stopwords.words('english'))
            review_to_words._english_stops = stops

    # Digits, punctuation and non-ASCII characters all become spaces.
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review)

    # Lower-case and tokenize on whitespace.
    words = letters_only.lower().split()

    # Keep only the words that are not stop words.
    meaningful_words = [w for w in words if w not in stops]

    return " ".join(meaningful_words)

In [28]:
# Count every title in the merged dataframe.
total_titles = len(df)
print(f'There are {total_titles} titles.')

# Empty containers that will receive the cleaned train / test titles.
clean_train_titles, clean_test_titles = [], []

There are 49148 titles.


In [29]:
print("Cleaning and parsing the training set for titles...")

# Start from empty lists on every run so that re-executing this cell cannot
# append a second copy of the titles and leave the lists longer than
# y_train / y_test.
clean_train_titles = []
clean_test_titles = []

# j counts titles across BOTH splits, which matches the "of {total_titles}"
# total printed in the progress messages.
j = 0

for train_title in X_train['title']:
    # Clean the title, then append it to clean_train_titles.
    clean_train_titles.append(review_to_words(train_title))

    # Report progress every 1000 titles.
    if (j + 1) % 1000 == 0:
        print(f'Review {j + 1} of {total_titles}.')

    j += 1

# Same cleaning for the testing set.

print("Cleaning and parsing the testing set for titles...")

for test_title in X_test['title']:
    # Clean the title, then append it to clean_test_titles.
    clean_test_titles.append(review_to_words(test_title))

    # Report progress every 1000 titles.
    if (j + 1) % 1000 == 0:
        print(f'Review {j + 1} of {total_titles}.')

    j += 1

Cleaning and parsing the training set for titles...
Review 1000 of 49148.
Review 2000 of 49148.
Review 3000 of 49148.
Review 4000 of 49148.
Review 5000 of 49148.
Review 6000 of 49148.
Review 7000 of 49148.
Review 8000 of 49148.
Review 9000 of 49148.
Review 10000 of 49148.
Review 11000 of 49148.
Review 12000 of 49148.
Review 13000 of 49148.
Review 14000 of 49148.
Review 15000 of 49148.
Review 16000 of 49148.
Review 17000 of 49148.
Review 18000 of 49148.
Review 19000 of 49148.
Review 20000 of 49148.
Review 21000 of 49148.
Review 22000 of 49148.
Review 23000 of 49148.
Review 24000 of 49148.
Review 25000 of 49148.
Review 26000 of 49148.
Review 27000 of 49148.
Review 28000 of 49148.
Review 29000 of 49148.
Review 30000 of 49148.
Review 31000 of 49148.
Review 32000 of 49148.
Review 33000 of 49148.
Review 34000 of 49148.
Review 35000 of 49148.
Review 36000 of 49148.
Cleaning and parsing the testing set for titles...
Review 37000 of 49148.
Review 38000 of 49148.
Review 39000 of 49148.
Review 40

In [30]:
# Sanity check: number of cleaned training titles (compare with X_train's row count).
len(clean_train_titles)

36861

In [31]:
# Sanity check: number of cleaned testing titles (compare with X_test's row count).
len(clean_test_titles)

12287

## **Word EDA**

In [32]:
# Bag-of-words vectorizer from scikit-learn: word-level analysis, vocabulary
# capped at 2500 features, and terms must appear in at least 2 documents.
# No extra tokenizer, preprocessor or stop-word list is supplied here.
vectorizer = CountVectorizer(
    analyzer="word",
    max_features=2500,
    min_df=2,
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
)

In [58]:
# Learn the vocabulary from the cleaned training titles and turn them into
# count vectors in one step, then convert the result to a dense NumPy array.
train_data_features = vectorizer.fit_transform(clean_train_titles).toarray()

# The test titles are only transformed with the vocabulary learned above;
# the result is left as returned by transform().
test_data_features = vectorizer.transform(clean_test_titles)

In [60]:
# Vocabulary terms, in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() when
# the installed version provides it and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    word_list = list(vectorizer.get_feature_names_out())
else:
    word_list = vectorizer.get_feature_names()

# Column sums of the training count matrix: total occurrences of each term.
count_list = train_data_features.sum(axis=0)

In [61]:
# Map each vocabulary term to its total count in the training titles.
word_dict = {term: total for term, total in zip(word_list, count_list)}

In [63]:
# Training count matrix as a DataFrame with one column per vocabulary term.
cleaned_df = pd.DataFrame(data=train_data_features, columns=word_list)

In [64]:
from collections import Counter

# Load the term -> count mapping into a Counter, take the 15 highest-count
# terms as (term, count) pairs, and tabulate them.
counter = Counter(word_dict)
most_common = counter.most_common(15)
most_df = pd.DataFrame(most_common)

In [65]:
# Show the most frequent terms: column 0 is the term, column 1 its count.
most_df

Unnamed: 0,0,1
0,ai,9671
1,learning,6083
2,machine,3918
3,artificial,3581
4,intelligence,3548
5,data,2221
6,using,1615
7,deep,1527
8,ml,1461
9,new,1337


In [66]:
# Train and test matrices should have the same number of feature columns.
print(train_data_features.shape, test_data_features.shape, sep="\n")

(36861, 2500)
(12287, 2500)


In [38]:
# Peek at the first six rows of the dense training count matrix.
train_data_features[0:6]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [55]:
# Fitted vocabulary in column order. get_feature_names() was removed in
# scikit-learn 1.2; use get_feature_names_out() when available and fall back
# to the old method on earlier releases.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

## **Simple Logistic Regression without Gridsearch**

In [68]:
# LogisticRegression is imported in this cell rather than with the top imports.
from sklearn.linear_model import LogisticRegression

# Baseline classifier: liblinear solver, everything else left at its default.
lr = LogisticRegression(solver='liblinear')

In [69]:
# Fit the baseline model on the training count matrix and subreddit labels.
lr.fit(train_data_features, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [72]:
# Accuracy of the baseline model on the training and the held-out features.
print(
    f"Simple LogisticRegression training accuracy is: {lr.score(train_data_features, y_train)}",
    f"Simple LogisticRegression testing accuracy is: {lr.score(test_data_features, y_test)}",
    sep="\n",
)

Simple LogisticRegression training accuracy is: 0.8256694066899976
Simple LogisticRegression testing accuracy is: 0.8057296329453895


## **Gridsearched Count Vectorizer for Logistic Regression and Naive Bayes**

In [44]:
# Two pipelines with the same CountVectorizer step name ('cvec'), feeding
# Logistic Regression and Multinomial Naive Bayes respectively.
pipe_cvec_lr = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(solver='liblinear')),
])

pipe_cvec_nb = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [45]:
# CountVectorizer hyperparameters to search; the 'cvec__' prefix targets the
# pipeline step named 'cvec'. A max_df grid ([.90, .95]) is currently not
# part of the search.
pipe_params = dict(
    cvec__max_features=[2000, 3000, 4000, 5000],
    cvec__min_df=[2, 3],
    cvec__ngram_range=[(1, 1), (1, 2)],
)

In [46]:
# Grid searches over the shared vectorizer grid, each with 5-fold cross-validation.

# Logistic Regression pipeline
gs_cvec_lr = GridSearchCV(estimator=pipe_cvec_lr, param_grid=pipe_params, cv=5)

# Naive Bayes pipeline
gs_cvec_nb = GridSearchCV(estimator=pipe_cvec_nb, param_grid=pipe_params, cv=5)

In [47]:
# Show the configured (not yet fitted) Logistic Regression grid search.
gs_cvec_lr

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [48]:
# Show the configured (not yet fitted) Naive Bayes grid search.
gs_cvec_nb

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [49]:
# Run the Logistic Regression grid search on the cleaned training titles
# (raw strings; vectorizing happens inside the pipeline).
gs_cvec_lr.fit(clean_train_titles, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [50]:
# Run the Naive Bayes grid search on the cleaned training titles
# (raw strings; vectorizing happens inside the pipeline).
gs_cvec_nb.fit(clean_train_titles, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [51]:
# Best cross-validated score of each search: Logistic Regression, then Naive Bayes.
print(gs_cvec_lr.best_score_, gs_cvec_nb.best_score_, sep="\n")

0.796370006150081
0.7998698047656541


In [74]:
# Accuracy of the grid-searched Logistic Regression on the train and test titles.
print(
    f"LogisticRegression with Gridsearch training accuracy is: {gs_cvec_lr.score(clean_train_titles, y_train)}",
    f"LogisticRegression with Gridsearch testing accuracy is: {gs_cvec_lr.score(clean_test_titles, y_test)}",
    sep="\n",
)

LogisticRegression with Gridsearch training accuracy is: 0.8475895933371314
LogisticRegression with Gridsearch testing accuracy is: 0.8102872955155855


In [75]:
# Score the grid-searched Naive Bayes model on the train and test titles.
# The second line scores the held-out test set, so it is labelled "testing"
# (it previously repeated the "training" label).
print(f"Naive Bayes with Gridsearch training accuracy is: {gs_cvec_nb.score(clean_train_titles, y_train)}")
print(f"Naive Bayes with Gridsearch testing accuracy is: {gs_cvec_nb.score(clean_test_titles, y_test)}")

Naive Bayes with Gridsearch training accuracy is: 0.816391307886384
Naive Bayes with Gridsearch training accuracy is: 0.8040205094815659


## **Scores**

Simple LogisticRegression training accuracy is: 0.8256694066899976  
Simple LogisticRegression testing accuracy is: 0.8057296329453895  

LogisticRegression with Gridsearch training accuracy is: 0.8475895933371314  
LogisticRegression with Gridsearch testing accuracy is: 0.8102872955155855  


Naive Bayes with Gridsearch training accuracy is: 0.816391307886384  
Naive Bayes with Gridsearch testing accuracy is: 0.8040205094815659  