## **Import Libraries**

In [3]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split, GridSearchCV
from bs4 import BeautifulSoup       
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

## **Import Dataframes**

In [4]:
# Load the two CSV exports. Per the previews in the following cells, data_ai holds
# posts from the 'artificial' subreddit and data_ml posts from 'MachineLearning'.
# NOTE(review): the paths are relative to the notebook's working directory.
data_ai, data_ml = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../project_3-master/data/data_ai.csv',
        '../project_3-master/data/data_ml.csv',
    )
)

In [5]:
data_ai.head(1)

Unnamed: 0,subreddit,title,selftext
0,artificial,Could AI ethics draw on non-Western philosophi...,


In [6]:
data_ml.head(1)

Unnamed: 0,subreddit,title,selftext
0,MachineLearning,[R] Taming pretrained transformers for eXtreme...,New X-Transformer model from Amazon Research\n...


## **Merge the Data**

In [7]:
# Stack the two frames row-wise. `DataFrame.append` was deprecated in pandas 1.4 and
# removed in 2.0; `pd.concat` produces the same result on every pandas version.
# `reset_index()` still materialises the old row labels as an 'index' column, which
# the next cell drops.
df = pd.concat([data_ai, data_ml]).reset_index()

In [8]:
# Remove the leftover 'index' column created by reset_index(). Rebinding (instead of
# inplace=True) plus errors='ignore' keeps this cell idempotent: the in-place version
# raised KeyError when the cell was run a second time because the column was already gone.
df = df.drop(columns='index', errors='ignore')

In [9]:
df

Unnamed: 0,subreddit,title,selftext
0,artificial,Could AI ethics draw on non-Western philosophi...,
1,artificial,Realistic simulation of tearing meat and peeli...,
2,artificial,[R] Using Deep RL to Model Human Locomotion Co...,In the new paper [*Deep Reinforcement Learning...
3,artificial,Artificial Intelligence Easily Beats Human Fig...,
4,artificial,Foiling illicit cryptocurrency mining with art...,
...,...,...,...
62593,MachineLearning,What are some things that you wish you knew be...,[removed]
62594,MachineLearning,[D] Does anyone created a formal database for ...,I'm looking for a database that has sufficient...
62595,MachineLearning,"[P] Demo of ""Arbitrary Style Transfer with Sty...",Hi MachineLearning\n\nI'll introduce awsome st...
62596,MachineLearning,[R] Triplet loss for image retrieval,"Hi, there!\n\n \nThis is an example of image ..."


In [10]:
df.isnull().sum()

subreddit        0
title            0
selftext     31046
dtype: int64

**Let's see what a title might look like:**

In [11]:
# Peek at the title stored under row label 0.
df.loc[0, 'title']

'Could AI ethics draw on non-Western philosophies to help reframe AI ethics'

## **Train/Test Split**

In [12]:
# Features: the post title only, kept as a one-column DataFrame.
# Target: the subreddit label of each post.
feature_cols, target_col = ['title'], 'subreddit'
X = df[feature_cols]
y = df[target_col]

In [13]:
X.head(1)

Unnamed: 0,title
0,Could AI ethics draw on non-Western philosophi...


In [14]:
X.shape

(62598, 1)

In [15]:
y.shape

(62598,)

In [16]:
# Hold out part of the data for testing. Stratifying on y keeps the subreddit
# proportions the same in both parts; the fixed seed makes the split reproducible.
split_parts = train_test_split(X, y, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [17]:
# Sanity-check the split sizes: features first (train, test), then targets.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)


(46948, 1)
(15650, 1)
(46948,)
(15650,)


In [18]:
X_train['title']

2432         ESTADÍSTICA DESCRIPTIVA PARA LA INVESTIGACIÓN
54386                         [R] OgmaNeo plays Atari Pong
40950                       Great Review of Linear Algebra
16495    Microsoft CEO Satya Nadella says Artificial In...
57103    [D] What are the performance metrices of word ...
                               ...                        
518                        Here's an interesting AI Video!
18857    How enhanced reality technologies contribute t...
56723    What is the absolute latest and greatest resea...
5392           Software for Building AI Assistant/ Chatbot
9086     There's a magazine written by AI. It's a littl...
Name: title, Length: 46948, dtype: object

## **Function for Cleaning**

In [19]:
# One-slot cache for the NLTK English stop-word list, so the corpus file is loaded
# once instead of once per title (the function is called for every row).
_ENGLISH_STOPS_CACHE = []


def _english_stops():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if not _ENGLISH_STOPS_CACHE:
        _ENGLISH_STOPS_CACHE.append(frozenset(stopwords.words('english')))
    return _ENGLISH_STOPS_CACHE[0]


def review_to_words(raw_review, stops=None):
    """Normalise one raw title into a space-separated string of lowercase words.

    Steps: replace every character outside a-z / A-Z with a space, lowercase,
    split on whitespace, drop stop words, and re-join with single spaces.
    No HTML stripping is performed.

    Parameters
    ----------
    raw_review : str or None
        The raw text (in this notebook, a post title). ``None`` and float NaN
        (pandas' missing-value marker) are treated as empty text.
    stops : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, which is
        loaded once and cached.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives the filtering.

    Raises
    ------
    TypeError
        If ``raw_review`` is neither a string nor a missing value (raised by ``re.sub``).
    """
    # Missing values would otherwise make re.sub raise; NaN is the only float
    # that is not equal to itself.
    if raw_review is None or (isinstance(raw_review, float) and raw_review != raw_review):
        return ""

    # NOTE(review): the character class is ASCII-only, so accented letters are
    # replaced by spaces too (e.g. "ESTADÍSTICA" -> "estad stica"). Kept as-is
    # because changing it would alter the model's vocabulary.
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review)

    words = letters_only.lower().split()

    if stops is None:
        stops = _english_stops()

    return " ".join(w for w in words if w not in stops)

In [20]:
# Total number of titles in the merged frame; the cleaning loop uses it as the
# denominator in its progress messages.
total_titles = len(df)
print(f'There are {total_titles} titles.')

# Containers that will receive the cleaned training and testing titles.
clean_train_titles, clean_test_titles = [], []

There are 62598 titles.


In [21]:
# Start from empty lists every time this cell runs. The original only appended to
# lists created in an earlier cell, so re-running this cell alone doubled their
# length and silently misaligned them with y_train / y_test.
clean_train_titles = []
clean_test_titles = []

# Running count of titles processed across BOTH splits, so the progress messages
# count up towards total_titles.
j = 0

for split_name, raw_titles, cleaned_titles in (
    ("training", X_train['title'], clean_train_titles),
    ("testing", X_test['title'], clean_test_titles),
):
    print(f"Cleaning and parsing the {split_name} set for titles...")

    for raw_title in raw_titles:
        # Clean the title and store it in the list for the current split.
        cleaned_titles.append(review_to_words(raw_title))

        # Report progress every 1000 titles.
        if (j + 1) % 1000 == 0:
            print(f'Review {j + 1} of {total_titles}.')

        j += 1

# Each cleaned list must line up one-to-one with its split.
assert len(clean_train_titles) == len(X_train)
assert len(clean_test_titles) == len(X_test)

Cleaning and parsing the training set for titles...
Review 1000 of 62598.
Review 2000 of 62598.
Review 3000 of 62598.
Review 4000 of 62598.
Review 5000 of 62598.
Review 6000 of 62598.
Review 7000 of 62598.
Review 8000 of 62598.
Review 9000 of 62598.
Review 10000 of 62598.
Review 11000 of 62598.
Review 12000 of 62598.
Review 13000 of 62598.
Review 14000 of 62598.
Review 15000 of 62598.
Review 16000 of 62598.
Review 17000 of 62598.
Review 18000 of 62598.
Review 19000 of 62598.
Review 20000 of 62598.
Review 21000 of 62598.
Review 22000 of 62598.
Review 23000 of 62598.
Review 24000 of 62598.
Review 25000 of 62598.
Review 26000 of 62598.
Review 27000 of 62598.
Review 28000 of 62598.
Review 29000 of 62598.
Review 30000 of 62598.
Review 31000 of 62598.
Review 32000 of 62598.
Review 33000 of 62598.
Review 34000 of 62598.
Review 35000 of 62598.
Review 36000 of 62598.
Review 37000 of 62598.
Review 38000 of 62598.
Review 39000 of 62598.
Review 40000 of 62598.
Review 41000 of 62598.
Review 42000 o

In [22]:
len(clean_train_titles)

46948

In [23]:
len(clean_test_titles)

15650

## **Word EDA**

In [24]:
# Bag-of-words settings for scikit-learn's CountVectorizer: word-level tokens with
# the library's own tokenizer/preprocessor and no stop-word list, a vocabulary
# capped at 2500 terms, and terms found in fewer than 2 titles ignored.
bow_settings = dict(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=2500,
    min_df=2,
)
vectorizer = CountVectorizer(**bow_settings)

In [25]:
# Learn the vocabulary from the cleaned training titles and turn them into count
# vectors, converting the result to a dense NumPy array right away.
# NOTE(review): the dense array has one row per training title and one column per
# vocabulary term, so it is memory-hungry; later cells rely on it being dense.
train_data_features = vectorizer.fit_transform(clean_train_titles).toarray()

# Encode the test titles with the vocabulary learned above (no re-fitting).
# The result is left in the form returned by transform().
test_data_features = vectorizer.transform(clean_test_titles)

In [27]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` when the installed version has it and fall back
# to the old method otherwise, so the cell runs on both old and new releases.
_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
# Normalise to a plain list of Python strings regardless of which method was used.
word_list = [str(term) for term in _feature_names()]

# Column-wise sum: total number of occurrences of each vocabulary term across
# all training titles.
count_list = train_data_features.sum(axis=0)

In [28]:
# Pair each vocabulary term with its total count in the training titles.
word_dict = dict(zip(word_list,count_list))

In [29]:
# Training count matrix as a DataFrame, with the vocabulary terms as column labels.
cleaned_df = pd.DataFrame(train_data_features, columns= word_list)

In [30]:
from collections import Counter

# Rank the vocabulary by total training-set count and keep the 15 most frequent
# terms as (term, count) pairs, then tabulate them.
counter = Counter(word_dict)
most_common = counter.most_common(15)
most_df = pd.DataFrame(most_common)

In [31]:
most_df

Unnamed: 0,0,1
0,ai,11655
1,learning,7549
2,machine,4904
3,artificial,4685
4,intelligence,4623
5,data,2646
6,using,1991
7,deep,1955
8,ml,1808
9,new,1664


In [32]:
# Both matrices should have the same number of columns (one per vocabulary term).
for feature_matrix in (train_data_features, test_data_features):
    print(feature_matrix.shape)

(46948, 2500)
(15650, 2500)


In [33]:
train_data_features[0:6]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [34]:
# Vocabulary learned by the vectorizer. `get_feature_names()` was removed in
# scikit-learn 1.2, so use `get_feature_names_out()` when available and fall back
# to the old method on older releases.
_vocab_getter = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
vocab = [str(term) for term in _vocab_getter()]

## **Simple Logistic Regression without Gridsearch**

In [35]:
# Baseline classifier: logistic regression on the bag-of-words counts.
# NOTE(review): this import lives mid-notebook; the grid-search pipelines further
# down also depend on it, so this cell must run before them.

from sklearn.linear_model import LogisticRegression

# Use the 'liblinear' solver; every other hyperparameter is left at its default.
lr = LogisticRegression(solver = 'liblinear')

In [36]:
lr.fit(train_data_features, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
# Mean accuracy of the baseline model on the data it was fit on, then on the
# held-out test features.
for split_label, features, labels in (
    ("training", train_data_features, y_train),
    ("testing", test_data_features, y_test),
):
    print(f"Simple LogisticRegression {split_label} accuracy is: {lr.score(features, labels)}")

Simple LogisticRegression training accuracy is: 0.8259776774303484
Simple LogisticRegression testing accuracy is: 0.7986581469648563


## **Gridsearched Count Vectorizer for Logistic Regression and Naive Bayes**

In [61]:
# Two pipelines that share the same front end (a word-level CountVectorizer named
# 'cvec') and differ only in the classifier: logistic regression vs. naive Bayes.
def _cvec_pipeline(step_name, estimator):
    """Chain a fresh word-level CountVectorizer ('cvec') with the given estimator."""
    return Pipeline([
        ('cvec', CountVectorizer(analyzer='word')),
        (step_name, estimator),
    ])


pipe_cvec_lr = _cvec_pipeline('lr', LogisticRegression(solver='liblinear'))
pipe_cvec_nb = _cvec_pipeline('nb', MultinomialNB())

In [62]:
# CountVectorizer hyperparameters searched for both pipelines; the 'cvec__' prefix
# routes each setting to the pipeline step named 'cvec'. The grid covers the
# vocabulary cap, the minimum document frequency, and unigrams vs. unigrams+bigrams.
# 'cvec__max_df' ([.90, .95]) is currently left out of the search.
pipe_params = dict(
    cvec__max_features=[5000, 10_000, 15_000, 20_000],
    cvec__min_df=[2, 3],
    cvec__ngram_range=[(1, 1), (1, 2)],
)

In [63]:
# One grid search per pipeline (logistic regression first, naive Bayes second).
# Both optimise over the shared `pipe_params` grid with 5-fold cross-validation.
gs_cvec_lr, gs_cvec_nb = (
    GridSearchCV(pipeline, param_grid=pipe_params, cv=5)
    for pipeline in (pipe_cvec_lr, pipe_cvec_nb)
)

In [64]:
gs_cvec_lr.fit(clean_train_titles, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [65]:
gs_cvec_nb.fit(clean_train_titles, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [49]:
gs_cvec_lr.best_params_

{'cvec__max_features': 5000, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 1)}

In [66]:
# Best cross-validated score found by each search: logistic regression, then naive Bayes.
for fitted_search in (gs_cvec_lr, gs_cvec_nb):
    print(fitted_search.best_score_)

0.8095126907963696
0.8126863795362007


In [67]:
# Accuracy of the grid-searched logistic regression pipeline on the training titles,
# then on the held-out test titles.
for split_label, titles, labels in (
    ("training", clean_train_titles, y_train),
    ("testing", clean_test_titles, y_test),
):
    print(f"LogisticRegression with Gridsearch {split_label} accuracy is: {gs_cvec_lr.score(titles, labels)}")

LogisticRegression with Gridsearch training accuracy is: 0.8865127374968049
LogisticRegression with Gridsearch testing accuracy is: 0.8090095846645368


In [68]:
# Accuracy of the grid-searched naive Bayes pipeline on the training titles, then on
# the held-out test titles. The second message previously also said "training" even
# though it reports the score on clean_test_titles / y_test; the label is now correct.
print(f"Naive Bayes with Gridsearch training accuracy is: {gs_cvec_nb.score(clean_train_titles, y_train)}")
print(f"Naive Bayes with Gridsearch testing accuracy is: {gs_cvec_nb.score(clean_test_titles, y_test)}")

Naive Bayes with Gridsearch training accuracy is: 0.8427835051546392
Naive Bayes with Gridsearch training accuracy is: 0.8133546325878594


## **Scores**

Simple LogisticRegression training accuracy is: 0.8256694066899976  
Simple LogisticRegression testing accuracy is: 0.8057296329453895  

**Max_Features --> 2000, 3000, 4000, 5000:**   
LogisticRegression with Gridsearch training accuracy is: 0.8475895933371314    
LogisticRegression with Gridsearch testing accuracy is: 0.8102872955155855    


Naive Bayes with Gridsearch training accuracy is: 0.816391307886384  
Naive Bayes with Gridsearch testing accuracy is: 0.8040205094815659  

**Max_Features --> 5000, 10_000, 15_000, 20_000:**  
LogisticRegression with Gridsearch training accuracy is: 0.8865127374968049  
LogisticRegression with Gridsearch testing accuracy is: 0.8090095846645368  

Naive Bayes with Gridsearch training accuracy is: 0.8427835051546392  
Naive Bayes with Gridsearch testing accuracy is: 0.8133546325878594  