## **Import Libraries**

In [140]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup       
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

## **Import Dataframes**

In [95]:
data_ai = pd.read_csv('./data/data_ai.csv')
data_ml = pd.read_csv('./data/data_ml.csv')

In [96]:
data_ai.head(1)

Unnamed: 0,subreddit,title,selftext
0,artificial,Could AI ethics draw on non-Western philosophi...,


In [97]:
data_ml.head(1)

Unnamed: 0,subreddit,title,selftext
0,MachineLearning,[R] Taming pretrained transformers for eXtreme...,New X-Transformer model from Amazon Research\n...


In [98]:
print(f'The shape of the AI dataframe is {data_ai.shape}')
print(f'The shape of the ML dataframe is {data_ml.shape}')

The shape of the AI dataframe is (31299, 3)
The shape of the ML dataframe is (31299, 3)


## **Merge the Data**

In [99]:
df = data_ai.append(data_ml).reset_index()

In [100]:
df.drop(columns='index',inplace=True)

In [101]:
print(f'The shape of the merged AI and ML dataframes are {df.shape}')

The shape of the merged AI and ML dataframes are (62598, 3)


In [102]:
df.isnull().sum()

subreddit        0
title            0
selftext     31046
dtype: int64

**Let's see what a title might look like:**

In [103]:
df['title'][0]

'Could AI ethics draw on non-Western philosophies to help reframe AI ethics'

**Let's see what a selftext might look like:**

In [104]:
df['selftext'][34]

'I need to learn AI to make a project. Because of corona virus, our internal exams are probably not happening, so our teacher has decided to give high amount of marks to AI project.\n\nHow can I learn AI to build projects? I am saying university explicitly, because we have lots of other tasks to do in university as well, so please keep that in mind while suggesting me anything.'

## **Word Cleaning**

In [105]:
# We want only rows where their is text for title and self text
df = df.dropna().reset_index()
df = df.drop(columns='index')

In [108]:
df

Unnamed: 0,subreddit,title,selftext
0,artificial,[R] Using Deep RL to Model Human Locomotion Co...,In the new paper [*Deep Reinforcement Learning...
1,artificial,Questions about MSc CS at Oxford/MPhil Advance...,Hello everyone.\n\n I'm a 2020 graduate in Bac...
2,artificial,One more AI assistant product idea,Hi everyone!\n\nLooking for community feedbac...
3,artificial,NER For Document Digitization,The problem is quite simple to define. How do ...
4,artificial,Top 15 Highest Paying Jobs in India In 2020,&amp;#x200B;\n\n# Top 15 Highest Paying Jobs i...
...,...,...,...
31543,MachineLearning,[D]HELP in machine learning,I am so much confused and tired for looking on...
31546,MachineLearning,Request and sell data on our new Data Market,We've run a community for anyone interested in...
31548,MachineLearning,[D] Does anyone created a formal database for ...,I'm looking for a database that has sufficient...
31549,MachineLearning,"[P] Demo of ""Arbitrary Style Transfer with Sty...",Hi MachineLearning\n\nI'll introduce awsome st...


In [109]:
# Get rid of all of the [removed] amd [deleted] values in 'selftext'
df = df[(df['selftext'] != '[removed]') & (df['selftext'] != '[deleted]')]

In [110]:
# Check the balance of the classes these are unbalanced
df['subreddit'].value_counts()

MachineLearning    10528
artificial          7053
Name: subreddit, dtype: int64

In [111]:
# Balance the classes
df = df[:-3475]

In [112]:
# They are now balanced
df['subreddit'].value_counts()

artificial         7053
MachineLearning    7053
Name: subreddit, dtype: int64

In [113]:
# Reset the index
df = df.dropna().reset_index()
df = df.drop(columns='index')

In [115]:
df

Unnamed: 0,subreddit,title,selftext
0,artificial,[R] Using Deep RL to Model Human Locomotion Co...,In the new paper [*Deep Reinforcement Learning...
1,artificial,Questions about MSc CS at Oxford/MPhil Advance...,Hello everyone.\n\n I'm a 2020 graduate in Bac...
2,artificial,One more AI assistant product idea,Hi everyone!\n\nLooking for community feedbac...
3,artificial,NER For Document Digitization,The problem is quite simple to define. How do ...
4,artificial,Top 15 Highest Paying Jobs in India In 2020,&amp;#x200B;\n\n# Top 15 Highest Paying Jobs i...
...,...,...,...
14101,MachineLearning,[D] Five major deep learning papers by Geoff H...,still milking Jurgen's very dense [inaugural t...
14102,MachineLearning,[Project]Recommender web app for short stories,Developed a bare-bone web app which helps in r...
14103,MachineLearning,[D] Can Recurrent Neural Networks have loops t...,[https://youtu.be/oJNHXPs0XDk?t=333](https://y...
14104,MachineLearning,Can someone shed light on Adversarial and Gene...,I am new to graph learning. Spend a couple of ...


In [116]:
# TEXT CLEANING FUNCTION FOR EVERY POST IN BOTH SUBREDDITS

# These will be replaced by a space ' ' 
symbol_replace_space = re.compile('[/(){}\[\]\|@,;]')

# We will get rid of all these in the function below
bad_symbols = re.compile('[^0-9a-z #+_]')

# We will get rid of all of the stopwords
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):

    # Make all of the text lower case
    text = text.lower() 

    # Replace symbol_replace_space with a space 
    text = symbol_replace_space.sub(' ', text)
    
    # Replace bad_symbols with a space
    text = bad_symbols.sub('', text) 
    
    # This gets rid of the integers
    text = re.sub(r'\d+', '', text) 

    # remove stopwords from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 

    return text

# Applying the clean_text function above to every title in df['title']
df['title'] = df['title'].apply(clean_text)
df['selftext'] = df['selftext'].apply(clean_text)

In [117]:
df

Unnamed: 0,subreddit,title,selftext
0,artificial,r using deep rl model human locomotion control...,new paper deep reinforcement learning modeling...
1,artificial,questions msc cs oxford mphil advanced cs camb...,hello everyone im graduate bachelors degree ti...
2,artificial,one ai assistant product idea,hi everyonelooking community feedback trying c...
3,artificial,ner document digitization,problem quite simple define automate make fast...
4,artificial,top highest paying jobs india,amp #xb # top highest paying jobs india https ...
...,...,...,...
14101,MachineLearning,five major deep learning papers geoff hinton c...,still milking jurgens dense inaugural tweet ht...
14102,MachineLearning,project recommender web app short stories,developed barebone web app helps reading short...
14103,MachineLearning,recurrent neural networks loops go backward,https youtube ojnhxpsxdkt https youtube ojnhxp...
14104,MachineLearning,someone shed light adversarial generative grap...,new graph learning spend couple hours adversar...


**We need to concat the title and selftext so we can put it into one columns so we can count vectorize**
- https://stackoverflow.com/questions/34710281/use-featureunion-in-scikit-learn-to-combine-two-pandas-columns-for-tfidf

In [118]:
df['title_selftext'] = df['title'] + ' ' + df['selftext']

In [127]:
df.head()

Unnamed: 0,subreddit,title,selftext,title_selftext
0,artificial,r using deep rl model human locomotion control...,new paper deep reinforcement learning modeling...,r using deep rl model human locomotion control...
1,artificial,questions msc cs oxford mphil advanced cs camb...,hello everyone im graduate bachelors degree ti...,questions msc cs oxford mphil advanced cs camb...
2,artificial,one ai assistant product idea,hi everyonelooking community feedback trying c...,one ai assistant product idea hi everyonelooki...
3,artificial,ner document digitization,problem quite simple define automate make fast...,ner document digitization problem quite simple...
4,artificial,top highest paying jobs india,amp #xb # top highest paying jobs india https ...,top highest paying jobs india amp #xb # top hi...


## **Train/Test Split**

In [128]:
X = df[['title_selftext']]
y = df['subreddit']

In [129]:
X.shape

(14106, 1)

In [130]:
y.shape

(14106,)

In [205]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = .65, stratify = y, random_state=42)

In [206]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(9168, 1)
(4938, 1)
(9168,)
(4938,)


In [207]:
y_train.value_counts()

MachineLearning    4584
artificial         4584
Name: subreddit, dtype: int64

In [208]:
y_test.value_counts()

MachineLearning    2469
artificial         2469
Name: subreddit, dtype: int64

## **Count Vectorizer**

In [223]:
# Instantiate the "CountVectorizer" object, which is scikit-learn's bag of words tool
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 1000,
                             )

In [224]:
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.

train_data_features = vectorizer.fit_transform(X_train['title_selftext'])

test_data_features = vectorizer.transform(X_test['title_selftext'])

# Numpy arrays are easy to work with, so convert the result to an array.
train_data_features = train_data_features.toarray()

In [225]:
train_data_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## **LogisticRegression**

In [226]:
# Instantiate the Logistic Regression
lr = LogisticRegression(solver='liblinear')

In [227]:
lr.fit(train_data_features, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [228]:
lr.score(train_data_features, y_train)

0.8699825479930192

In [222]:
lr.score(test_data_features, y_test)

0.8110571081409478

### **Model 1**

Model one LASSO Logistic Regression with alpha = 10 gets a train score of .83 and a test score of .82