## **Import Libraries**

In [55]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup       
from nltk.corpus import stopwords

## **Import Dataframes**

In [87]:
data_ai = pd.read_csv('../project_3-master/data/data_ai.csv')
data_ml = pd.read_csv('../project_3-master/data/data_ml.csv')

In [88]:
data_ai.head()

Unnamed: 0,subreddit,title,selftext
0,ArtificialInteligence,How MSMEs Manipulates Marketing Strategies for...,
1,ArtificialInteligence,Digital marketing trends that Paves the Way of...,
2,ArtificialInteligence,How to Boost Your Team’s Performance and Produ...,
3,ArtificialInteligence,How is Artificial Intelligence Bringing Pivota...,
4,ArtificialInteligence,Very promising and developing project. Modern ...,


## **Merge the Data**

In [92]:
df = data_ai.append(data_ml).reset_index()

In [95]:
df.drop(columns='index',inplace=True)

In [97]:
df.head()

Unnamed: 0,subreddit,title,selftext
0,ArtificialInteligence,How MSMEs Manipulates Marketing Strategies for...,
1,ArtificialInteligence,Digital marketing trends that Paves the Way of...,
2,ArtificialInteligence,How to Boost Your Team’s Performance and Produ...,
3,ArtificialInteligence,How is Artificial Intelligence Bringing Pivota...,
4,ArtificialInteligence,Very promising and developing project. Modern ...,


In [99]:
df.isnull().sum()

subreddit        0
title            0
selftext     13652
dtype: int64

**Let's see what a title might look like:**

In [100]:
df['title'][0]

'How MSMEs Manipulates Marketing Strategies for Success https://onpassive.pt/how-msmes-manipulates-marketing-strategies-for-success/?feed_id=14768&amp;_unique_id=5f3a974c04f3e'

## **Train/Test Split**

In [101]:
X = df[['title']]
y = df['subreddit']

In [102]:
X.head(1)

Unnamed: 0,title
0,How MSMEs Manipulates Marketing Strategies for...


In [103]:
X.shape

(27986, 1)

In [104]:
y.shape

(27986,)

In [105]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [106]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(20989, 1)
(6997, 1)
(20989,)
(6997,)


In [107]:
X_train['title']

8909     How Artificial Intelligence Has Transformed Ba...
18689                                      NLP Theoretical
20762            [D] Help me choosing the best valued GPU.
18314    Machine Learning in Materials Modeling -- Fund...
16651    Perceptron Learning Algorithm Explained in Detail
                               ...                        
17787                  MLPs to Find Extrema of Functionals
21793    [R] Speeding Up Neural Network Training with D...
7063           Filter Out Your Data With Today's Simple AI
16868    [R] Style-Controllable Speech-Driven Gesture S...
22904    Is there a Python version of Dr. Koller's Prob...
Name: title, Length: 20989, dtype: object

## **Function for Cleaning**

In [118]:
def review_to_words(raw_review):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    
    # 1. Remove HTML.
    review_text = raw_review
#     BeautifulSoup(raw_review).get_text()
    
    # 2. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    
    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()
    
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set.
    stops = set(stopwords.words('english'))
    
    # 5. Remove stop words.
    meaningful_words = [w for w in words if not w in stops]
    
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return(" ".join(meaningful_words))

In [119]:
# Get the number of reviews based on the dataframe size.
total_titles = df.shape[0]
print(f'There are {total_titles} titles.')

# Initialize an empty list to hold the clean reviews.
clean_train_titles = []
clean_test_titles = []

There are 27986 titles.


In [120]:
print("Cleaning and parsing the training set for titles...")

j = 0

for train_title in X_train['title']:
    # Convert review to words, then append to clean_train_reviews.
    clean_train_titles.append(review_to_words(train_title))
    
    # If the index is divisible by 1000, print a message
    if (j + 1) % 1000 == 0:
        print(f'Review {j + 1} of {total_titles}.')
    
    j += 1

# Let's do the same for our testing set.

print("Cleaning and parsing the testing set for titles...")

for test_title in X_test['title']:
    # Convert review to words, then append to clean_train_reviews.
    clean_test_titles.append(review_to_words(test_title))
    
    # If the index is divisible by 1000, print a message
    if (j + 1) % 1000 == 0:
        print(f'Review {j + 1} of {total_titles}.')
        
    j += 1

Cleaning and parsing the training set for titles...
Review 1000 of 27986.
Review 2000 of 27986.
Review 3000 of 27986.
Review 4000 of 27986.
Review 5000 of 27986.
Review 6000 of 27986.
Review 7000 of 27986.
Review 8000 of 27986.
Review 9000 of 27986.
Review 10000 of 27986.
Review 11000 of 27986.
Review 12000 of 27986.
Review 13000 of 27986.
Review 14000 of 27986.
Review 15000 of 27986.
Review 16000 of 27986.
Review 17000 of 27986.
Review 18000 of 27986.
Review 19000 of 27986.
Review 20000 of 27986.
Cleaning and parsing the testing set for titles...
Review 21000 of 27986.
Review 22000 of 27986.
Review 23000 of 27986.
Review 24000 of 27986.
Review 25000 of 27986.
Review 26000 of 27986.
Review 27000 of 27986.


In [121]:
len(clean_train_titles)

20989

In [122]:
len(clean_test_titles)

6997

In [127]:
from sklearn.feature_extraction.text import CountVectorizer

In [129]:
# Instantiate the "CountVectorizer" object, which is scikit-learn's bag of words tool
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 5000)

In [130]:
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.

train_data_features = vectorizer.fit_transform(clean_train_titles)

test_data_features = vectorizer.transform(clean_test_titles)

# Numpy arrays are easy to work with, so convert the result to an 
# array.
train_data_features = train_data_features.toarray()

In [131]:
print(train_data_features.shape)

(20989, 5000)


In [132]:
print(test_data_features.shape)

(6997, 5000)


In [133]:
train_data_features[0:6]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [134]:
vocab = vectorizer.get_feature_names()
print(vocab)



In [155]:
# Import logistic regression.

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', solver = 'liblinear', C = .10)

In [156]:
lr.fit(train_data_features, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
lr.

In [157]:
lr.score(train_data_features, y_train)

0.8393920625089333

In [158]:
lr.score(test_data_features, y_test)

0.8200657424610548

In [154]:
lr.coef_

array([[-0.00563485,  0.00382158, -0.65031521, ..., -0.35071622,
         0.74403866, -0.62460665]])

### **Model 1**

Model one LASSO Logistic Regression with alpha = 10 gets a train score of .83 and a test score of .82