## **Import Libraries**

In [95]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup       
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words

## **Import Dataframes**

In [2]:
data_ai = pd.read_csv('../project_3-master/data/data_ai.csv')
data_ml = pd.read_csv('../project_3-master/data/data_ml.csv')

In [3]:
data_ai.head()

Unnamed: 0,subreddit,title,selftext
0,ArtificialInteligence,How MSMEs Manipulates Marketing Strategies for...,
1,ArtificialInteligence,Digital marketing trends that Paves the Way of...,
2,ArtificialInteligence,How to Boost Your Team’s Performance and Produ...,
3,ArtificialInteligence,How is Artificial Intelligence Bringing Pivota...,
4,ArtificialInteligence,Very promising and developing project. Modern ...,


## **Merge the Data**

In [4]:
df = data_ai.append(data_ml).reset_index()

In [5]:
df.drop(columns='index',inplace=True)

In [6]:
df.head()

Unnamed: 0,subreddit,title,selftext
0,ArtificialInteligence,How MSMEs Manipulates Marketing Strategies for...,
1,ArtificialInteligence,Digital marketing trends that Paves the Way of...,
2,ArtificialInteligence,How to Boost Your Team’s Performance and Produ...,
3,ArtificialInteligence,How is Artificial Intelligence Bringing Pivota...,
4,ArtificialInteligence,Very promising and developing project. Modern ...,


In [7]:
df.isnull().sum()

subreddit        0
title            0
selftext     13652
dtype: int64

**Let's see what a title might look like:**

In [8]:
df['title'][0]

'How MSMEs Manipulates Marketing Strategies for Success https://onpassive.pt/how-msmes-manipulates-marketing-strategies-for-success/?feed_id=14768&amp;_unique_id=5f3a974c04f3e'

In [11]:
df['selftext'][0]

nan

In [26]:
df['selftext'] = df['selftext'].fillna('None')

In [30]:
df['selftext'].isna().sum()

0

## **Train/Test Split**

In [31]:
X = df[['title', 'selftext']]
y = df['subreddit']

In [32]:
X.head(1)

Unnamed: 0,title,selftext
0,How MSMEs Manipulates Marketing Strategies for...,


In [33]:
X.shape

(27986, 2)

In [34]:
y.shape

(27986,)

In [35]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [36]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(20989, 2)
(6997, 2)
(20989,)
(6997,)


In [37]:
X_train['title']

8909     How Artificial Intelligence Has Transformed Ba...
18689                                      NLP Theoretical
20762            [D] Help me choosing the best valued GPU.
18314    Machine Learning in Materials Modeling -- Fund...
16651    Perceptron Learning Algorithm Explained in Detail
                               ...                        
17787                  MLPs to Find Extrema of Functionals
21793    [R] Speeding Up Neural Network Training with D...
7063           Filter Out Your Data With Today's Simple AI
16868    [R] Style-Controllable Speech-Driven Gesture S...
22904    Is there a Python version of Dr. Koller's Prob...
Name: title, Length: 20989, dtype: object

In [38]:
X_train['selftext']

8909                                                  None
18689                                                 None
20762    I'm a newbie in ML, DL. Using an old laptop to...
18314                                            [removed]
16651                                            [deleted]
                               ...                        
17787                                                 None
21793    Abstract:\n\nIn the twilight of Moore's law, G...
7063     [https://todayssimpleai.blogspot.com/2019/10/t...
16868                                                 None
22904                                            [removed]
Name: selftext, Length: 20989, dtype: object

## **Function for Cleaning**

In [47]:
def review_to_words(raw_review):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    
    # 1. Remove HTML.
    review_text = raw_review
#     BeautifulSoup(raw_review).get_text()
    
    # 2. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    
    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()
    
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set.
    stops = set(stopwords.words('english'))
    
    # 5. Remove stop words.
    meaningful_words = [w for w in words if not w in stops]
    
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return(" ".join(meaningful_words))

In [48]:
# Get the number of reviews based on the dataframe size.
total_titles = df.shape[0]
print(f'There are {total_titles} titles.')

# Initialize an empty list to hold the clean titles.
clean_train_titles = []
clean_test_titles = []

# Initialize an empty list to hold the clean titles.
clean_train_selftext = []
clean_test_selftext = []

There are 27986 titles.


In [49]:
print("Cleaning and parsing the training set for titles...")

j = 0

for train_title in X_train['title']:
    # Convert review to words, then append to clean_train_reviews.
    clean_train_titles.append(review_to_words(train_title))
    
    # If the index is divisible by 1000, print a message
    if (j + 1) % 1000 == 0:
        print(f'Review {(j + 1) / 2} of {total_titles}.')
    
    j += 1

# Let's do the same for our testing set.

print("Cleaning and parsing the testing set for titles...")

for test_title in X_test['title']:
    # Convert review to words, then append to clean_train_reviews.
    clean_test_titles.append(review_to_words(test_title))
    
    # If the index is divisible by 1000, print a message
    if (j + 1) % 1000 == 0:
        print(f'Review {(j + 1) / 2} of {total_titles}.')
        
    j += 1
    
# SELFTEXT
print("Cleaning and parsing the training set for selftext...")
    
for train_selftext in X_train['selftext']:
    # Convert review to words, then append to clean_train_reviews.
    clean_train_selftext.append(review_to_words(train_selftext))
    
    # If the index is divisible by 1000, print a message
    if (j + 1) % 1000 == 0:
        print(f'Review {(j + 1) / 2} of {total_titles}.')
    
    j += 1

    
#    Testing 
print("Cleaning and parsing the testing set for selftext...")

for test_selftext in X_test['selftext']:
    # Convert review to words, then append to clean_train_reviews.
    clean_test_selftext.append(review_to_words(test_selftext))
    
    # If the index is divisible by 1000, print a message
    if (j + 1) % 1000 == 0:
        print(f'Review {(j + 1) / 2} of {total_titles}.')
        
    j += 1

Cleaning and parsing the training set for titles...
Review 500.0 of 27986.
Review 1000.0 of 27986.
Review 1500.0 of 27986.
Review 2000.0 of 27986.
Review 2500.0 of 27986.
Review 3000.0 of 27986.
Review 3500.0 of 27986.
Review 4000.0 of 27986.
Review 4500.0 of 27986.
Review 5000.0 of 27986.
Review 5500.0 of 27986.
Review 6000.0 of 27986.
Review 6500.0 of 27986.
Review 7000.0 of 27986.
Review 7500.0 of 27986.
Review 8000.0 of 27986.
Review 8500.0 of 27986.
Review 9000.0 of 27986.
Review 9500.0 of 27986.
Review 10000.0 of 27986.
Cleaning and parsing the testing set for titles...
Review 10500.0 of 27986.
Review 11000.0 of 27986.
Review 11500.0 of 27986.
Review 12000.0 of 27986.
Review 12500.0 of 27986.
Review 13000.0 of 27986.
Review 13500.0 of 27986.
Cleaning and parsing the training set for selftext...
Review 14000.0 of 27986.
Review 14500.0 of 27986.
Review 15000.0 of 27986.
Review 15500.0 of 27986.
Review 16000.0 of 27986.
Review 16500.0 of 27986.
Review 17000.0 of 27986.
Review 17500.

In [61]:
clean_train_titles[:10]

['artificial intelligence transformed banking',
 'nlp theoretical',
 'help choosing best valued gpu',
 'machine learning materials modeling fundamentals opportunities materials',
 'perceptron learning algorithm explained detail',
 'integration machine learning amp artificial intelligence data analytics',
 'paper explained object centric learning slot attention full video analysis',
 'largest country africa',
 'discussion deoldify says use gans anymore',
 'augmented virtual reality development ar vr app solutions india']

In [51]:
len(clean_test_titles)

6997

In [52]:
len(clean_train_selftext)

20989

In [53]:
len(clean_test_selftext)

6997

In [74]:
type(clean_train_titles)

list

In [75]:
train_total = [clean_train_titles, clean_train_selftext]
test_total = [clean_test_titles, clean_test_selftext]

In [None]:
for n in train_total:

In [76]:
type(train_total)

tuple

In [80]:
from sklearn.feature_extraction.text import CountVectorizer

In [103]:
# Instantiate the "CountVectorizer" object, which is scikit-learn's bag of words tool
cvec = CountVectorizer(max_features=1000, lowercase=True, stop_words='english')

In [104]:
for n in train_total:
    cvec.fit(n)

In [105]:
X_train = cvec.transform(n)
print(type(X_train))
print(X_train)

<class 'scipy.sparse.csr.csr_matrix'>
  (2, 100)	2
  (2, 115)	1
  (2, 259)	1
  (2, 352)	1
  (2, 376)	1
  (2, 566)	1
  (2, 571)	1
  (2, 575)	1
  (2, 590)	1
  (2, 605)	1
  (2, 805)	1
  (2, 855)	1
  (2, 871)	1
  (2, 910)	1
  (2, 913)	1
  (2, 945)	1
  (2, 981)	1
  (3, 735)	1
  (4, 230)	1
  (6, 1)	1
  (6, 2)	2
  (6, 32)	1
  (6, 38)	4
  (6, 57)	1
  (6, 64)	2
  :	:
  (20985, 588)	2
  (20985, 613)	1
  (20985, 614)	1
  (20985, 617)	1
  (20985, 622)	2
  (20985, 634)	1
  (20985, 640)	3
  (20985, 654)	1
  (20985, 666)	1
  (20985, 713)	1
  (20985, 762)	1
  (20985, 901)	2
  (20985, 910)	1
  (20985, 915)	6
  (20985, 940)	1
  (20985, 945)	1
  (20985, 953)	3
  (20986, 63)	2
  (20986, 99)	2
  (20986, 158)	2
  (20986, 414)	2
  (20986, 416)	2
  (20986, 458)	2
  (20986, 906)	2
  (20988, 735)	1


In [106]:
# Convert X_train into a DataFrame
X_train_df = pd.DataFrame(X_train.toarray(),
                   columns=cvec.get_feature_names())
X_train_df

Unnamed: 0,ability,able,abs,abstract,academic,accepted,access,according,account,accuracy,...,written,wrong,wrote,www,year,years,yes,youtu,youtube,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20984,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20985,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20987,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
# Transform test
for n in test_total:
    X_test = cvec.transform(n)
    X_test_df = pd.DataFrame(X_test.toarray(), columns=cvec.get_feature_names())
    
X_test_df

Unnamed: 0,ability,able,abs,abstract,academic,accepted,access,according,account,accuracy,...,written,wrong,wrote,www,year,years,yes,youtu,youtube,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6992,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6993,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
# Import logistic regression.

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', solver = 'liblinear', C = .10)

In [109]:
lr.fit(X_train, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [110]:
lr.score(X_train, y_train)

0.8191910048120444

In [111]:
lr.score(X_test, y_test)

0.7986279834214663

In [114]:
lr.coef_[0][1]

-0.12032121941389032

### **Model 1**

Model one LASSO Logistic Regression with alpha = 10 gets a train score of .83 and a test score of .82