## Importing libraries and reading in data

In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
#from sklearn.neighbors import KNeighborsClassifier

import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

%matplotlib inline

In [51]:
# Load the combined posts from a CSV in the working directory; the frame shown
# below has `selftext`, `subreddit` and `title` columns.
reddit_posts_df = pd.read_csv('combinedData.csv')
reddit_posts_df

Unnamed: 0,selftext,subreddit,title
0,"""It's A Little Fit Bunny.""",1,Elton John has bought his pet rabbit a treadmill.
1,A Perfect Urkel,1,OK hear me out. Jaleel White. Maynard James Ke...
2,"""How do you determine whether or not an older ...",1,THE BATHTUB TEST: During a visit to my doctors...
3,A Luigi board!,1,How does the Italian plumber talk to spirits?
4,A quack-job.,1,What do you call a blowjob from someone preten...
...,...,...,...
3468,"Without missing a beat, she replies with ""Buga...",0,Proud of my wife for this one. I'm playing a n...
3469,\n\n\n\n\n\n\nYee-Hanukkah\n\n\nEdit: I showed...,0,From my 10 year old daughter… What does a Jewi...
3470,‘I knead it for work.’,0,Baker to wife: ‘can you pick up some flour?’
3471,Wet.,0,What do you get when you cross a like and a ri...


### Creating a features list with text and title (`X`) and the subreddit as the target (`y`)

In [52]:
# The two free-text columns are the model inputs; the subreddit label is the target.
features = ['selftext', 'title']
X = reddit_posts_df.loc[:, features]
y = reddit_posts_df['subreddit']

In [53]:
# Preview the first rows of the feature frame (selftext and title only).
X.head()

Unnamed: 0,selftext,title
0,"""It's A Little Fit Bunny.""",Elton John has bought his pet rabbit a treadmill.
1,A Perfect Urkel,OK hear me out. Jaleel White. Maynard James Ke...
2,"""How do you determine whether or not an older ...",THE BATHTUB TEST: During a visit to my doctors...
3,A Luigi board!,How does the Italian plumber talk to spirits?
4,A quack-job.,What do you call a blowjob from someone preten...


In [84]:
# Hold out 20% of the posts for testing. Stratifying on the target keeps the
# class proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=5,
)

## Cleaning data
### Using a count vectorizer to remove English stop words, strip accents (ASCII folding) and drop rarely occurring n-grams from the text/title data

In [85]:
# Both vectorizers drop English stop words and fold accented characters to ASCII.
# A float `min_df` is a proportion of documents: a term must appear in at least
# 3% of self-texts (1% of titles) to be kept in the vocabulary.
# (original author note on the text vectorizer: "0.20,0.95" - presumably other
# document-frequency thresholds that were tried; confirm before relying on it)
cv_text = CountVectorizer(
    stop_words='english',
    strip_accents='ascii',
    ngram_range=(1, 6),
    min_df=.03,
)
cv_title = CountVectorizer(
    stop_words='english',
    strip_accents='ascii',
    ngram_range=(1, 3),
    min_df=.01,
)

### Splitting data further into separate text and title dataframes

In [86]:
# Missing values must be replaced with an empty string before vectorizing.
# Casting with `.values.astype('U')` turns NaN into the literal string 'nan',
# which the vectorizer would then count as a genuine token. The trailing
# `.astype(str)` guards against any stray non-string cell. The title column
# gets the same treatment so both inputs are handled consistently.
#
# The vocabulary is learned from the training split only; the test split is
# transformed with that fitted vocabulary so no test information leaks in.
X_train_text = cv_text.fit_transform(X_train.selftext.fillna('').astype(str))
X_train_title = cv_title.fit_transform(X_train.title.fillna('').astype(str))

X_test_text = cv_text.transform(X_test.selftext.fillna('').astype(str))
X_test_title = cv_title.transform(X_test.title.fillna('').astype(str))

In [87]:
# Dense frame of training self-text counts. Each column is a vocabulary term
# suffixed with its source column so it cannot collide with a title term.
# NOTE: `get_feature_names()` was removed in scikit-learn 1.2; on newer
# versions use `get_feature_names_out()` instead.
selftext_columns = [f'{term}_selftext' for term in cv_text.get_feature_names()]
X_train_text_df = pd.DataFrame(X_train_text.toarray(), columns=selftext_columns)
X_train_text_df.shape

(2778, 14)

In [88]:
# Dense frame of training title counts, with columns suffixed `_title`.
# NOTE: `get_feature_names()` was removed in scikit-learn 1.2; on newer
# versions use `get_feature_names_out()` instead.
title_columns = [f'{term}_title' for term in cv_title.get_feature_names()]
X_train_title_df = pd.DataFrame(X_train_title.toarray(), columns=title_columns)
X_train_title_df.shape

(2778, 46)

In [89]:
# Dense frame of test self-text counts, using the vocabulary fitted on the
# training split so the columns match the training frame.
# NOTE: `get_feature_names()` was removed in scikit-learn 1.2; on newer
# versions use `get_feature_names_out()` instead.
selftext_columns = [f'{term}_selftext' for term in cv_text.get_feature_names()]
X_test_text_df = pd.DataFrame(X_test_text.toarray(), columns=selftext_columns)
X_test_text_df.shape

(695, 14)

In [90]:
# Dense frame of test title counts, using the vocabulary fitted on the
# training split so the columns match the training frame.
# NOTE: `get_feature_names()` was removed in scikit-learn 1.2; on newer
# versions use `get_feature_names_out()` instead.
title_columns = [f'{term}_title' for term in cv_title.get_feature_names()]
X_test_title_df = pd.DataFrame(X_test_title.toarray(), columns=title_columns)
X_test_title_df.shape

(695, 46)

### Concatenating the text and title features again, for both the training and testing data

In [92]:
# Join the self-text and title count features column-wise. Both frames in each
# pair were built without an explicit index, so their default RangeIndex values
# line the rows up by position.
vecced_train_reddit_posts = pd.concat(
    (X_train_text_df, X_train_title_df),
    axis='columns',
)
vecced_test_reddit_posts = pd.concat(
    (X_test_text_df, X_test_title_df),
    axis='columns',
)
vecced_train_reddit_posts.shape

(2778, 60)

In [93]:
# Sanity check: total number of missing cells across the whole training
# feature matrix (expected to be 0).
vecced_train_reddit_posts.isna().sum().sum()

0

## Logistic regression model

In [94]:
def run_the_lr_models(model):
    """Grid-search a logistic regression and print how it performs.

    The function reads the notebook-level `vecced_train_reddit_posts`,
    `vecced_test_reddit_posts`, `y_train` and `y_test`, runs a 5-fold
    `GridSearchCV` over the hyper-parameter grid selected by `model`, and
    prints the train score, test score, test confusion matrix and the best
    parameters found. Nothing is returned.

    Parameters
    ----------
    model : str
        Which grid to search:
        'lr_1' - L1 penalty with the liblinear solver;
        'lr_2' - L2 penalty with the lbfgs and liblinear solvers.

    Raises
    ------
    ValueError
        If `model` is not one of the keys above. Previously an unknown key
        only printed a message and then crashed on an unbound variable.
    """
    # Settings shared by every grid; only the penalty/solver pairing differs.
    shared_params = {
        'C': [1, 1.5, 2, 2.5],
        'class_weight': ['balanced'],
        'warm_start': [True, False],
        'random_state': [5],
    }
    param_grids = {
        'lr_1': {**shared_params,
                 'penalty': ['l1'],
                 'solver': ['liblinear']},
        'lr_2': {**shared_params,
                 'penalty': ['l2'],
                 'solver': ['lbfgs', 'liblinear']},
    }

    # Validate before doing any work so a typo fails fast with a clear message.
    if model not in param_grids:
        raise ValueError(
            f'Unknown model {model!r}; expected one of {sorted(param_grids)}.'
        )

    grid_search = GridSearchCV(LogisticRegression(),
                               param_grids[model],
                               cv = 5,
                               verbose = 1,
                               n_jobs = -1)

    # `.values` passes plain arrays, so rows are matched to the targets by
    # position rather than by index label.
    train_matrix = vecced_train_reddit_posts.values
    test_matrix = vecced_test_reddit_posts.values

    grid_search.fit(train_matrix, y_train)

    print(f'Train score = {grid_search.score(train_matrix, y_train)}')
    print(f'Test score = {grid_search.score(test_matrix, y_test)}')

    predictions = grid_search.predict(test_matrix)
    print('--------')
    print(confusion_matrix(y_test, predictions))
    print(f'Best params = {grid_search.best_params_}')


In [95]:
# Grid search over the L1-penalised logistic regression (liblinear solver).
run_the_lr_models('lr_1')

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Train score = 0.6166306695464363
Test score = 0.6172661870503597
--------
[[287  62]
 [204 142]]
Best params = {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1', 'random_state': 5, 'solver': 'liblinear', 'warm_start': True}


In [96]:
# Grid search over the L2-penalised logistic regression (lbfgs and liblinear solvers).
run_the_lr_models('lr_2')

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Train score = 0.6184305255579554
Test score = 0.6187050359712231
--------
[[281  68]
 [197 149]]
Best params = {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2', 'random_state': 5, 'solver': 'lbfgs', 'warm_start': True}
