In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.neighbors import KNeighborsClassifier
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_selection import chi2
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier


import nltk
nltk.download('punkt')

%matplotlib inline

[nltk_data] Downloading package punkt to C:\Users\Brianna
[nltk_data]     Lytle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# IMPORT DATA FRAMES

In [38]:
# Load the college and grad-school CSV files from ./data, one frame per file.
college, grad_school = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/college_oct14', './data/grad_school_oct14')
)

In [39]:
# Swap every missing value (NaN) for a single-space string in both frames so
# the text columns can be concatenated later without propagating NaN.
college, grad_school = (
    frame.replace(to_replace=np.nan, value=" ")
    for frame in (college, grad_school)
)

In [40]:
# Stack the two subreddit frames on top of each other.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, so use
# `pd.concat`. `sort=False` mirrors what `append` did (column order is left
# alone) and, like `append`, each frame keeps its own row labels here.
c_gs = pd.concat([college, grad_school], sort=False)

In [41]:
# Merge title and body into a single `all_text` column (space-separated),
# drop the two source columns, and renumber the rows from 0.
c_gs = (
    c_gs
    .assign(all_text=c_gs['title'] + ' ' + c_gs['body'])
    .drop(columns=['title', 'body'])
    .reset_index(drop=True)
)

In [42]:
# Remove every character that is neither a word character nor whitespace.
# - The pattern is a raw string: '\w' and '\s' are not valid Python string
#   escapes and trigger a warning when written in a plain literal.
# - `regex=True` is explicit because pandas 2.0 changed the default of
#   `Series.str.replace` to literal matching, which would leave the text
#   untouched.
# NOTE(review): punctuation is deleted rather than replaced by a space, so
# words separated only by punctuation are fused (see "FAFSAfinancial" in the
# first row of the output).
c_gs['all_text'] = c_gs['all_text'].str.replace(r'[^\w\s]', '', regex=True)
c_gs.head()

Unnamed: 0,num_comments,up_votes,age,subreddit,all_text
0,435,88,47.065377,college,FAFSAfinancial aid verification questions Get ...
1,1,2,0.006776,college,Failed my midterm I failed my midterm and I wa...
2,1,1,0.008998,college,Professor gives butt load of homework but is o...
3,0,1,0.013535,college,Does a GPA round up My girlfriend is graduatin...
4,0,1,0.017274,college,Should I take CALC II Im a freshman Biochem Ma...


## Preprocessing and EDA

In [43]:
# Encode the target column numerically: college -> 0, GradSchool -> 1.
# Series.map turns any value missing from the dictionary into NaN, so running
# this cell a second time (on the already-encoded 0/1 column) wipes the labels.
subreddit_labels = {'college': 0, 'GradSchool' : 1}
c_gs['subreddit'] = c_gs['subreddit'].map(subreddit_labels)
c_gs.head()

Unnamed: 0,num_comments,up_votes,age,subreddit,all_text
0,435,88,47.065377,0,FAFSAfinancial aid verification questions Get ...
1,1,2,0.006776,0,Failed my midterm I failed my midterm and I wa...
2,1,1,0.008998,0,Professor gives butt load of homework but is o...
3,0,1,0.013535,0,Does a GPA round up My girlfriend is graduatin...
4,0,1,0.017274,0,Should I take CALC II Im a freshman Biochem Ma...


In [44]:
# Tokenise every post with nltk's word_tokenize; each row holds a list of tokens.
c_gs['tokenized_text'] = c_gs['all_text'].map(word_tokenize)

In [45]:
# Preview the token lists produced above.
c_gs.loc[:, 'tokenized_text']

0       [FAFSAfinancial, aid, verification, questions,...
1       [Failed, my, midterm, I, failed, my, midterm, ...
2       [Professor, gives, butt, load, of, homework, b...
3       [Does, a, GPA, round, up, My, girlfriend, is, ...
4       [Should, I, take, CALC, II, Im, a, freshman, B...
                              ...                        
6624    [Cheap, graduate, schools, for, Economics, Eve...
6625    [Depressing, six, weeks, in, PhD, Hi, all, I, ...
6626    [Help, Im, eating, like, crap, x200B, Im, work...
6627    [3rd, year, and, I, just, need, to, rant, Im, ...
6628    [How, to, rereviewe, a, revised, and, resubmit...
Name: tokenized_text, Length: 6629, dtype: object

In [46]:
# Let pandas print up to 500 rows before truncating long outputs.
pd.options.display.max_rows = 500

In [47]:
# Stop-word list for the vectorizers: scikit-learn's built-in English stop
# words followed by ten extra tokens chosen for this corpus.
eda_words = [
    *ENGLISH_STOP_WORDS,
    'im', 'grad', 'school', 'id', 'ive', 'college', 'phd', 'masters', 'just', 'like',
]

In [48]:
 # Lower-case the whole corpus, split it on whitespace, and show the 100 most
 # frequent tokens (stop words included).
 corpus_tokens = ' '.join(c_gs['all_text']).lower().split()
 pd.Series(corpus_tokens).value_counts().head(100)

i           38059
to          30230
the         24488
a           23916
and         23907
my          16907
in          15109
of          14767
for         10420
that        10007
is           9733
it           9131
have         7738
im           7634
this         7543
you          7360
but          6921
with         6324
me           6142
on           6103
be           5421
do           5341
so           5175
not          5125
was          4898
or           4525
if           4461
at           4410
just         4358
school       4106
what         4053
as           4043
like         3959
are          3909
about        3678
am           3648
an           3475
get          3426
dont         3285
how          3235
all          3040
out          3013
would        2969
college      2835
time         2826
from         2807
your         2744
know         2726
can          2716
its          2551
they         2499
up           2483
want         2420
work         2366
one          2366
any       

In [49]:
# Check the frame now that the tokenized_text column has been added.
c_gs.head(n=5)

Unnamed: 0,num_comments,up_votes,age,subreddit,all_text,tokenized_text
0,435,88,47.065377,0,FAFSAfinancial aid verification questions Get ...,"[FAFSAfinancial, aid, verification, questions,..."
1,1,2,0.006776,0,Failed my midterm I failed my midterm and I wa...,"[Failed, my, midterm, I, failed, my, midterm, ..."
2,1,1,0.008998,0,Professor gives butt load of homework but is o...,"[Professor, gives, butt, load, of, homework, b..."
3,0,1,0.013535,0,Does a GPA round up My girlfriend is graduatin...,"[Does, a, GPA, round, up, My, girlfriend, is, ..."
4,0,1,0.017274,0,Should I take CALC II Im a freshman Biochem Ma...,"[Should, I, take, CALC, II, Im, a, freshman, B..."


In [50]:
# Number of whitespace-separated words in each post.
c_gs['word_count'] = c_gs['all_text'].str.split().str.len()
c_gs.head()

Unnamed: 0,num_comments,up_votes,age,subreddit,all_text,tokenized_text,word_count
0,435,88,47.065377,0,FAFSAfinancial aid verification questions Get ...,"[FAFSAfinancial, aid, verification, questions,...",263
1,1,2,0.006776,0,Failed my midterm I failed my midterm and I wa...,"[Failed, my, midterm, I, failed, my, midterm, ...",92
2,1,1,0.008998,0,Professor gives butt load of homework but is o...,"[Professor, gives, butt, load, of, homework, b...",126
3,0,1,0.013535,0,Does a GPA round up My girlfriend is graduatin...,"[Does, a, GPA, round, up, My, girlfriend, is, ...",68
4,0,1,0.017274,0,Should I take CALC II Im a freshman Biochem Ma...,"[Should, I, take, CALC, II, Im, a, freshman, B...",167


# Modeling

In [51]:
# Features are the cleaned post text; the target is the encoded subreddit label.
X, y = c_gs['all_text'], c_gs['subreddit']

In [52]:
# 70/30 train/test split, stratified on the label and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=248
)

In [53]:
# Four two-step pipelines: a text vectorizer feeding a logistic regression.
# pipe_1 / pipe_ngram share the same CountVectorizer structure and
# pipe_2 / pipe_3 share the same TfidfVectorizer structure; they differ only
# in the parameter grids they are searched with.
pipe_1 = Pipeline(steps=[('cvec', CountVectorizer()), ('lr', LogisticRegression())])

pipe_2 = Pipeline(steps=[('tfidf', TfidfVectorizer()), ('lr', LogisticRegression())])

pipe_3 = Pipeline(steps=[('tfidf', TfidfVectorizer()), ('lr', LogisticRegression())])

pipe_ngram = Pipeline(steps=[('cvec', CountVectorizer()), ('lr', LogisticRegression())])

In [54]:
# Hyper-parameter grids, one per pipeline. Keys follow scikit-learn's
# `<step name>__<parameter>` convention; every grid uses the custom stop-word
# list as its single stop_words candidate.

# Count vectorizer: vocabulary size, document-frequency cut-offs and n-gram span.
pipe_1_params = dict(
    cvec__stop_words=[eda_words],
    cvec__max_features=[3000, 4000, 5000],
    cvec__min_df=[10, 25, 50],
    cvec__max_df=[0.8, 0.9],
    cvec__ngram_range=[(1,3), (1,2), (2,2)],
)

# TF-IDF, first grid.
pipe_2_params = dict(
    tfidf__stop_words=[eda_words],
    tfidf__max_features=[1000, 1500, 1700],
    tfidf__min_df=[20, 25, 40],
    tfidf__max_df=[0.8, 0.85, 0.9],
)

# TF-IDF, second grid with higher min_df candidates.
pipe_3_params = dict(
    tfidf__stop_words=[eda_words],
    tfidf__max_features=[1500, 1700, 2000],
    tfidf__min_df=[35, 37, 42],
    tfidf__max_df=[0.85, 0.9],
)

# Count vectorizer restricted to multi-word n-grams only.
param_ngram = dict(
    cvec__stop_words=[eda_words],
    cvec__ngram_range=[(2,3), (3,3)],
)

In [55]:
# Wrap each pipeline and its grid in a 5-fold GridSearchCV.
# The messages below only confirm that the search objects were constructed;
# nothing is fitted until .fit() is called on them.
gs_1 = GridSearchCV(estimator=pipe_1, param_grid=pipe_1_params, cv=5)
print('gs_1 completed')

gs_2 = GridSearchCV(estimator=pipe_2, param_grid=pipe_2_params, cv=5)
print('gs_2 completed')

gs_3 = GridSearchCV(estimator=pipe_3, param_grid=pipe_3_params, cv=5)
print('gs_3 completed')

gs_n_gram = GridSearchCV(estimator=pipe_ngram, param_grid=param_ngram, cv=5)
print('n_gram completed')

gs_1 completed
gs_2 completed
gs_3 completed
n_gram completed


In [56]:
# Fit every grid search on the training split, then report its best
# cross-validated score together with the refitted best pipeline.
for label, search in (('gs_1', gs_1),
                      ('gs_2', gs_2),
                      ('gs_3', gs_3),
                      ('gs_n_gram', gs_n_gram)):
    search.fit(X_train, y_train)
    print(f'{label} best score is {search.best_score_}. The best model from this grid search is {search.best_estimator_}')



gs_1 best score is 0.8482758620689655. The best model from this grid search is Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=4000, min_df=10,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['nine', '...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])




gs_2 best score is 0.8474137931034482. The best model from this grid search is Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=1500, min_df=20,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])




gs_3 best score is 0.8400862068965518. The best model from this grid search is Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.85, max_features=1500, min_df=35,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])




gs_n_gram best score is 0.7064655172413793. The best model from this grid search is Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 3), preprocessor=None,
        stop_words=['nine', 'w...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])


## Overall Best Model

In [57]:
# Take the refitted pipeline from the third grid search and pull out the
# vocabulary learned by its TF-IDF step.
# NOTE(review): the printed CV scores rank gs_1 (0.848) and gs_2 (0.847) above
# gs_3 (0.840) - confirm that gs_3 is the intended "overall best" model.
best_model = gs_3.best_estimator_
tfidf_step = best_model.named_steps['tfidf']
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`; support both so the cell runs on
# either side of that change. The result is kept as a plain list either way.
if hasattr(tfidf_step, 'get_feature_names_out'):
    log_feature_names = list(tfidf_step.get_feature_names_out())
else:
    log_feature_names = tfidf_step.get_feature_names()

In [58]:
# Coefficient vector of the fitted logistic regression (first row of coef_),
# one entry per vectorizer feature.
lr_step = best_model.named_steps['lr']
log_feature_coef = lr_step.coef_[0]

In [59]:
# Exponentiate each coefficient individually; for a logistic regression this
# puts the weights on a multiplicative (odds) scale.
log_feature_exp = list(map(np.exp, log_feature_coef))

# One row per feature: its name, raw coefficient and exponentiated coefficient.
feature_dict = dict(
    feature_names=log_feature_names,
    feature_coef=log_feature_coef,
    feature_exp=log_feature_exp,
)

best_model_words = pd.DataFrame(feature_dict)

In [60]:
# Show the 15 features with the largest exponentiated coefficients.
best_model_words.sort_values('feature_exp', ascending=False).iloc[:15]

Unnamed: 0,feature_names,feature_coef,feature_exp
760,program,5.34946,210.494647
1052,undergrad,4.927594,138.046949
822,research,4.370004,79.043937
1008,thesis,3.885991,48.715185
404,graduate,3.714981,41.057806
409,gre,3.585843,36.083755
22,academia,2.886974,17.938949
257,dissertation,2.749327,15.632102
761,programs,2.670937,14.453502
524,lab,2.3711,10.709161


In [61]:
# Keep only those top 15 rows; the full coefficient table is overwritten here.
best_model_words = best_model_words.sort_values('feature_exp', ascending=False).iloc[:15]

# More Modeling with Classifiers

## Random Forest

In [62]:
# Build a dense document-term frame for the tree-based models: unigram counts,
# custom stop words, dropping terms found in more than 95% of posts or in
# fewer than 5 posts.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/validation split below, so vocabulary and document-frequency filtering
# have seen the validation rows - consider moving it into a Pipeline.
cvec = CountVectorizer(lowercase=True, stop_words=eda_words, ngram_range=(1, 1), max_df=.95, min_df=5)
c_gs_matrix = cvec.fit_transform(c_gs['all_text'])
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`; use whichever the installed version has.
if hasattr(cvec, 'get_feature_names_out'):
    cvec_feature_names = list(cvec.get_feature_names_out())
else:
    cvec_feature_names = cvec.get_feature_names()
c_gs_df = pd.DataFrame(c_gs_matrix.toarray(),
                       columns=cvec_feature_names)
# Attach the label. NOTE(review): if the token "subreddit" is in the
# vocabulary, its count column is silently overwritten by this assignment.
c_gs_df['subreddit'] = c_gs['subreddit']


In [22]:
# Every term-count column is a feature; the 'subreddit' column is the target.
# (X and y are rebound here: from this point they refer to the count matrix,
# not the raw text used for the logistic-regression pipelines.)
X = c_gs_df.drop(columns='subreddit')
y = c_gs_df['subreddit']

In [23]:
# Stratified train/validation split (test_size is left at scikit-learn's
# default), seeded so the same rows are held out on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  random_state=42,
                                                  stratify=y)
# Baseline 10-tree random forest. The forest itself is now seeded as well:
# without random_state the bootstrap samples and feature draws change on each
# run, so the cross-validated accuracy below could not be reproduced.
rf = RandomForestClassifier(n_estimators=10, random_state=42)
# Mean accuracy over 5 cross-validation folds of the training split.
cross_val_score(rf, X_train, y_train, cv=5).mean()

0.8316251276503241

In [24]:
# Tune the random forest (seeded) over forest size and maximum tree depth
# with a 5-fold grid search, then show the best CV score and parameters.
rf = RandomForestClassifier(random_state=248)
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[50, 100, 200],
)
gs = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.8626030979682157


{'max_depth': 200, 'n_estimators': 200}

## AdaBoostClassifier

In [25]:
# AdaBoost over shallow decision trees, tuned with a 3-fold grid search.
#
# scikit-learn 1.2 renamed AdaBoost's `base_estimator` argument to `estimator`
# and 1.4 removed the old name, so try the new keyword first and fall back to
# the legacy one on older releases (which reject the unknown keyword with a
# TypeError). The same prefix is reused for the nested tree parameter, so the
# key shown in best_params_ follows the installed version.
#
# random_state is fixed so that repeated runs give the same score; the
# original model was unseeded.
try:
    ada = AdaBoostClassifier(estimator=DecisionTreeClassifier(), random_state=248)
    tree_prefix = 'estimator'
except TypeError:
    ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), random_state=248)
    tree_prefix = 'base_estimator'
ada_params = {
    'n_estimators': [50, 100],
    f'{tree_prefix}__max_depth': [1, 2],
    'learning_rate': [.9, 1.]
}
gs = GridSearchCV(ada, param_grid=ada_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.8257895795614565


{'base_estimator__max_depth': 2, 'learning_rate': 1.0, 'n_estimators': 100}